Example #1
def parse_xml(file):
    tree = ET.parse(file)
    root = tree.getroot()
    d = []
    for tv in root.findall('programme'):
        cat = tv.find('category').text
        if cat == "Film":
            #Movie name
            try:
                title = tv.find('title').text.encode('utf8')
            except:
                title = "N/A"            
            #Movie date
            try:
                date = tv.find('date').text #.encode('utf8')
            except:
                date = "N/A"
            #Start date
            try:
                s = tv.get('start')
                st = parser.parse(s)
                lc = locale.getdefaultlocale()
                locale.setlocale (locale.LC_ALL ,lc)
                start = st.strftime('%A %C %B - %H:%M GMT+1')
            except: 
                start = "N/A"
            #Channel
            try :
                c = tv.get('channel').split('.',1)[0]            
                json_data=open('./res/channels.json')            
                data = json.load(json_data)
                chan = data[c]
            except:
                chan = "N/A"
            #Movie length    
            try:
                length = tv.find('length').text
            except:
                length = "N/A"
            #IMDB Rating and URL
            try: 
                rating = ImdbRating(title).rating + "/10"
            except:
                rating = "N/A"
              
            try:
                url = ImdbRating(title).url
            except:
                import urllib2
                url = "http://www.imdb.com/find?q=" + urllib2.quote(title)
            else:
                import urllib2
                url = "https://duckduckgo.com/?q=" + urllib2.quote(title)
            #You can remove sleep, if you're sure .. The size of a json file can be more than 20 MB and the scrappping could take a long time
            sleep(1)
            d.append({ 'title':title ,'date':date, 'start':start, 'chan':chan, 'length':length, 'rating':rating, 'url':url})
            
    newd = sorted(d, key=lambda k: k['rating'], reverse= True)
    return newd
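For reference, a minimal sketch of the percent-encoding the fallback search URLs above rely on (on Python 2, urllib2 re-exports quote from urllib; the title value here is hypothetical):

# Hypothetical title, already a UTF-8 byte string as in parse_xml() above.
import urllib2

title = "The Good, the Bad and the Ugly"
print urllib2.quote(title)
# -> 'The%20Good%2C%20the%20Bad%20and%20the%20Ugly'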
Example #2
def _update_request_uri_query(request):
    """pulls the query string out of the URI and moves it into 
    the query portion of the request object.  If there are already
    query parameters on the request the parameters in the URI will
    appear after the existing parameters"""

    if "?" in request.path:
        pos = request.path.find("?")
        query_string = request.path[pos + 1 :]
        request.path = request.path[:pos]
        if query_string:
            query_params = query_string.split("&")
            for query in query_params:
                if "=" in query:
                    pos = query.find("=")
                    name = query[:pos]
                    value = query[pos + 1 :]
                    request.query.append((name, value))

    request.path = urllib2.quote(request.path, "/()$=',")

    # add encoded queries to request.path.
    if request.query:
        request.path += "?"
        for name, value in request.query:
            if value is not None:
                request.path += name + "=" + urllib2.quote(value, "/()$=',") + "&"
        request.path = request.path[:-1]

    return request.path, request.query
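A minimal usage sketch for the helper above, with a hypothetical request object that only carries the path and query attributes the function touches:

import urllib2  # the helper itself relies on urllib2.quote

class FakeRequest(object):
    # hypothetical stand-in for the request object this helper expects
    def __init__(self, path):
        self.path = path
        self.query = []

req = FakeRequest("/container/blob name?comp=list&maxresults=10")
path, query = _update_request_uri_query(req)
# path  -> '/container/blob%20name?comp=list&maxresults=10'
# query -> [('comp', 'list'), ('maxresults', '10')]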
Example #3
	def send_message_via_kannel(self, identity, message):
		backend = PersistantBackend.objects.get(title="kannel")
		connection = PersistantConnection(backend = backend,identity = identity)
		#conf = {'kannel_host':'127.0.0.1', 'kannel_port':13013, 'kannel_password':'******', 'kannel_username':'******'}
		
		try:
			conf = settings.RAPIDSMS_CONF["kannel"]
		
			url = "http://%s:%s/cgi-bin/sendsms?to=%s&text=%s&password=%s&user=%s" % (
			    conf["kannel_host"], 
			    conf["kannel_port"],
			    urllib2.quote(connection.identity.strip()), 
			    urllib2.quote(message),
			    conf['kannel_password'],
			    conf['kannel_username'])

			f = urllib2.urlopen(url, timeout=10)
			if f.getcode() / 100 != 2:
			    print "Error delivering message to URL: %s" % url
			    raise RuntimeError("Got bad response from router: %d" % f.getcode())

			# do things at a reasonable pace
			time.sleep(.2)
			return True
		except KeyError:
			# "kannel" is not configured in settings.RAPIDSMS_CONF
			return False
Example #4
def _update_request_uri_query(request):
    '''pulls the query string out of the URI and moves it into 
    the query portion of the request object.  If there are already
    query parameters on the request the parameters in the URI will
    appear after the existing parameters'''

    if '?' in request.path:
        request.path, _, query_string = request.path.partition('?')
        if query_string:
            query_params = query_string.split('&')
            for query in query_params:
                if '=' in query:
                    name, _, value = query.partition('=')
                    request.query.append((name, value))

    request.path = urllib2.quote(request.path, '/()$=\',')

    #add encoded queries to request.path. 
    if request.query:
        request.path += '?' 
        for name, value in request.query:
            if value is not None:
                request.path += name + '=' + urllib2.quote(value, '/()$=\',') + '&'
        request.path = request.path[:-1]

    return request.path, request.query
Example #5
    def test_import_to_shape(self):
        from gnmvidispine.vs_item import VSItem
        i = VSItem(host=self.fake_host,port=self.fake_port,user=self.fake_user,passwd=self.fake_passwd)

        i.name = "VX-123"
        i.sendAuthorized = MagicMock(return_value=self.MockedResponse(200,  self.import_job_doc))
        
        with self.assertRaises(ValueError):
            i.import_to_shape() #expect ValueError if neither uri nor file ref
        
        fake_uri="file:///path/to/newmedia.mxf"
        quoted_uri=quote(fake_uri,"")   #we are embedding a URI as a parameter with another URL so it must be double-encoded
        
        i.import_to_shape(uri=fake_uri,shape_tag="shapetagname",priority="HIGH")
        i.sendAuthorized.assert_called_with('POST',
                                            '/API/item/VX-123/shape?priority=HIGH&essence=false&tag=shapetagname&thumbnails=true&uri={0}'.format(quoted_uri)
                                            ,"",{'Accept':'application/xml'}, rawData=False)

        fake_uri = "file:///path/to/" + quote("media with spaces.mxf",safe="/")
        quoted_uri = quote(fake_uri,"")  # we are embedding a URI as a parameter with another URL so it must be double-encoded
        
        i.import_to_shape(uri=fake_uri, shape_tag="shapetagname", priority="HIGH")
        i.sendAuthorized.assert_called_with('POST',
                                            '/API/item/VX-123/shape?priority=HIGH&essence=false&tag=shapetagname&thumbnails=true&uri={0}'.format(
                                                quoted_uri)
                                            , "", {'Accept': 'application/xml'}, rawData=False)

        fake_uri = "file:///path/to/" + quote("media+with+plusses.mxf",safe="/+")
        quoted_uri = quote(fake_uri,"")  # we are embedding a URI as a parameter with another URL so it must be double-encoded
        
        i.import_to_shape(uri=fake_uri, shape_tag="shapetagname", priority="HIGH")
        i.sendAuthorized.assert_called_with('POST',
                                            '/API/item/VX-123/shape?priority=HIGH&essence=false&tag=shapetagname&thumbnails=true&uri={0}'.format(
                                                quoted_uri)
                                            , "", {'Accept': 'application/xml'}, rawData=False)
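The comments above stress that the URI must be double-encoded because it travels as a query parameter inside another URL. A small sketch of that effect (assuming a Python 2 style urllib.quote; the path is hypothetical):

from urllib import quote

fake_uri = "file:///path/to/" + quote("media with spaces.mxf", safe="/")
print fake_uri             # file:///path/to/media%20with%20spaces.mxf
print quote(fake_uri, "")  # file%3A%2F%2F%2Fpath%2Fto%2Fmedia%2520with%2520spaces.mxf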
Example #6
def fetch(show, exact=False, ep=None):
    query_string = '?show=' + quote(show)
    if exact:
        query_string = query_string + '&exact=1'
    if ep:
        query_string = query_string + '&ep=' + quote(ep)
    resp = _fetch(BASE_URL + query_string).read()
    show_info = {}
    if 'No Show Results Were Found For' in resp:
        raise ShowNotFound(show)
    else:
        data = resp.replace('<pre>', '').splitlines()
        for line in data:
            try:
                if '@@' in line:
                    line = line.replace('@@', '@')
                    k, v = line.split('@')
                    v = '@' + v
                else:
                    k, v = line.split('@')
            except ValueError, err:  # e.g. "Ended@"
                k = line.replace('@', "")
                v = ""
Example #7
 def set_language(self):
     "Set the language"
     nextpage = request.params.get('next', None)
     if not nextpage:
         nextpage = request.headers.get('Referer', None)
     if not nextpage:
         nextpage = '/'
     if '://' in nextpage:
         from_url = urlparse(nextpage)
         nextpage = from_url[2]
     lang_code = request.params.get('language', None)
     if lang_code and check_language(lang_code):
         session['lang'] = lang_code
         session.save()
     params = []
     for param in request.params:
         if not param in ['language', 'amp']:
             value = request.params[param]
             if value:
                 if (param == 'came_from' and
                     '://' in urllib2.unquote(value)):
                     urlparts = urlparse(urllib2.unquote(value))
                     value = urlparts[2] or '/'
                 params.append('%s=%s' % (urllib2.quote(param),
                                         urllib2.quote(value)))
     if 'lc=1' not in params:
         params.append('lc=1')
     if params:
         nextpage = "%s?%s" % (nextpage, '&amp;'.join(params))
     redirect(nextpage)
Example #8
def send_prowl(title, msg, gtype, force=False, test=None):
    """ Send message to Prowl """

    if test:
        apikey = test.get('prowl_apikey')
    else:
        apikey = sabnzbd.cfg.prowl_apikey()
    if not apikey:
        return T('Cannot send, missing required data')

    title = Tx(NOTIFICATION.get(gtype, 'other'))
    title = urllib2.quote(title.encode('utf8'))
    msg = urllib2.quote(msg.encode('utf8'))
    prio = get_prio(gtype, 'prowl')

    if force:
        prio = 0

    if prio > -3:
        url = 'https://api.prowlapp.com/publicapi/add?apikey=%s&application=SABnzbd' \
              '&event=%s&description=%s&priority=%d' % (apikey, title, msg, prio)
        try:
            urllib2.urlopen(url)
            return ''
        except:
            logging.warning(T('Failed to send Prowl message'))
            logging.info("Traceback: ", exc_info=True)
            return T('Failed to send Prowl message')
    return ''
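Note the encode('utf8') before quoting above: on Python 2, urllib2.quote over a unicode string containing non-ASCII characters typically fails with a KeyError, so the text is converted to UTF-8 bytes first. A small sketch with a hypothetical title:

import urllib2

title = u'T\xe9l\xe9chargement termin\xe9'   # "Téléchargement terminé"
print urllib2.quote(title.encode('utf8'))
# -> 'T%C3%A9l%C3%A9chargement%20termin%C3%A9'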
Example #9
 def get(self,method,args=None):
     """ GET to DeepDetect server """
     u = self.__ddurl
     u += method
     headers = {}
     if args is not None:
         sep = "?"
         for arg,argv in args.iteritems():
             u += sep
             sep = "&"
             u += urllib2.quote(arg)
             u += '='
             if argv is not None:
                 u += urllib2.quote(argv)
                 
     LOG("GET %s"%u)
     response = None
     try:
         req = urllib2.Request(u)
         response = urllib2.urlopen(req, timeout=DD_TIMEOUT)
         jsonresponse=response.read()
     except:
         raise DDCommunicationError(u,"GET",headers,None,response)
     LOG(jsonresponse)
     try:
         return self.__return_format(jsonresponse)
     except:
         raise DDDataError(u,"GET",headers,None,jsonresponse)
Example #10
 def buildURL(self, params):
     """Build the URL for the REST request"""
     params["Service"] = "AWSECommerceService"
     params["AWSAccessKeyId"] = self.access_key
     if self.associate_tag is not None:
         params["AssociateTag"] = self.associate_tag
     params["Timestamp"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
     sorted_params = sorted(params.items())
     
     # Expand the params hash into key=value pairs
     request = []
     #print sorted_params
     for p in sorted_params:
         pair = "%s=%s" % (p[0], urllib2.quote(p[1].encode("utf-8")))
         request.append(pair)
     
     # Request signing has been required since 2009/8/15
     # Compute HMAC-SHA256 using the Secret Access Key
     msg = "GET\nwebservices.amazon.co.jp\n/onca/xml\n%s" % ("&".join(request))
     hmac_digest = hmac.new(self.secret_access_key, msg, hashlib.sha256).digest()
     base64_encoded = base64.b64encode(hmac_digest)
     signature = urllib2.quote(base64_encoded)
     
     # Append the Signature to the request and build the URL
     request.append("Signature=%s" % signature)
     url = self.amazonurl + "?" + "&".join(request)
     
     return url
Example #11
def get_lyrics(entry, db):
    global errors
    global successes

    title = entry['title'].encode('utf-8')
    artist = entry['artist'].encode('utf-8')
    year = entry['year']

    artist_clean = urllib2.quote(sanitize_artist(artist).replace(" ", "_"))
    title_clean = urllib2.quote(sanitize_title(title).replace(" ", "_"))
    url = 'http://lyrics.wikia.com/' + artist_clean + ':' + title_clean
    page = requests.get(url)
    if page.status_code != 200:
        print "404 error getting lyrics for " + title + " by " + artist + ", " + str(year)
        errors += 1
    else:
        page_soup = BeautifulSoup(page.text)
        lyrics = page_soup.select(".lyricbox")
        if len(lyrics) == 0:
            print "Parsing error getting lyrics for " + title + " by " + artist + ", " + str(year)
            errors += 1
            return

        lyrics = lyrics[0]
        [x.extract() for x in lyrics.findAll('script')]
        lyrics = lyrics.get_text(' ', strip=True).encode('utf-8')
        lyrics = santize(lyrics)
        entry['lyrics'] = lyrics
        db.entries.save(entry)
        successes += 1
        print "Successfully extracted lyrics for " + title + " by " + artist
Example #12
def sb_search():
    sickbeard = {}
    params = ''

    try:
        params = '&name=%s' % (urllib2.quote(request.args['name']))
    except:
        pass

    try:
        params = '&tvdbid=%s' % (urllib2.quote(request.args['tvdbid']))
    except:
        pass

    try:
        params = '&lang=%s' % (urllib2.quote(request.args['lang']))
    except:
        pass

    if params != '':
        params = '/?cmd=sb.searchtvdb%s' % params

        try:
            sickbeard = sickbeard_api(params)
            sickbeard = sickbeard['data']['results']
        except:
            sickbeard = None

    else:
        sickbeard = None

    return render_template('sickbeard-search.html',
        data=sickbeard,
        sickbeard='results',
    )
Example #13
 def build_query(self):
     """
     Builds query to access to cghub server.
     """
     parts = []
     for key, value in self.query.iteritems():
         if isinstance(value, list) or isinstance(value, tuple):
             value_str = '+OR+'.join([
                     self.escape_query_value(key, v) for v in value])
             value_str = '(%s)' % value_str
         else:
             value_str = self.escape_query_value(key, value)
         parts.append('='.join([key, value_str]))
     if self.offset:
         parts.append('='.join(['start', str(self.offset)]))
     if self.limit:
         parts.append('='.join(['rows', str(self.limit)]))
     if self.sort_by:
         if self.sort_by[0] == '-':
             parts.append('='.join([
                     'sort_by',
                     '%s:desc' % urllib2.quote(self.sort_by[1:])]))
         else:
             parts.append('='.join([
                     'sort_by',
                     '%s:asc' % urllib2.quote(self.sort_by)]))
     return '&'.join(parts)
Example #14
def searchBook(isbn_num):
    logText("Searching for: ", isbn_num)
    
    query = "AWSAccessKeyId=" + AWSAccessKeyID + "&AssociateTag=abc&Keywords="
    query += isbn_num 
    query += "&Operation=ItemSearch&ResponseGroup=ItemAttributes&SearchIndex=Books&Service=AWSECommerceService"
    query += "&Timestamp=" + urllib2.quote(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"))[:-1]
    # query += "&Version=2011-08-01"
    
    data = "GET\n"
    data += "ecs.amazonaws.com\n"
    data += "/onca/xml\n"
    data += query
    
    a = hmac.new(AWSSecret, data, hashlib.sha256)
    signature = urllib2.quote(base64.encodestring(a.digest())[:-1])
    
    url = "http://ecs.amazonaws.com/onca/xml?" + query + "&Signature=" + signature
    
    # print "URL : ", url
    
    url_obj = urllib2.urlopen(url)
    
    
    data = url_obj.read()
    
    book_info = getInfoFromXML(data)
        
    logText( " - Title: ", book_info[0])
    logText( " - Price: ", book_info[1])
    storeInDB( (book_info[0], isbn_num, book_info[1]) )
Example #15
def get_lat_lng(address, city, state):
    c = config.load()

    # If address is a PO Box, skip
    if re.search('P(\.)?O(\.)?(\sBox\s)[0-9]+', address) is not None or address == '':
        return None
    else:
        url = 'https://api.smartystreets.com/street-address?'
        url += 'state=' + urllib2.quote(str(state))
        url += '&city=' + urllib2.quote(str(city))
        url += '&auth-id=' + c['ss_id']
        url += '&auth-token=' + c['ss_token']
        url += '&street=' + urllib2.quote(str(address))

        result = json.load(urllib2.urlopen(url))

        if len(result) == 1:
            lat_lng = {'lat': result[0]['metadata']['latitude'], 'lng': result[0]['metadata']['longitude']}
            return lat_lng
        elif len(result) == 0:
            # return generic lat/lng if zero results so we can come back later to fix it
            lat_lng = {'lat': 36.0, 'lng': -76.0}
            return lat_lng
        else:
            print result
            exit(-1)
Example #16
    def http_get(self, url):
        MIME = '*/*'
        unquoteurl = urllib2.unquote(url.encode('utf-8'))
        scheme, netloc, url, params, query, fragment = urlparse(unquoteurl)
        netloc = urllib2.quote(netloc)
        url = urllib2.quote(url)
        url = ParseResult(scheme, netloc, url, params, query, fragment).geturl()
        retries = 30
        i = 0
        while True:
            try:
                if self.useproxy:
                    print 'using proxy'
                    response = self.opener.open(url, timeout=5)
                    print("GET " + urllib2.unquote(response.geturl().encode()) + " " + str(response.code))
                    if 'content-type' in response.headers:
                        MIME = response.headers['content-type'].split(';')[0]
                    print response
                    return response.read(), response.code, MIME
                else:
                    response = requests.get(url)
                    print("GET " + urllib2.unquote(str(response.url)) + " " + str(response.status_code))
                    if 'content-type' in response.headers:
                        MIME = response.headers['content-type'].split(';')[0]
                    return response.content, response.status_code, MIME
            except:
                if i > retries:
                    print traceback.print_exc()
                    raise sys.exc_info()[0]
                print "timeout 5000ms"
                i += 1
Example #17
    def _get_archived_json_results(self):
        """Download JSON file that only contains test
        name list from test-results server. This is for generating incremental
        JSON so the file generated has info for tests that failed before but
        pass or are skipped from current run.

        Returns (archived_results, error) tuple where error is None if results
        were successfully read.
        """
        results_json = {}
        old_results = None
        error = None

        if not self._test_results_server:
            return {}, None

        results_file_url = (self.URL_FOR_TEST_LIST_JSON %
            (urllib2.quote(self._test_results_server),
             urllib2.quote(self._builder_name),
             self.RESULTS_FILENAME,
             urllib2.quote(self._test_type),
             urllib2.quote(self._master_name)))

        try:
            # FIXME: We should talk to the network via a Host object.
            results_file = urllib2.urlopen(results_file_url)
            info = results_file.info()
            old_results = results_file.read()
        except urllib2.HTTPError, http_error:
            # A non-4xx status code means the bot is hosed for some reason
            # and we can't grab the results.json file off of it.
            if (http_error.code < 400 or http_error.code >= 500):
                error = http_error
Example #18
def get_s3_files_table(prefix):
    ''' list files from s3, to be used with table listing; return dicts '''
    bucket_name = os.environ['BUCKET']

    try:
        ak, sk = get_env_creds()
        s3 = boto.connect_s3(aws_access_key_id=ak,
                             aws_secret_access_key=sk)
        bucket = s3.get_bucket(bucket_name)
    except:
        logging.error('get_s3_files: Could not connect to AWS/Bucket: %s'
                      % str(sys.exc_info()))
    files = bucket.list_versions(prefix=prefix)
    filelist = []
    for f in files:
        if type(f) is not boto.s3.key.Key:
            continue
        size_in_mb = '%.2f' % (float(f.size) / (1024*1024))
        key = f.name[len(prefix):]
        directory = key.partition('/')[0]
        filename = key.partition('/')[-1]
        cb64 = urllib2.quote((f.name).encode('base64').rstrip())
        vb64 = urllib2.quote(f.version_id.encode('base64').rstrip())
        dfmt = '%Y-%m-%dT%H:%M:%S.000Z'
        date = datetime.strptime(f.last_modified, dfmt)
        d = { 'name' : filename,
              'dir'  : directory,
              'v_id' : f.version_id,
              'date' : date,
              'size' : size_in_mb,
              'cb64' : cb64,
              'vb64' : vb64,
              'key'  : key}
        filelist.append(d)
    return filelist
Example #19
def responseMsg(request):
	rawStr = smart_str(request.body)
	#rawStr = smart_str(request.POST['XML'])
	msg = paraseMsgXml(ET.fromstring(rawStr))
	
	queryStr = msg.get('Content','You have input nothing~')
	msgType = msg.get('MsgType', 'text')

	raw_youdaoURL = "http://fanyi.youdao.com/openapi.do?keyfrom=%s&key=%s&type=data&doctype=%s&version=1.1&q=" % (YOUDAO_KEY_FROM,YOUDAO_KEY,YOUDAO_DOC_TYPE)	
	
	event = msg.get('Event', '')

	if msgType == 'event':
		result = getBasicReply(msg, '欢迎使用,发送单词或者中文词语,将获得相应的解释;如果需要单词的读音,请在单词前面添加一个点,如.hello;欢迎推荐给你们de小伙伴们')

	elif queryStr.startswith('.'):
		queryStr = queryStr[1:]
		youdaoURL = "%s%s" % (raw_youdaoURL,urllib2.quote(queryStr))
		req = urllib2.Request(url=youdaoURL)
		result = urllib2.urlopen(req).read()
		replyContent = getPronounce(ET.fromstring(result))
		result = getReplyXml(msg,replyContent, queryStr)

	else:
		youdaoURL = "%s%s" % (raw_youdaoURL,urllib2.quote(queryStr))
		req = urllib2.Request(url=youdaoURL)
		result = urllib2.urlopen(req).read()
		replyContent = paraseYouDaoXml(ET.fromstring(result))
		result = getBasicReply(msg,replyContent)

	return result
Example #20
def lastfm_info(tracktuple, trinfo):
    if tracktuple[0] != '':
        mbid = '&mbid=' + tracktuple[0]
    else: mbid = ''
    artist = urllib2.quote(tracktuple[1].encode('utf-8'))
    songtitle = urllib2.quote(tracktuple[2].encode('utf-8'))
    query = 'http://ws.audioscrobbler.com/2.0/?method=track.getInfo&api_key='\
        + LASTFM_KEY + mbid + '&artist=' + artist + '&track='\
        + songtitle + '&format=json'
    response = json.loads(urllib2.urlopen(query).read())
    result = None
    try:
        result = response['track']
    except KeyError:
        global lastfm_failed
        print '?? No result for', tracktuple, 'on last.fm'
        print '   ', response
        lastfm_failed.append(tracktuple)
    if result != None:
        trinfo['track']['name'] = response['track']['name']
        try:
            album_response = response['track']['album']
            trinfo['track']['album'] = {}
            trinfo['track']['album']['title'] = album_response['title']
            trinfo['track']['album']['url'] = album_response['url']
            trinfo['track']['album']['artist'] = album_response['artist']
            trinfo['track']['album']['mbid'] = album_response['mbid']
        except KeyError:
            print '?? No album for', trinfo['track']['name']
        trinfo['track']['artist'] = response['track']['artist']
        trinfo['track']['toptags'] = response['track']['toptags']
        trinfo['track']['id']['musicbrainz'] = response['track']['mbid']
        trinfo['track']['duration'] = response['track']['duration']
        print trinfo['track']['name'], 'successfully appended'
    return trinfo
Example #21
    def baiduMusic(self, musicTitle, musicAuthor):
        baseurl = r"http://box.zhangmen.baidu.com/x?op=12&count=1&title=%s$$%s$$$$" % \
        (urllib2.quote(musicTitle.encode("utf-8")),urllib2.quote(musicAuthor.encode("utf-8")))
        
        resp = urllib2.urlopen(baseurl)
        xml = resp.read()
        
        # .*? grabs only the data between the <url> tags (standard-quality url)
        url = re.findall('<url>.*?</url>',xml)
        # .*? grabs only the data between the <durl> tags (high-quality url)
        durl = re.findall('<durl>.*?</durl>',xml)

        # Extract the data inside the <encode> tag of the first url entry
        url1 = re.findall('<encode>.*?CDATA\[(.*?)\]].*?</encode>',url[0])
        url2 = re.findall('<decode>.*?CDATA\[(.*?)\]].*?</decode>',url[0])
        
        # Take url1 up to and including its last '/', plus url2 up to (but not including) its last '&'
        urlpath = url1[0][:url1[0].rindex('/')+1] + url2[0][:url2[0].rindex('&')]
        durlpath = ""
        if durl:
            durl1 = re.findall('<encode>.*?CDATA\[(.*?)\]].*?</encode>',durl[0])
            durl2 = re.findall('<decode>.*?CDATA\[(.*?)\]].*?</decode>',durl[0])
            durlpath = durl1[0][:durl1[0].rindex('/')+1] + durl2[0][:durl2[0].rindex('&')]

        return urlpath, durlpath
Example #22
def translate(phrase, in_lang):
    if in_lang == "en":
        out_lang = "ja"
    else:
        out_lang = "en"

    if True:
        url = (
            "http://api.microsofttranslator.com/V2/Ajax.svc/GetTranslations?appId=F2926FC35C3732CEC3E9C92913745F9C28912821&from="
            + in_lang
            + "&to="
            + out_lang
            + "&maxTranslations=1"
        )
        url += "&text=" + quote(phrase.encode("utf-8"))

        response = urlfetch.fetch(url=url)

        content = re.sub(u"\xEF\xBB\xBF", "", response.content)
        data = json.loads(content)
        translated_text = data["Translations"][0]["TranslatedText"]
        time.sleep(0.1)
    else:
        url = "https://www.googleapis.com/language/translate/v2?"
        url += "&source=" + in_lang
        url += "&target=" + out_lang
        url += "&q=" + quote(phrase.encode("utf-8"))
        url += "&key=" + "AIzaSyAI3PoUAJ_uP0o33EDgUfSEUMALepQAaNA"

        content = urlfetch.fetch(url=url).content
        data = json.loads(content)

        translated_text = data["data"]["translations"][0]["translatedText"]

    return translated_text
Example #23
    def releaseId(self, _atExit=False):
        postData= {}
        postData['wantedId']= self.lastId
        postData['logName']= urllib2.quote(self.parentDB.config.projectUser)
        if self.settings.login!='' and self.settings.passw!='':
            postData['logName']= urllib2.quote(self.settings.login)
            postData['logPass']= urllib2.quote(self.settings.passw)

        postData['rep']= self.settings.base
        postData['project']= urllib2.quote(self.parentDB.config.projectName)

        req = urllib2.Request('http://' +self.settings.addr +'/?=release_task_id', str.encode(urllib.urlencode(postData)))
        try:
            response= bytes.decode( urllib2.urlopen(req, None, self.timeout).read() ) or 0
            self.lastId= None
        except Exception as e:
            print('TypeTodo: HTTP server error releasing todo')
            print(e)
            return False
            
        if str(int(response)) != response:
            print('TypeTodo: HTTP server fails releasing todo')
            response= False

        return response
Example #24
def get_SIMBAD_coordinates(name):
    url = VOTABLE_OPTIONS + SIMBAD_VOTABLE_SCRIPT_START + QUERY_VOTABLE_FULLCOORDINATES + SIMBAD_VOTABLE_SCRIPT_MIDDLE + name + SIMBAD_VOTABLE_SCRIPT_END

    try:
        response = urllib2.urlopen(SIMBAD_ROOT_1+NAME_SCRIPT+urllib2.quote(url))
    except urllib2.URLError:
        try:
            response = urllib2.urlopen(SIMBAD_ROOT_2+NAME_SCRIPT+urllib2.quote(url))
        except urllib2.URLError:
            return None

    try:
        response_votable = votable.parse(response.fp)
        first_table = response_votable.get_first_table()
    except:
        return None
    else:
        ra = float(first_table.array[0][0])
        dec = float(first_table.array[0][1])

        try:
            coords, created = AstronomicalCoordinates.objects.get_or_create(right_ascension=ra, declination=dec)
        except MultipleObjectsReturned:
            coords = AstronomicalCoordinates.objects.filter(right_ascension=ra, declination=dec).first()

        return coords
Example #25
def get_SIMBAD_object_types(name):
    url = SIMBAD_BASIC_SCRIPT + QUERY_OTYPES + name

    try:
        response = urllib2.urlopen(SIMBAD_ROOT_1+NAME_SCRIPT+urllib2.quote(url))
    except urllib2.URLError:
        try:
            response = urllib2.urlopen(SIMBAD_ROOT_2+NAME_SCRIPT+urllib2.quote(url))
        except urllib2.URLError:
            return None

    otypes = []
    ok = False

    value_line = None
    for line in response.readlines():
        if ok and len(line.strip()) > 0:
            value_line = line.strip()
        if line.find(QUERY_DATA_DELIMITER) >= 0:
            ok = True

    if value_line is not None and len(value_line) > 0:
        values = value_line.split(",")
        for value in values:
            otype, created = ObjectType.objects.get_or_create(value=value)
            otypes.append(otype)

    return otypes
Example #26
def plos_search(query, query_type = None, rows = 20, more_parameters = None, fq = '''doc_type:full AND article_type:"Research Article"''', output = "json", verbose = False):
    '''
    Accesses the PLOS search API.
    query: the text of your query.
    query_type: subject, author, etc.
    rows: maximum number of results to return.
    more_parameters: an optional dictionary; key-value pairs are parameter names and values for the search api.
    fq: determines what kind of results are returned.
    Set by default to return only full documents that are research articles (almost always what you want).
    output: determines output type. Set to JSON by default, XML is also possible, along with a few others.
    '''
    api_key = "..."

    query_string = ""
    if query_type:
        query_string += query_type + ":"
    query_string += '"' + query + '"'

    params_string = ""
    if more_parameters:
        params_string = "&" + "&".join([key + "=" + quote(value) for key, value in more_parameters.iteritems()])

    fq_string = "&fq=" + quote(fq)

    url = "http://api.plos.org/search?q=" + query_string + params_string + fq_string + "&wt=" + output + "&rows=" + str(rows) + "&api_key=" + api_key
    headers = {'Content-Type': 'application/' + output}
    if verbose:
        print url
    r = requests.get(url, headers=headers)
    r.encoding = "UTF-8" # just to be sure
    return r.json()["response"]["docs"]
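A short usage sketch for plos_search (the arguments are hypothetical, and the elided api_key above still has to be supplied):

# Hypothetical query: research articles by a given author, first 5 rows.
docs = plos_search("Eisen", query_type="author", rows=5)
for doc in docs:
    print doc   # each doc is a dict of Solr fields for one matching article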
Example #27
            def decorated_function(*args, **kwargs):
                page = int(request.args.get('page', 1))

                # Convert to str here, otherwise a type error will be raised
                _path = request.path.encode("utf-8")

                # Non-ASCII URLs need to be URL-encoded
                if quote(_path).count('%25') <= 0:
                    _path = quote(_path)

                _viewkey = 'mobile%s' % _path if request.MOBILE else _path
                cache_key = str(key % _viewkey)

                if page > 1:
                    cache_key = '%s_%s' % (cache_key, page)

                rv = cache.get(cache_key)
                if rv is not None: 
                    return rv
                rv = f(*args, **kwargs)
                _suffix = u"\n<!-- cached at %s -->" % str(datetime.datetime.now())
                if hasattr(rv, "data"):
                    rv.data += _suffix
                if isinstance(rv, unicode):
                    rv += _suffix
                cache.set(cache_key, rv, timeout)
                return rv
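A small sketch of the double-encoding guard in the decorator above: quoting a path that already contains percent-escapes turns every '%' into '%25', which is exactly what the count('%25') check detects (path values here are hypothetical):

from urllib2 import quote

raw = u'/tag/文章'.encode('utf-8')
once = quote(raw)      # '/tag/%E6%96%87%E7%AB%A0'
twice = quote(once)    # '/tag/%25E6%2596%2587%25E7%25AB%25A0'
print quote(once).count('%25') > 0   # True -> already encoded, so don't quote again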
Example #28
    def fetch(self):
        postData= {}
        postData['rep']= self.settings.base
        postData['project']= urllib2.quote(self.parentDB.config.projectName)
        if self.settings.login!='' and self.settings.passw!='':
            postData['logName']= urllib2.quote(self.settings.login)
            postData['logPass']= urllib2.quote(self.settings.passw)
        req = urllib2.Request('http://' +self.settings.addr +'/?=fetch_tasks', str.encode(urllib.urlencode(postData)))
        try:
            response= bytes.decode( urllib2.urlopen(req, None, self.timeout).read() )
        except Exception as e:
            print('TypeTodo: cant fetch http')
            print(e)
            return False

        todoA= {}
        for task in json.loads(response):
            __id= int(task['id'])

            if __id not in todoA:
                todoA[__id]= TodoTask(__id, self.parentDB.config.projectName, self.parentDB)

                fetchedStateName= task['namestate']

                for cState in STATE_LIST:
                    if cState and cState[1]==fetchedStateName:
                        break

                tags= task['nametag'].split(',')
                todoA[__id].set((cState or STATE_DEFAULT)[0], tags, task['priority'], task['namefile'], task['comment'], task['nameuser'], int(task['ustamp']))

        return todoA
Example #29
    def _generate_url(self, options):
        options['Service'] = 'AWSECommerceService'
        options['AWSAccessKeyId'] = self.access_key_id
        options['AssociateTag'] = self.associate_tag
        options['Timestamp'] = self._generate_timestamp()

        # Remove any entries whose value is None.
        for k, v in options.items():
            if v is None:
                del options[k]

        # Build the (version 2) signature.
        keys = sorted(options.keys())
        args = '&'.join('%s=%s' % (key, urllib2.quote(unicode(options[key])
                        .encode('utf-8'), safe='~')) for key in keys)

        msg = 'GET'
        msg += '\n' + self.uri
        msg += '\n' + self.end_point
        msg += '\n' + args

        hmac.new(self.secret_key or '', msg, hashlib.sha256).digest()
        signature = urllib2.quote(
            base64.b64encode(hmac.new(self.secret_key or '', msg, hashlib.sha256).digest()))

        url = "http://%s%s?%s&Signature=%s" % (self.uri, self.end_point, args, signature)

        return url
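A side note on the safe='~' argument above (the item title here is hypothetical): Python 2's quote() percent-encodes '~' unless it is listed as safe, while the AWS signing rules expect RFC 3986 unreserved characters, including '~', to stay literal (Python 3.7+ already treats '~' as safe by default):

import urllib2

value = u'Harry Potter ~ Box Set*'.encode('utf-8')
print urllib2.quote(value)            # Harry%20Potter%20%7E%20Box%20Set%2A
print urllib2.quote(value, safe='~')  # Harry%20Potter%20~%20Box%20Set%2A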
Example #30
 def __raw_search_anime(self, query):
     h = httplib2.Http()
     resp, content = h.request(self.malapiurl + '/anime/search?q=' + urllib2.quote(query))
     print self.malapiurl + '/anime/search?q=' + urllib2.quote(query)
     if int(resp['status']) != 200:
         return None
     return content
Example #31
def encode(url):
    return urllib2.quote(url).replace("/", "%2F")
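For what it's worth, the replace() above is equivalent to dropping '/' from quote()'s safe characters:

import urllib2

url = "watch/video 1.mp4"   # hypothetical value
print urllib2.quote(url).replace("/", "%2F")   # watch%2Fvideo%201.mp4
print urllib2.quote(url, safe="")              # watch%2Fvideo%201.mp4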
Example #32
def urlencode(s):
    return urllib2.quote(s)
Example #33
    def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
        '''Paginate through github's web interface and scrape summaries'''

        # repo_url - https://github.com/ansible/ansible for example
        # baseurl - an entrypoint for one-off utils to scrape specific issue
        #           query urls. NOTE: this disables writing a cache

        # get cached
        if not baseurl:
            issues = self.load_summaries(repo_url)
        else:
            issues = {}

        if not baseurl:
            url = repo_url
            url += '/issues'
            url += '?'
            url += 'q='
            url += urllib2.quote('sort:updated-desc')
        else:
            url = baseurl

        namespace = repo_url.split('/')[-2]
        reponame = repo_url.split('/')[-1]

        rr = self._request_url(url)
        soup = BeautifulSoup(rr.text, 'html.parser')
        data = self._parse_issue_summary_page(soup)
        if data['issues']:
            # send to receiver
            post_to_receiver('summaries', {
                'user': namespace,
                'repo': reponame
            }, data['issues'])
            # update master list
            issues.update(data['issues'])

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        while data['next_page']:
            rr = self._request_url(self.baseurl + data['next_page'])
            soup = BeautifulSoup(rr.text, 'html.parser')
            data = self._parse_issue_summary_page(soup)

            # send to receiver
            post_to_receiver('summaries', {
                'user': namespace,
                'repo': reponame
            }, data['issues'])

            if not data['next_page'] or not data['issues']:
                break

            changed = []
            changes = False
            for k, v in data['issues'].iteritems():

                if not isinstance(k, unicode):
                    k = u'%s' % k

                if k not in issues:
                    changed.append(k)
                    changes = True
                elif v != issues[k]:
                    changed.append(k)
                    changes = True
                issues[k] = v

            if changed:
                logging.info('changed: %s' % ','.join(x for x in changed))

            if not baseurl:
                self.dump_summaries_tmp(repo_url, issues)

            if not changes:
                break

        # get missing
        if not baseurl:
            numbers = sorted([int(x) for x in issues.keys()])
            missing = [x for x in xrange(1, numbers[-1]) if x not in numbers]
            for x in missing:
                summary = self.get_single_issue_summary(repo_url,
                                                        x,
                                                        force=True)
                if summary:
                    post_to_receiver('summaries', {
                        'user': namespace,
                        'repo': reponame
                    }, {x: summary})
                    if not isinstance(x, unicode):
                        x = u'%s' % x
                    issues[x] = summary

        # get missing timestamps
        if not baseurl:
            numbers = sorted([int(x) for x in issues.keys()])
            missing = [
                x for x in numbers
                if str(x) not in issues or not issues[str(x)]['updated_at']
            ]
            for x in missing:
                summary = self.get_single_issue_summary(repo_url,
                                                        x,
                                                        force=True)
                if summary:
                    post_to_receiver('summaries', {
                        'user': namespace,
                        'repo': reponame
                    }, {x: summary})
                    if not isinstance(x, unicode):
                        x = u'%s' % x
                    issues[x] = summary

        # save the cache
        if not baseurl:
            self.dump_summaries(repo_url, issues)

        return issues
Example #34
###################################################################################

import scraperwiki
import simplejson
import urllib2

# Change QUERY to your search term of choice. 
# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
QUERY = 'wish list'
RESULTS_PER_PAGE = '200'
LANGUAGE = 'en'
NUM_PAGES = 1500

for page in range(1, NUM_PAGES+1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&lang=%s&page=%s' \
         % (urllib2.quote(QUERY), RESULTS_PER_PAGE, LANGUAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            print data['from_user'], data['text']
            scraperwiki.sqlite.save(["id"], data) 
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
        
Example #35
def xiaoqu_chengjiao_spider(db_cj, xq_name=u"绿川新苑"):

    trytimes = 0
    #    tryblocktimes = 0
    url = u"http://sh.lianjia.com/chengjiao/rs" + urllib2.quote(xq_name) + "/"
    while 1:
        try:
            #            proxy_s = urllib2.ProxyHandler(proxys[random.randint(0, len(proxys)-1)])
            #            opener = urllib2.build_opener(proxy_s)
            #            urllib2.install_opener(opener)
            req = urllib2.Request(url,
                                  headers=hds[random.randint(0,
                                                             len(hds) - 1)])
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor)
            source_code = opener.open(req, timeout=5).read()
            plain_text = unicode(source_code)  #,errors='ignore')
            soup = BeautifulSoup(plain_text)
        except socket.timeout as e:
            if trytimes < 5:
                #time.sleep(5)
                trytimes += 1
                continue
            else:
                print e
                exception_write(e, 'xiaoqu_chengjiao_spider', xq_name)
                return

        except (urllib2.HTTPError, urllib2.URLError) as e:
            print e
            exception_write(e, 'xiaoqu_chengjiao_spider', xq_name)
            return
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_chengjiao_spider', xq_name)
            return

        human = soup.find('div', {'class': 'human'})

        if not human:
            break
        else:
            print "block"
            time.sleep(random.randint(900, 1200))
            trytimes = 0


#            if tryblocktimes < 5:
#                tryblocktimes += 1
#                continue
#            else:
#                print "block"
#                getProxyIp();
#                trytimes = 0
#                tryblocktimes = 0

    pagebox = soup.find('div', {'class': 'c-pagination'})
    if not pagebox:
        print "---no chengjiao record"
        return

    tpage = pagebox.find('a', {'gahref': 'results_totalpage'})
    npage = pagebox.find('a', {'gahref': 'results_next_page'})
    allpage = pagebox.findAll('a')
    if tpage:
        pagenum = int(tpage['gahref'].split('_d')[-1])
    else:
        if npage:
            pagenum = int(allpage[-2]['gahref'].split('_d')[-1])
        else:
            pagenum = 1

    print u"---开始爬 %s 区全部的信息" % xq_name
    print u"---total number of pages is " + str(pagenum)

    for j in range(pagenum):
        url_page = u"http://sh.lianjia.com/chengjiao/d%drs%s/" % (j + 1,
                                                                  xq_name)
        chengjiao_page_search(db_cj, url_page)

        #time.sleep(random.randint(1,2))
        print u"---" + xq_name + "  " + str(j + 1) + "th page have been done"

    print u"---爬下了 %s 区全部的信息" % xq_name
Example #36
 def _QuoteOrNone(self, x):
   if x is None:
     return None
   else:
     return urllib2.quote(x)
Example #37
import scraperwiki
import simplejson
import urllib2

# Get results from the Twitter API! Change QUERY to your search term of choice.
# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
QUERY = '#BeforeBlackPresidents'
RESULTS_PER_PAGE = '100'
LANGUAGE = 'en'
# NUM_PAGES = 5

# for page in range(1, NUM_PAGES+1):

base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&lang=%s&page=%s' % (
    urllib2.quote(QUERY), RESULTS_PER_PAGE, LANGUAGE, 1)

try:
    print simplejson.loads(scraperwiki.scrape(base_url))
except:
    print 'Oh dear, failed to scrape %s' % base_url

Example #38
# To get the item log we connect to NZBGet via XML-RPC and call
# method "loadlog", which returns the log for a given nzb item.
# For more info visit http://nzbget.net/RPC_API_reference

# First we need to know connection info: host, port and password of NZBGet server.
# NZBGet passes all configuration options to post-processing script as
# environment variables.
host = os.environ['NZBOP_CONTROLIP'];
port = os.environ['NZBOP_CONTROLPORT'];
username = os.environ['NZBOP_CONTROLUSERNAME'];
password = os.environ['NZBOP_CONTROLPASSWORD'];

if host == '0.0.0.0': host = '127.0.0.1'

# Build a URL for XML-RPC requests
rpcUrl = 'http://%s:%s@%s:%s/xmlrpc' % (quote(username), quote(password), host, port);

# Create remote server object
server = ServerProxy(rpcUrl)

# Call remote method 'loadlog'
nzbid = int(os.environ['NZBPP_NZBID'])
log = server.loadlog(nzbid, 0, 10000)

# Now iterate through entries and save them to the output file
if len(log) > 0:
	f = open('%s/_nzblog.txt' % os.environ['NZBPP_DIRECTORY'], 'wb')
	for entry in log:
		f.write((u'%s\t%s\t%s\n' % (entry['Kind'], datetime.datetime.fromtimestamp(int(entry['Time'])), entry['Text'])).encode('utf8'))
	f.close()
Example #39
url = 'http://www.baidu.com'
User_Agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
response = urllib2.urlopen(url)
print response.getcode()
print len(response.read())

request = urllib2.Request(url)
request.add_header('User-Agent',User_Agent)
response = urllib2.urlopen(request)
print response.getcode()
print len(response.read())

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response = urllib2.urlopen(url)
print response.getcode()
print len(response.read())
print cj

url = "http://music.baidu.com/artist"
response = urllib2.urlopen(url)
print response.getcode()
print len(response.read())


url = 'http://www.baidu.com'
request = urllib2.Request(url)
# Request.add_data() expects an already url-encoded string, and the URL is read
# back with get_full_url() (there is no getUrl() method on Request).
request.add_data("a=1&param=" + urllib2.quote('中文'))
print request.get_full_url()
Example #40
    def update(self, data):
        # Only run an update on the ticket if a review ID has been found
        if self.rid:
            logging.info("We found a review, %u" % self.rid)
            try:
                headers = {
                    "Accept": "*/*",
                    "Authorization": "Basic %s" % self.auth
                }

                post_data = None
                if version == 3:
                    post_data = "api_format=json&ship_it=0&body_top=%s&body_bottom=&public=1" % urllib.parse.quote(
                        data)
                else:
                    post_data = "api_format=json&ship_it=0&body_top=%s&body_bottom=&public=1" % urllib2.quote(
                        data)

                if not debug:
                    logging.info("Dispatching request to reviews.apache.org")

                    conn = None
                    if version == 3:
                        conn = http.client.HTTPSConnection(
                            "reviews.apache.org", 443)
                    else:
                        conn = httplib.HTTPSConnection("reviews.apache.org",
                                                       443)

                    conn.request("POST",
                                 "/api/review-requests/%u/reviews/" % self.rid,
                                 post_data, headers)
                    response = conn.getresponse()
                    if response.status == 201:
                        logging.info("Posted ReviewBoard update")
                    else:
                        logging.warning(
                            "ReviewBoard instance returned status code %u" %
                            response.status)
                else:
                    logging.warning(
                        "Foreground mode enabled, no actual ReviewBoard update made"
                    )
            except:
                pass
Example #41
import scraperwiki
import simplejson
import urllib2

# Change QUERY to your search term of choice.
# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
QUERY = 'claudioalfonso'
RESULTS_PER_PAGE = '100'
RESULT_TYPE = 'old'
NUM_PAGES = 500
ENTITIES = 'true'

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s&result_type=%s&include_entities=%s' \
         % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page, RESULT_TYPE, ENTITIES)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            data['geo'] = result['geo']
            data['entities'] = result['entities']
            print data['from_user'], data['text'], data['geo'], data[
                'entities']
            scraperwiki.sqlite.save(["id"], data)
    except:
Example #42
 def quote(self, string, safe='/'):
     return urllib2.quote(string, safe)
Example #43
    def handleRequest(self, headers_only, channelName=None, channelIcon=None, fmt=None):
        logger = logging.getLogger('handleRequest')
        logger.debug("Headers:\n" + str(self.headers))
        self.requrl = urlparse.urlparse(self.path)
        self.reqparams = urlparse.parse_qs(self.requrl.query)
        self.path = self.requrl.path[:-1] if self.requrl.path.endswith('/') else self.requrl.path
        
        # Check if third parameter exists
        # …/pid/blablablablabla/video.mpg
        #                      |_________|
        # And if it ends with regular video extension
        try:
            if not self.path.endswith(('.3gp', '.avi', '.flv', '.mkv', '.mov', '.mp4', '.mpeg', '.mpg', '.ogv', '.ts')):
                logger.error("Request seems like valid but no valid video extension was provided")
                self.dieWithError(400)
                return
        except IndexError:
            self.dieWithError(400)  # 400 Bad Request
            return

        # Limit concurrent connections
        if 0 < AceConfig.maxconns <= AceStuff.clientcounter.total:
            logger.debug("Maximum connections reached, can't serve this")
            self.dieWithError(503)  # 503 Service Unavailable
            return

        # Pretend to work fine with Fake or HEAD request.
        if headers_only or AceConfig.isFakeRequest(self.path, self.reqparams, self.headers):
            # Return 200 and exit
            if headers_only:
                logger.debug("Sending headers and closing connection")
            else:
                logger.debug("Fake request - closing connection")
            self.send_response(200)
            self.send_header("Content-Type", "video/mpeg")
            self.end_headers()
            self.closeConnection()
            return

        # Make list with parameters
        self.params = list()
        for i in xrange(3, 8):
            try:
                self.params.append(int(self.splittedpath[i]))
            except (IndexError, ValueError):
                self.params.append('0')
        
        self.url = None
        self.video = None
        self.path_unquoted = urllib2.unquote(self.splittedpath[2])
        contentid = self.getCid(self.reqtype, self.path_unquoted)
        cid = contentid if contentid else self.path_unquoted
        logger.debug("CID: " + cid)
        self.client = Client(cid, self, channelName, channelIcon)
        self.vlcid = urllib2.quote(cid, '')
        shouldStart = AceStuff.clientcounter.add(cid, self.client) == 1

        try:
            # Initializing AceClient
            if shouldStart:
                if contentid:
                    self.client.ace.START('PID', {'content_id': contentid})
                elif self.reqtype == 'pid':
                    self.client.ace.START(
                        self.reqtype, {'content_id': self.path_unquoted, 'file_indexes': self.params[0]})
                elif self.reqtype == 'torrent':
                    paramsdict = dict(
                        zip(aceclient.acemessages.AceConst.START_TORRENT, self.params))
                    paramsdict['url'] = self.path_unquoted
                    self.client.ace.START(self.reqtype, paramsdict)
                logger.debug("START done")
                # Getting URL
                self.url = self.client.ace.getUrl(AceConfig.videotimeout)
                # Rewriting host for remote Ace Stream Engine
                self.url = self.url.replace('127.0.0.1', AceConfig.acehost)

            self.errorhappened = False

            if shouldStart:
                logger.debug("Got url " + self.url)
                # If using VLC, add this url to VLC
                if AceConfig.vlcuse:
                    # Force ffmpeg demuxing if set in config
                    if AceConfig.vlcforceffmpeg:
                        self.vlcprefix = 'http/ffmpeg://'
                    else:
                        self.vlcprefix = ''

                    self.client.ace.pause()
                    # Sleeping videodelay
                    gevent.sleep(AceConfig.videodelay)
                    self.client.ace.play()

                    AceStuff.vlcclient.startBroadcast(
                        self.vlcid, self.vlcprefix + self.url, AceConfig.vlcmux, AceConfig.vlcpreaccess)
                    # Sleep a bit, because sometimes VLC doesn't open port in
                    # time
                    gevent.sleep(0.5)
            
            self.hanggreenlet = gevent.spawn(self.hangDetector)
            logger.debug("hangDetector spawned")
            gevent.sleep()

            # Building new VLC url
            if AceConfig.vlcuse:
                self.url = 'http://' + AceConfig.vlchost + \
                    ':' + str(AceConfig.vlcoutport) + '/' + self.vlcid
                logger.debug("VLC url " + self.url)
                
                # Sending client headers to videostream
                self.video = urllib2.Request(self.url)
                for key in self.headers.dict:
                    self.video.add_header(key, self.headers.dict[key])
    
                self.video = urllib2.urlopen(self.video)
    
                # Sending videostream headers to client
                if not self.headerssent:
                    self.send_response(self.video.getcode())
                    if self.video.info().dict.has_key('connection'):
                        del self.video.info().dict['connection']
                    if self.video.info().dict.has_key('server'):
                        del self.video.info().dict['server']
                    if self.video.info().dict.has_key('transfer-encoding'):
                        del self.video.info().dict['transfer-encoding']
                    if self.video.info().dict.has_key('keep-alive'):
                        del self.video.info().dict['keep-alive']
    
                    for key in self.video.info().dict:
                        self.send_header(key, self.video.info().dict[key])
                    # End headers. Next goes video data
                    self.end_headers()
                    logger.debug("Headers sent")
    
                # Run proxyReadWrite
                self.proxyReadWrite()
            else:
                if not fmt:
                    fmt = self.reqparams.get('fmt')[0] if self.reqparams.has_key('fmt') else None
                self.client.handle(shouldStart, self.url, fmt)

        except (aceclient.AceException, vlcclient.VlcException, urllib2.URLError) as e:
            logger.error("Exception: " + repr(e))
            self.errorhappened = True
            self.dieWithError()
        except gevent.GreenletExit:
            # hangDetector told us about client disconnection
            pass
        except Exception:
            # Unknown exception
            logger.error(traceback.format_exc())
            self.errorhappened = True
            self.dieWithError()
        finally:
            if AceConfig.videodestroydelay and not self.errorhappened and AceStuff.clientcounter.count(cid) == 1:
                # If no error happened and we are the only client
                try:
                    logger.debug("Sleeping for " + str(AceConfig.videodestroydelay) + " seconds")
                    gevent.sleep(AceConfig.videodestroydelay)
                except:
                    pass
                
            try:
                remaining = AceStuff.clientcounter.delete(cid, self.client)
                self.client.destroy()
                self.ace = None
                self.client = None
                if AceConfig.vlcuse and remaining == 0:
                    try:
                        AceStuff.vlcclient.stopBroadcast(self.vlcid)
                    except:
                        pass
                logger.debug("END REQUEST")
            except:
                logger.error(traceback.format_exc())
Example #44
0
import os
import csv
import urllib2
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

data = []
urls = []
urldata = []

with open('movie_budget_info.csv','rb') as csvfile:
	spamreader = csv.reader(csvfile,delimiter=',')
	for row in spamreader:
		data.append(row)

for x in range(1,len(data)):
	title = urllib2.quote(data[x][2])
	urls.append("http://www.imdb.com/find?ref_=nv_sr_fn&q={}&s=tt".format(title))

chromedriver = "/Users/sai teja/Downloads/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
browser = webdriver.Chrome(chromedriver)
#browser.get("http://www.the-numbers.com/movie/budgets/all")

for url in range(4000,len(urls)):
	try:
		print(url)
		browser.get(urls[url])
		if len(browser.find_elements_by_xpath("//table/tbody/tr"))>0 and len(browser.find_elements_by_xpath("//table/tbody/tr")[0].find_elements_by_tag_name("td"))>1 :
			mov_tit = browser.find_elements_by_xpath("//table/tbody/tr")[0].find_elements_by_tag_name("td")[1].find_element_by_tag_name("a").text.encode('utf-8')
			mov_url = browser.find_elements_by_xpath("//table/tbody/tr")[0].find_elements_by_tag_name("td")[1].find_element_by_tag_name("a").get_attribute('href')
			urldata.append([mov_tit,mov_url])
	except TimeoutException:
		# Skip pages that fail to load before the driver times out
		continue
Example #45
0
        return
    except Exception, e:
        print e
        exception_write('xiaoqu_chengjiao_spider', xq_name)
        return
    content = soup.find('div', {'class': 'page-box house-lst-page-box'})
    total_pages = 0
    if content:
        d = "d=" + content.get('page-data')
        exec(d)
        total_pages = d['totalPage']

    threads = []
    for i in range(total_pages):
        url_page = u"http://bj.lianjia.com/chengjiao/pg%drs%s/" % (
            i + 1, urllib2.quote(xq_name))
        t = threading.Thread(target=chengjiao_spider, args=(db_cj, url_page))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


def do_xiaoqu_chengjiao_spider(db_xq, db_cj):
    """
    Batch-crawl the deal (chengjiao) records of every residential community (xiaoqu)
    """
    count = 0
    xq_list = db_xq.fetchall()
    for xq in xq_list:
Example #46
0
def browser_search(text=None, url="https://www.google.com/search?q=%s"):
    if not text:
        text = read_selected(True)
    url = url % quote(text)
    browser_open(url)
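A minimal sketch (not part of the example above) of how the template-plus-quote pattern behaves; urllib2.quote stands in for the bare quote imported in that module, and the DuckDuckGo template is just an illustrative choice:

# Illustration only (Python 2): percent-encode the query and fill the URL template.
import urllib2
template = "https://duckduckgo.com/?q=%s"          # any engine URL with a %s slot works
print template % urllib2.quote("hello world")      # -> https://duckduckgo.com/?q=hello%20world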
Example #47
0
    def get_positon(self, start_a, end_b, windows, query_file_path, page_folder_path):
        results_position_list = []
        images_position_list = []
        queries_lines = open(query_file_path, "r").readlines()
        queries = ["index"]
        # start from 1
        for query in queries_lines:
            query = query.strip()
            queries.append(query)

        driver = webdriver.PhantomJS()
        for i in range(start_a, end_b):  # the range of query
            if i >= len(queries):
                break
            query = queries[i]
            file_path = page_folder_path + query + '_sogou.html'
            code_file_path = page_folder_path + urllib2.quote(query, '+') + '_sogou.html'
            try:
                fin = open(file_path, 'r')
                fin.close()
            except:
                continue

            try:
                Results_position = []
                Images_position = []
                count = 0
                driver.get(code_file_path)
                content_results = driver.find_element_by_id('main')
                divs = content_results.find_elements_by_css_selector('div')
                for div in divs:
                    classes = div.get_attribute('class').split(' ')
                    if 'rb' in classes or 'vrwrap' in classes or 'vrPic' in classes:
                        count += 1
                        result_position = Position("query", i, query, count, div.location['x'], div.location['y'], div.size['width'], div.size['height'])
                        Results_position.append(result_position)
                        # anchors = div.find_elements_by_css_selector("a")
                        images = div.find_elements_by_css_selector("img")
                        Images = []
                        for image in images:
                            anchor = image.find_element_by_xpath('..')
                            if anchor.size['width'] == 0 or anchor.size['height'] == 0:
                                if image.size['width'] == 0 or image.size['height'] == 0:
                                    continue
                                image_position = Position("image", i, query, count, image.location['x'], image.location['y'], image.size['width'], image.size['height'])
                                Images.append(image_position)
                            else:
                                image_position = Position("image", i, query, count, anchor.location['x'], anchor.location['y'], anchor.size['width'], anchor.size['height'])
                                Images.append(image_position)
                        Images_position.append(Images)
                        if count == windows:
                            break
                results_position_list.append(Results_position)
                images_position_list.append(Images_position)
                print "Sogou " + query, i
            except:
                continue

        driver.quit()
        driver.stop_client()
        return results_position_list, images_position_list
Example #48
0
def main():

    count = 20
    method = "searcht"
    origstring = ""
    optionstr = ""
    filelocation = "/tmp/test.txt"
    test = 0
    debug = 0
    lang = "da"

    try:
        opts, args = getopt.getopt(sys.argv[1:], "s:g:h:m:c:dtl:f:i:j:w:")
        for o, a in opts:
            if o == "-s":
                origstring = a
                searchstring = urllib2.quote(a.encode('utf8'))
            elif o == "-g":
                fromdate = a
                optionstr = optionstr + "&from-date=" + fromdate
            elif o == "-h":
                todate = a
                optionstr = optionstr + "&to-date=" + todate
            elif o == "-i":
                tag = a
                optionstr = optionstr + "&tag=type/" + tag
            elif o == "-m":
                method = a
            elif o == "-c":
                count = a
                optionstr = optionstr + "&page-size=" + count
            elif o == "-j":
                pagenumber = a
                optionstr = optionstr + "&page=" + pagenumber
            elif o == "-w":
                wordcount = a
                optionstr = optionstr + "&min-wordcount=2&max-wordcount=" + wordcount
            elif o == "-t":
                test = 1
            elif o == "-d":
                debug = 1
            elif o == "-f":
                filelocation = a
            elif o == "-l":
                lang = a
            else:
                assert False, "unhandled option"

    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)

    baseurl = starturl + "/search?q=" + searchstring + optionstr
    endurl = "&show-fields=all&show-tags=all&show-factboxes=all&show-elements=all&show-references=all&show-snippets=all&api-key=" + key
    #endurl="&show-fields=all&show-tags=all&show-factboxes=all&show-elements=all&show-references=all&show-snippets=all&api-key=mediahackdays2014"
    # http://content.guardianapis.com/search?q=cameron&tag=type%2Farticle&show-tags=all&api-key=mediahackdays2014
    #http://content.guardianapis.com/search?q=maria+miller&tag=type%2Fvideo&show-tags=all
    url = baseurl + endurl

    print url
    r = requests.get(url=url)
    #print json.dumps(input, sort_keys = False, indent = 4)
    #print json.dumps(r,sort_keys = False, indent = 4)
    newdata = json.loads(r.text)
    #wanted = {u'id',u'webTitle',u'newspaperPageNumber'}
    #[i for i in newdata[u'response'] if any(w in newdata for w in i[u'results'])]

    if debug:
        # Dump the raw JSON response to disk for inspection
        with open(filelocation, "w") as fh:
            fh.write(r.text.encode('utf8'))
        count = 0
        #print len(newdata)
    if method == "timeline":
        for row in newdata:
            fh.write(newdata[count]['text'])
    elif method == "searcht":
        for k in newdata['response']['results']:
            data = {}
            res = 0
            tmpID = k['id'].encode('utf8')
            myID = tmpID.replace('/', '_')
            #results = list(stories.find({'_id':myID}))
            cursor = stories.find({'_id': myID})
            obj = next(cursor, None)
            #pdb.set_trace()
            if obj:
                print "OK " + str(obj)
                continue
            else:
                print "NOT found " + myID

            data['_id'] = tmpID.replace('/', '_')
            print "ID:" + k['id'].encode('utf8') + myID

            print "sectionId:" + k['sectionId']
            mysectionId = k['sectionId']
            data['sectionId'] = mysectionId

            print "sectionName:" + k['sectionName']
            mysectionName = k['sectionName']
            data['sectionName'] = mysectionName

            print "WebPublicationDate:" + k['webPublicationDate']
            mytmpWebPublicationDate = k['webPublicationDate']
            myWebPublicationDate = mytmpWebPublicationDate.split("T")[0]
            myStoryDate = myWebPublicationDate.split("-")
            myDate = myStoryDate[2] + " " + monthToNum(
                myStoryDate[1]) + " " + myStoryDate[0]
            data['webPublicationDate'] = myWebPublicationDate
            data['date'] = myDate
            data['displaydate'] = myDate

            print "WebTitle:" + k['webTitle']
            myTitle = k['webTitle']
            data['title'] = myTitle

            print "readmoreurl:" + k['webUrl']
            myreadmoreurl = k['webUrl']
            data['readmoreurl'] = myreadmoreurl

            print "TRAIL:" + k['fields']['trailText']
            myTrail = k['fields']['trailText']
            data['Trail'] = myTrail

            print "headline:" + k['fields']['headline']
            myheadline = k['fields']['headline']
            data['headline'] = myheadline

            try:
                print "byline:" + k['fields']['byline']
                mycaption = k['fields']['byline']
                #data['byline']=mycaption
                data['byline'] = mycaption
                data['caption'] = origstring
            except:
                print "ups on caption .."
                #data['caption']="lorem ipsum"

            try:
                print "wordcount:" + k['fields']['wordcount']
                mywordcount = k['fields']['wordcount']
                data['wordcount'] = mywordcount
            except:
                print "ups on wordcount .."
                data['wordcount'] = 0

            try:
                print "photourl:" + k['fields']['thumbnail']
                mythumbnail = k['fields']['thumbnail']
                data['photourl'] = mythumbnail
            except:
                print "No photo on wordcount .."
                data['photourl'] = ""

            #print "BODY:" + k['fields']['body']
            myBody = k['fields']['body']

            #print "BODY:" + k['fields']['body']
            myBody = k['fields']['body']
            # clean up for timeline
            timeBody = doClean(myBody)

            data['FullBody'] = myBody
            data['body'] = timeBody

            try:
                print "PAGENO:" + k['fields']['newspaperPageNumber']
                myPageNo = k['fields']['newspaperPageNumber']
                data['newspaperPageNumber'] = myPageNo
            except:
                print "ups on newpaper .."
                data['newspaperPageNumber'] = 0

            try:
                print "DATE:" + k['fields']['newspaperEditionDate']
                myDate = k['fields']['newspaperEditionDate']
                data['newspaperEditionDate'] = myDate
                #data['date']=myDate
            except:
                print "ups on eddate .."
                data['newspaperEditionDate'] = "1970-01-01"
                #data['date']="1970-01-01"

            tmpStr = ""
            for j in k['tags']:
                tmpStr = tmpStr + "," + j['webTitle']

            print "--> " + tmpStr
            data['Tags'] = tmpStr

            if test:
                print "Just testing ..."
                #stories.update(data, upsert=True)
            else:
                print "inserting data ..."
                stories.insert(data)
Example #49
0
	except HTTPError, error:
		geocode_status_code = error.code

	if geocode_status_code == 200:
		geocode_json = json.load(geocode_service_call)
		lat = geocode_json["lat"]
		long = geocode_json["lng"]
	else:
		return index(errors=["Invalid Address - Please check, re-enter and try again."])


	title = "Weather App"
	header = "Weather for {2} ({0}, {1})".format(lat, long, address)

	#Build weather url and make call
	weather_service_url = quote("https://arcuschallenge-getweather.appspot.com/getweather/{0},{1}".format(lat,long), ':/?&=,')
	try:
		weather_service_call = urlopen(weather_service_url)
		weather_status_code = weather_service_call.getcode()
	except HTTPError, error:
		weather_status_code = error.code
		error = error.reason
	
	#Check weather call status code
	if weather_status_code == 200:
		weather_json = json.load(weather_service_call) 
		content = []
		 
		#Loop through each day weather and take variables that we want to display
		for each_day in weather_json:
			
Example #50
0
def set_online_followed(value):
    window.setProperty(key='%s-online_followers' % kodi.get_id(),
                       value=quote(str(value)))
Example #51
0
 def test_valid_queries(self):
     # Set protector to unsafe mode
     protector = Protector(["prevent_delete"], [], False)
     self.assertTrue(
         protector.check(quote("select * from bla where x=y")).is_ok())
Example #52
0
import scraperwiki
import simplejson
import urllib2
import sys

# Needs to be in lower case

SCREENNAME = 'easternamigo'

# API help: https://dev.twitter.com/docs/api/1/get/followers/ids
url = 'http://api.twitter.com/1/followers/ids.json?screen_name=%s' % (
    urllib2.quote(SCREENNAME))
print url
followers_json = simplejson.loads(scraperwiki.scrape(url))
print "Found %d followers of %s" % (len(followers_json), SCREENNAME)
followers_json = followers_json['ids']
followers_json.reverse()  # get earliest followers first for batching


# Groups a list in chunks of a given size
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i + n]
        if len(val) == n:
            yield tuple(val)


# Where to start? Overlap one batch to increase hit rate if people unfollow etc.
batchdone = scraperwiki.sqlite.get_var('batchdone', 1)
batchstart = batchdone - 1
Example #53
0
 def _fetch_builder_page(self, builder):
     buildbot_url = config_urls.chromium_buildbot_url('chromium.webkit')
     builder_page_url = "%s/builders/%s?numbuilds=100" % (
         buildbot_url, urllib2.quote(builder.name()))
     return urllib2.urlopen(builder_page_url)
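For illustration, the quoting step above in isolation; builder names frequently contain spaces, which is why urllib2.quote is applied (the base URL and builder name below are made up):

# Sketch only; values are hypothetical.
import urllib2
buildbot_url = "http://build.example.org/p/chromium.webkit"
builder_name = "WebKit Win Builder"
print "%s/builders/%s?numbuilds=100" % (buildbot_url, urllib2.quote(builder_name))
# -> http://build.example.org/p/chromium.webkit/builders/WebKit%20Win%20Builder?numbuilds=100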
Example #54
0
    def search(self,
               owner=None,
               reviewer=None,
               base=None,
               closed=None,
               private=None,
               commit=None,
               created_before=None,
               created_after=None,
               modified_before=None,
               modified_after=None,
               per_request=None,
               keys_only=False,
               with_messages=False):
        """Yields search results."""
        # These are expected to be strings.
        string_keys = {
            'owner': owner,
            'reviewer': reviewer,
            'base': base,
            'created_before': created_before,
            'created_after': created_after,
            'modified_before': modified_before,
            'modified_after': modified_after,
        }
        # These are either None, False or True.
        three_state_keys = {
            'closed': closed,
            'private': private,
            'commit': commit,
        }

        url = '/search?format=json'
        # Sort the keys mainly to ease testing.
        for key in sorted(string_keys):
            value = string_keys[key]
            if value:
                url += '&%s=%s' % (key, urllib2.quote(value))
        for key in sorted(three_state_keys):
            value = three_state_keys[key]
            if value is not None:
                url += '&%s=%s' % (key, value)

        if keys_only:
            url += '&keys_only=True'
        if with_messages:
            url += '&with_messages=True'
        if per_request:
            url += '&limit=%d' % per_request

        cursor = ''
        while True:
            output = self.get(url + cursor)
            if output.startswith('<'):
                # It's an error message. Return as no result.
                break
            data = json.loads(output) or {}
            if not data.get('results'):
                break
            for i in data['results']:
                yield i
            cursor = '&cursor=%s' % data['cursor']
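A self-contained sketch of the query string the loops above produce for a couple of filters (the filter values are invented and no request is issued):

# Illustration of the URL-building logic only (Python 2).
import urllib2
string_keys = {'owner': 'someone@example.com', 'base': None}
three_state_keys = {'closed': True, 'private': None}
url = '/search?format=json'
for key in sorted(string_keys):
    if string_keys[key]:
        url += '&%s=%s' % (key, urllib2.quote(string_keys[key]))
for key in sorted(three_state_keys):
    if three_state_keys[key] is not None:
        url += '&%s=%s' % (key, three_state_keys[key])
print url   # -> /search?format=json&owner=someone%40example.com&closed=True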
Example #55
0
    def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
        '''Paginate through github's web interface and scrape summaries'''

        # repo_url - https://github.com/ansible/ansible for example
        # baseurl - an entrypoint for one-off utils to scrape specific issue
        #           query urls. NOTE: this disables writing a cache

        # get cached
        if not baseurl:
            issues = self.load_summaries(repo_url)
        else:
            issues = {}

        if not baseurl:
            url = repo_url
            url += '/issues'
            url += '?'
            url += 'q='
            url += urllib2.quote('sort:updated-desc')
        else:
            url = baseurl

        rr = self._request_url(url)
        soup = BeautifulSoup(rr.text, 'html.parser')
        data = self._parse_issue_summary_page(soup)
        if data['issues']:
            issues.update(data['issues'])

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        while data['next_page']:
            rr = self._request_url(self.baseurl + data['next_page'])
            soup = BeautifulSoup(rr.text, 'html.parser')
            data = self._parse_issue_summary_page(soup)
            if not data['next_page'] or not data['issues']:
                break

            changed = []
            changes = False
            for k, v in data['issues'].iteritems():
                #v['href'] = self.baseurl + v['href']
                if str(k) not in issues:
                    changed.append(str(v['number']))
                    changes = True
                elif v != issues[str(k)]:
                    changed.append(str(v['number']))
                    changes = True
                issues[str(k)] = v

            if changed:
                logging.info('changed: %s' % ','.join(x for x in changed))

            if not baseurl:
                self.dump_summaries_tmp(repo_url, issues)

            if not changes:
                break

        # save the cache
        if not baseurl:
            self.dump_summaries(repo_url, issues)

        return issues
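A hypothetical caller for the scraper method above; wrapper is assumed to be an instance of the class that defines get_issue_summaries, and the repo URL matches the example given in the method's own comments:

# Sketch only; 'wrapper' and its construction are not shown in this example.
summaries = wrapper.get_issue_summaries('https://github.com/ansible/ansible')
print '%d issue summaries collected' % len(summaries)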
Example #56
0
def search(request):
    title = '搜索结果'  # "Search results"
    param = request.GET
    print param
    keys = param.keys()
    if 'page' not in keys:
        page = 1
    else:
        page = int(param['page'])

    if 'num' not in keys:
        num = 10
    else:
        num = int(param['num'])

    if 'q' not in keys:
        return render_to_response("search_index.html")
    else:
        q = param['q'].encode('utf-8')

    # print q, page, num

    start = str((page - 1) * num)
    rows  = str(num)
    q = urllib2.quote(q)
    uri = "http://127.0.0.1:8983/solr/article_core/select?q=" + q + "&start=" + start + "&rows=" + rows + "&wt=json&indent=true"

    req = urllib2.Request(uri)
    res = urllib2.urlopen(req).read()
    res = json.loads(res)

    responseHeader = res['responseHeader']
    response = res['response']
    num_found = response['numFound']
    articles = response['docs']
    if num_found > 250:
        num_found = 250
        articles = articles[:250]

    # Map the numeric article type to a category slug and display name
    for article in articles:
        if article['type'] == 1:
            article['category'] = 'zcfb'
            article['category_name'] = webConfig.TOPLABEL1
        elif article['type'] == 2:
            article['category'] = 'gsgg'
            article['category_name'] = webConfig.TOPLABEL2
        elif article['type'] == 3:
            article['category'] = 'lddt'
            article['category_name'] = webConfig.TOPLABEL3
        elif article['type'] == 4:
            article['category'] = 'hydt'
            article['category_name'] = webConfig.TOPLABEL4
        elif article['type'] == 5:
            article['category'] = 'dfdt'
            article['category_name'] = webConfig.TOPLABEL5
        elif article['type'] == 0:
            article['category'] = 'qtwz'
            article['category_name'] = webConfig.TOPLABEL6
        else:
            article['category'] = 'index'
            article['category_name'] = webConfig.TOPLABEL0

        if len(article['content']) > 200:
            article['desc'] = article['content'][:200] + "......"
        else:
            article['desc'] = article['content']
    # print articles

    # Page-number navigation (show at most five page links)
    pages = []
    page_num = (int(num_found) - 1) / num + 1
    temp = page - page % 5
    if page_num <= 5:
        for i in range(page_num):
            pages.append(i + 1)
    elif page > page_num - page_num % 5:
        for i in range(page_num - page_num % 5, page_num):
            pages.append(i + 1)
    else:
        for i in range(5):
            pages.append(i + 1 + temp)

    # Previous page / next page
    if page == 1:
        pre_page = page
        next_page = page + 1
    elif page == page_num:
        pre_page = page - 1
        next_page = page_num
    else:
        pre_page = page - 1
        next_page = page + 1

    return render_to_response("search.html",
                              {
                                  "title": title,
                                  'articles': articles,
                                  'project_name': webConfig.PROJECTNAME,
                                  'toplabel0': webConfig.TOPLABEL0,
                                  'toplabel1': webConfig.TOPLABEL1,
                                  'toplabel2': webConfig.TOPLABEL2,
                                  'toplabel3': webConfig.TOPLABEL3,
                                  'toplabel4': webConfig.TOPLABEL4,
                                  'toplabel5': webConfig.TOPLABEL5,
                                  'toplabel6': webConfig.TOPLABEL6,
                                  'page': page,
                                  'num': num,
                                  'query': q,
                                  'num_found': num_found,
                                  'responseHeader': responseHeader,
                                  'page_num': page_num,
                                  'pages': pages,
                                  'pre_page': pre_page,
                                  'next_page': next_page,

                              }
                              )
Example #57
0
def fetch_mldata(dataname, target_name='label', data_name='data',
                 transpose_data=True, data_home=None):
    """Fetch an mldata.org data set

    If the file does not exist yet, it is downloaded from mldata.org .

    mldata.org does not have an enforced convention for storing data or
    naming the columns in a data set. The default behavior of this function
    works well with the most common cases:

      1) data values are stored in the column 'data', and target values in the
         column 'label'
      2) alternatively, the first column stores target values, and the second
         data values
      3) the data array is stored as `n_features x n_samples`, and thus needs
         to be transposed to match the `sklearn` standard

    Keyword arguments allow these defaults to be adapted to specific data sets
    (see parameters `target_name`, `data_name`, `transpose_data`, and
    the examples below).

    mldata.org data sets may have multiple columns, which are stored in the
    Bunch object with their original name.

    Parameters
    ----------

    dataname :
        Name of the data set on mldata.org,
        e.g.: "leukemia", "Whistler Daily Snowfall", etc.
        The raw name is automatically converted to an mldata.org URL.

    target_name : optional, default: 'label'
        Name or index of the column containing the target values.

    data_name : optional, default: 'data'
        Name or index of the column containing the data.

    transpose_data : optional, default: True
        If True, transpose the downloaded data array.

    data_home : optional, default: None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    Returns
    -------

    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'DESCR', the full description of the dataset, and
        'COL_NAMES', the original names of the dataset columns.

    Examples
    --------
    Load the 'iris' dataset from mldata.org:

    >>> from sklearn.datasets.mldata import fetch_mldata
    >>> import tempfile
    >>> test_data_home = tempfile.mkdtemp()

    >>> iris = fetch_mldata('iris', data_home=test_data_home)
    >>> iris.target.shape
    (150,)
    >>> iris.data.shape
    (150, 4)

    Load the 'leukemia' dataset from mldata.org, which needs to be transposed
    to respect the scikit-learn axes convention:

    >>> leuk = fetch_mldata('leukemia', transpose_data=True,
    ...                     data_home=test_data_home)
    >>> leuk.data.shape
    (72, 7129)

    Load an alternative 'iris' dataset, which has different names for the
    columns:

    >>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1,
    ...                      data_name=0, data_home=test_data_home)
    >>> iris3 = fetch_mldata('datasets-UCI iris',
    ...                      target_name='class', data_name='double0',
    ...                      data_home=test_data_home)

    >>> import shutil
    >>> shutil.rmtree(test_data_home)
    """

    # normalize dataset name
    dataname = mldata_filename(dataname)

    # check if this data set has been already downloaded
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, 'mldata')
    if not exists(data_home):
        os.makedirs(data_home)

    matlab_name = dataname + '.mat'
    filename = join(data_home, matlab_name)

    # if the file does not exist, download it
    if not exists(filename):
        urlname = MLDATA_BASE_URL % quote(dataname)
        try:
            mldata_url = urlopen(urlname)
        except HTTPError as e:
            if e.code == 404:
                e.msg = "Dataset '%s' not found on mldata.org." % dataname
            raise
        # store Matlab file
        try:
            with open(filename, 'w+b') as matlab_file:
                copyfileobj(mldata_url, matlab_file)
        except:
            os.remove(filename)
            raise
        mldata_url.close()

    # load dataset matlab file
    with open(filename, 'rb') as matlab_file:
        matlab_dict = io.loadmat(matlab_file, struct_as_record=True)

    # -- extract data from matlab_dict

    # flatten column names
    col_names = [str(descr[0])
                 for descr in matlab_dict['mldata_descr_ordering'][0]]

    # if target or data names are indices, transform them into names
    if isinstance(target_name, numbers.Integral):
        target_name = col_names[target_name]
    if isinstance(data_name, numbers.Integral):
        data_name = col_names[data_name]

    # rules for making sense of the mldata.org data format
    # (earlier ones have priority):
    # 1) there is only one array => it is "data"
    # 2) there are multiple arrays
    #    a) copy all columns in the bunch, using their column name
    #    b) if there is a column called `target_name`, set "target" to it,
    #        otherwise set "target" to first column
    #    c) if there is a column called `data_name`, set "data" to it,
    #        otherwise set "data" to second column

    dataset = {'DESCR': 'mldata.org dataset: %s' % dataname,
               'COL_NAMES': col_names}

    # 1) there is only one array => it is considered data
    if len(col_names) == 1:
        data_name = col_names[0]
        dataset['data'] = matlab_dict[data_name]
    # 2) there are multiple arrays
    else:
        for name in col_names:
            dataset[name] = matlab_dict[name]

        if target_name in col_names:
            del dataset[target_name]
            dataset['target'] = matlab_dict[target_name]
        else:
            del dataset[col_names[0]]
            dataset['target'] = matlab_dict[col_names[0]]

        if data_name in col_names:
            del dataset[data_name]
            dataset['data'] = matlab_dict[data_name]
        else:
            del dataset[col_names[1]]
            dataset['data'] = matlab_dict[col_names[1]]

    # set axes to scikit-learn conventions
    if transpose_data:
        dataset['data'] = dataset['data'].T
    if 'target' in dataset:
        if not sp.sparse.issparse(dataset['target']):
            dataset['target'] = dataset['target'].squeeze()

    return Bunch(**dataset)
Example #58
0
    def get_acquire_url(self):
        site = self._get_site()

        offering_id = urllib2.quote(self.owner_organization.name + '/' +
                                    self.name + '/' + self.version)
        return urljoin(site, 'offering/' + offering_id)
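A quick sketch of what the quoting above produces; note that urllib2.quote leaves '/' unescaped by default, so the offering id keeps its path-like shape (the site and offering fields below are invented):

# Illustration only (Python 2).
import urllib2
from urlparse import urljoin
site = 'http://store.example.com/'
offering_id = urllib2.quote('Acme Corp' + '/' + 'my offering' + '/' + '1.0')
print urljoin(site, 'offering/' + offering_id)
# -> http://store.example.com/offering/Acme%20Corp/my%20offering/1.0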
Example #59
0
def updateship(cmdr, shipid, shiptype, props=[]):
    if shipid is not None and shiptype:
        args = '&shipId=%d&type=%s' % (shipid, shiptype)
        for (slot, thing) in props:
            args += '&%s=%s' % (slot, urllib2.quote(unicode(thing)))
        call(cmdr, 'api-commander-v1/update-ship', args)
Example #60
0
            global all_filter_totals
            all_filter_totals += 1

    print("==========================第%s页采集结束================\n" % (page_num))
    if savefile == 1:
        #logfile.write("==========================第"+page_num+"页采集结束================\n")
        logfile.close()


if __name__ == '__main__':
    #Get the start time
    starttime = datetime.datetime.now()
    show_logo()
    key = raw_input('\033[1;33;40mplease input keyword:')
    key = key.encode('utf-8')
    key = urllib2.quote(key)

    page = int(raw_input("Search Number of pages:"))

    for i in range(page):
        page_pn = (i * baidu_page_size)
        baidu_search(key, page_pn)
    #Get the end time
    endtime = datetime.datetime.now()
    runtime = (endtime - starttime).seconds

    print(
        "\033[1;36;40m%d found | %d checked | %d filter | %d delete      The program runs in %s seconds\033[1;37;40m"
        % (all_totals, all_checked_totals, all_filter_totals,
           all_delete_totals, runtime))