Example #1
def main(args):
    # print args.species, args.force_scheme_name, args.repository_url
    docfile = url.urlopen(args.repository_url)
    doc = xml.parse(docfile)
    root = doc.childNodes[0]
    found_species = []
    for species_node in root.getElementsByTagName('species'):
        info = getspeciesinfo(species_node, args.species,
                              args.force_scheme_name)
        if info is not None:
            found_species.append(info)
    if len(found_species) == 0:
        print("No species matched your query.")
        return
    if len(found_species) > 1:
        print(
            "The following {} species match your query, please be more specific:"
            .format(len(found_species)))
        for info in found_species:
            print(info.name)
        return
        # exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    species_name_underscores = species_name_underscores.replace('/', '_')
    species_all_fasta_filename = species_name_underscores + '.fasta'
    species_all_fasta_file = open(
        '{}/{}'.format(args.path, species_all_fasta_filename), 'w')
    log_filename = "mlst_data_download_{}_{}.log".format(
        species_name_underscores, species_info.retrieved)
    log_file = open('{}/{}'.format(args.path, log_filename), "w")
    profile_path = urlparse(species_info.profiles_url).path
    profile_filename = profile_path.split('/')[-1]
    log_file.write("definitions: {}\n".format(profile_filename))
    log_file.write("{} profiles\n".format(species_info.profiles_count))
    log_file.write("sourced from: {}\n\n".format(species_info.profiles_url))
    profile_doc = url.urlopen(species_info.profiles_url)
    profile_file = open('{}/{}'.format(args.path, profile_filename), 'w')
    profile_file.write(profile_doc.read())
    profile_file.close()
    profile_doc.close()
    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        log_file.write("locus {}\n".format(locus.name))
        log_file.write(locus_filename + '\n')
        log_file.write("Sourced from {}\n\n".format(locus.url))
        locus_doc = url.urlopen(locus.url)
        locus_file = open('{}/{}'.format(args.path, locus_filename), 'w')
        locus_fasta_content = locus_doc.read()
        locus_file.write(locus_fasta_content)
        species_all_fasta_file.write(locus_fasta_content)
        locus_file.close()
        locus_doc.close()
    log_file.write("all loci: {}\n".format(species_all_fasta_filename))
    log_file.close()
    species_all_fasta_file.close()
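
Example #1 depends on a module imported as url (its urlopen call fetches the repository XML, the profile table, and each locus FASTA) and on helpers such as getspeciesinfo that are not shown. A minimal standalone sketch of just the download-to-file step, using only the standard library and a hypothetical helper name, could look like this:

import shutil
import urllib.request

def download_to_file(source_url, dest_path):
    # Stream the remote resource straight into a local file.
    with urllib.request.urlopen(source_url) as response, \
         open(dest_path, 'wb') as out:
        shutil.copyfileobj(response, out)

Each profile or locus URL in the loop above could then be fetched with a single download_to_file(locus.url, local_path) call.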
Example #2
    def _register_agent(self):
        register_name = self.app.config.get('TRCDASH_REGISTER_AS')
        if not register_name:
            register_name = socket.gethostname()

        url_args = {
            'name': register_name,
            'port': self.app.config.get('TRCDASH_PORT', self.DEFAULT_PORT),
        }
        register_url = '%s/register?%s' % (
            self.app.config['TRCDASH_REGISTER_TO'], urllib.urlencode(url_args))

        if 'TRCDASH_AUTH_USERNAME' in self.app.config and 'TRCDASH_AUTH_PASSWORD' in self.app.config:
            auth_handler = urllib3.HTTPBasicAuthHandler()
            auth_handler.add_password(
                realm='TRCDash login required',
                uri=register_url,
                user=self.app.config['TRCDASH_AUTH_USERNAME'],
                passwd=self.app.config['TRCDASH_AUTH_PASSWORD'])
            opener = urllib3.build_opener(auth_handler)
            urllib3.install_opener(opener)

        try:
            urllib3.urlopen(register_url)
        except urllib3.HTTPError as e:
            logger.error('Failed to register agent to "%s": %s', register_url,
                         e)
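
The registration call above is written against urllib2-style names (HTTPBasicAuthHandler, build_opener, urlopen) that urllib3 does not provide at module level. A hedged sketch of the same GET using the published urllib3 API, with register_url standing for whatever the code above assembles, might look like this:

import urllib3
from urllib3.util import make_headers

def register_agent(register_url, username=None, password=None):
    http = urllib3.PoolManager()
    headers = {}
    if username and password:
        # urllib3 sends HTTP basic auth as a pre-built Authorization header.
        headers = make_headers(basic_auth='%s:%s' % (username, password))
    response = http.request('GET', register_url, headers=headers)
    return response.status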
def main():

    ### Setup access credentials
    consumer_key = "your key"
    consumer_secret = "your secret"

    ### Get the Access Token
    bearer_token = "%s:%s" % (consumer_key, consumer_secret)
    bearer_token_64 = base64.b64encode(bearer_token.encode('utf-8')).decode('ascii')

    http = urllib3.PoolManager()
    token_response = http.request(
        'POST',
        "https://api.twitter.com/oauth2/token",
        body="grant_type=client_credentials",
        headers={
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Authorization": "Basic %s" % bearer_token_64,
        })
    token_data = json.loads(token_response.data)
    access_token = token_data["access_token"]

    ### Use the Access Token to make an API request
    timeline_response = http.request(
        'GET',
        "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=realself&count=2",
        headers={"Authorization": "Bearer %s" % access_token})
    timeline_data = json.loads(timeline_response.data)
    print(json.dumps(timeline_data, indent=2, sort_keys=True))
    def crawlLinks(self, links, pages, file=None):
        res = []
        for link in pages:
            if shutdown_event.isSet():
                return GAME_OVER
            status_code = 0

            # This is due to an error the program will pick up
            if link != "https://www.linkedin.com/edu/school?id=17968":
                try:
                    request = build_request(link)
                    f = urlopen(request)
                    status_code = f.code
                    f.close()
                except (HTTPError, URLError):
                    status_code = HTTPError

                if status_code == 200:
                    request = build_request(link)
                    f = urlopen(request, timeout=3)
                    xml = f.read()
                    youtubes = self.getYoutube(xml, link)
                    l = len(youtubes['youtube'])
                    for i in range(l):
                        youtubeURL = youtubes['youtube'][i][:-1]
                        if youtubeURL in res: continue
                        res.append(youtubeURL)
                        print(youtubeURL + ", " + youtubes['link'])
                        file.write(youtubeURL + "," + youtubes['link'] + "\n")
                        file.flush()

        return GAME_OVER
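
The three crawlLinks variants in this section repeat the same probe: open the link, record the status code, then open it again to read the body. A hedged sketch of that probe with the standard library (build_request from the snippets is not shown, so a plain Request with a User-Agent header stands in for it) could be:

import urllib.request
from urllib.error import HTTPError, URLError

def fetch_page(link, timeout=3):
    # Return (status_code, body); never raise on HTTP or URL errors.
    request = urllib.request.Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.getcode(), response.read()
    except HTTPError as error:
        return error.code, b''
    except URLError:
        return 0, b''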
    def crawlLinks(self, links, pages, file=None):
        res = []
        for link in pages:
            if shutdown_event.isSet():
                return GAME_OVER
            status_code = 0

            # This is due to an error the program will pick up
            if link != "https://www.linkedin.com/edu/school?id=17968":
                try:
                    request = build_request(link)
                    f = urlopen(request)
                    status_code = f.code
                    f.close()
                except (HTTPError, URLError):
                    status_code = HTTPError

                if status_code == 200:
                    request = build_request(link)
                    f = urlopen(request, timeout=3)
                    xml = f.read()
                    links = self.getKeyword(xml, link)
                    for i in links['keyword']:
                        if "www.googletagmanager.com/ns.html?id=gtm-nmx8dc" in i:  # this is for iframe, if you want to search other one please remove this if statment
                            continue
                        print (i + "," + links['link'])
                        file.write(i + "," + links['link'] + "\n")
                        file.flush()

        return GAME_OVER
    def crawlLinks(self, links, pages, file=None):
        count = 0
        res = []
        for link in pages:
            if shutdown_event.isSet():
                return GAME_OVER
            status_code = 0

            # This is due to an error the program will pick up
            if link != "https://www.linkedin.com/edu/school?id=17968":
                try:
                    request = build_request(link)
                    f = urlopen(request)
                    status_code = f.code
                    f.close()
                except (HTTPError, URLError):
                    status_code = HTTPError

                if status_code == 200:
                    count += 1
                    print (count)
                    request = build_request(link)
                    f = urlopen(request, timeout=3)
                    xml = f.read()
                    links = self.getKeyword(xml, link)
                    if len(links['keyword'])!=0:
                        print (links['keyword'][0] + "," + links['link'])
                        file.write(links['keyword'][0] + "," + links['link'] + "\n")
                        file.flush()

        return GAME_OVER
Example #7
def add_event():
    event_name = request.args['name']
    location_lat = float(request.args['lat'])
    if(location_lat > 90 or location_lat < -90):
        return "bad latitude"
    location_lng = float(request.args['lng'])
    if(location_lng > 180 or location_lng < -180):
        return "bad longitude"
    start_time = to_utc(request.args['start'])
    end_time = to_utc(request.args['end'])
    if(end_time <= start_time):
        return "invalid request"
    event_type = request.args['type']
    image_url='https://www.originvietnam.com/file/st_uploadfont/No_Image.jpg'
    if('image' in request.args):
        image_url = request.args['image']
        try: 
            print(urllib3.urlopen(image_url))
        except:
            return image_url + "  site does not exist"
            
    #Add to mongo here...
    try:
        event_table.insert({"startTime": start_time, "endTime": end_time, "lat":location_lat, "lng": location_lng, "name":event_name, "votes":0, "comments":[], "type":event_type, "imageLink":image_url})
        print(event_table.find_one())
        return "Inserting entry: " + str(event_table.count())
    except:
        return "Could not insert, duplicate entry"
    return "add_event code here"
Example #8
def start_server(port):
    with socketserver.TCPServer(('127.0.0.1', port), TaskServer) as httpd:
        httpd.handle_request()
        # We fire a last request at the server in order to take it out of the
        # blocking accept loop.
        try:
            urllib3.urlopen('http://%s:%s/' %
                            (httpd.server_address[0], httpd.server_address[1]))
        except:
            # If the server is already shut down, we receive a socket error,
            # which we ignore.
            pass
        httpd.server_close()
    return task_meta
    def get_latest_price(self, target_currency, base_currency):
        url = CurrentAnalyze.base_coin_market_cap_url.format(base_currency)
        response = urllib3.urlopen(url)
        jsonObj = simplejson.load(response)
        currency_to_usd = float(jsonObj[0]["price_usd"])

        target_url = (CurrentAnalyze.base_binance_url + CurrentAnalyze.latest_price).format(target_currency + CurrentAnalyze.symbol_id_map[base_currency])
        target_obj = simplejson.load(urllib3.urlopen(target_url))
        price_usd = currency_to_usd * float(target_obj['price'])
        target_obj['price_usd'] = price_usd
        target_obj['symbol'] = target_currency
        target_obj['base_currency'] = CurrentAnalyze.symbol_id_map[base_currency]
        return target_obj
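
get_latest_price chains two JSON lookups through urllib3.urlopen, which urllib3 does not define. A minimal sketch of one such lookup with a PoolManager and the standard json module (the url argument is just a stand-in for whatever base_coin_market_cap_url or base_binance_url expands to) might be:

import json
import urllib3

def fetch_json(url):
    # One GET request, decoded from bytes and parsed as JSON.
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    return json.loads(response.data.decode('utf-8'))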
Example #10
def test_suggest(request,did):
	event = Event.objects.get(id=did)
	url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location="+event.lat+","+event.lng+"&radius=4000&types=restaurant&key=InsertKeyHere"
	response = urllib3.urlopen(url).read()
	json_response = json.loads(response)
	print(type(json_response))
	print(response)
	results = json_response["results"]
	hospital_arr = []
	for result in results:
		#if 'hospital' in result["name"] or 'Hospital' in result["name"]:
		place = Copy.objects.create()
		place.name = result["name"]
		place.vicinity = result["vicinity"]
		place.place_id = result["id"]
		place.lat = result["geometry"]["location"]["lat"]
		place.lng = result["geometry"]["location"]["lng"]
		place.did = did
		#place.typeof = 'H'
		types = ''
		for keyword in result["types"]:
			types = types + ' ' + keyword
		
		place.types = types
		place.save()
		hospital_arr.append(place)
	
	police_arr = []
	url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location="+event.lat+","+event.lng+"&radius=4000&types=cafe&key=InsertKeyHere"
	response = urllib3.urlopen(url).read()
	json_response = json.loads(response)
	print(type(json_response))
	print(response)
	results = json_response["results"]
	for result in results:
		#if 'police' in result["name"] or 'Police' in result["name"]:
		place = Center.objects.create()
		place.name = result["name"]
		place.vicinity = result["vicinity"]
		place.place_id = result["id"]
		place.lat = result["geometry"]["location"]["lat"]
		place.lng = result["geometry"]["location"]["lng"]
		place.did = did
		place.typeof = 'P'
		types = ''
		for keyword in result["types"]:
			types = types + ' ' + keyword
		place.types = types
		place.save()
		police_arr.append(place)
	return HttpResponse('lol')	
Example #11
def gi2up(gi_accession_number):
    try:
        url = 'http://www.uniprot.org/mapping/?from=P_GI&to=ACC&query={}'.format(
            gi_accession_number)
        p = urllib3.urlopen(url).read()
        splitter = 'xml:lang="en"><head><title>'
        ls = p.split(splitter)[1].split(' ')[0].split(':')[1]
        url2 = 'http://www.uniprot.org/uniprot/?query=yourlist:{}&sort=yourlist:{}&columns=yourlist%28{}%29,id%2Centry%20name%2Creviewed%2Cprotein%20names%2Cgenes%2Corganism%2Clength%2Cexistence%2Ccomment%28PATHWAY%29%2Cgo%2Cgo%28biological%20process%29%2Cgo%28molecular%20function%29%2Cgo%28cellular%20component%29%2Cgo-id'.format(
            ls, ls, ls)
        x = urllib3.urlopen(url2).read()
        datum = x.split('class="addRemoveColumn mid"')[0].split(
            '</script></td></tr></thead><tbody><tr ')[1]
        datum = datum.split('class=')
        biorec = {}
        biorec['Entry (Uniprot)'] = datum[5].split(
            '"entryID"><a href="/uniprot/')[1].split('"')[0]
        biorec['Entry Name (Uniprot)'] = datum[6].split('>')[1].split('<')[0]
        biorec['Protein Names'] = datum[11].split('title="')[1].split('"')[0]
        biorec['Gene Names'] = ''.join(''.join('>'.join(
            datum[14].split('>')[1:]).split('<strong>')).split(
                '</strong>')).split('</div>')[0].strip()
        biorec['Organism'] = ''.join(
            datum[15].split('">')[2:]).split('</a><')[0]
        biorec['Length'] = datum[16].split('>')[1].split('<')[0]
        biorec['Protein Existence'] = datum[17].split('>')[1].split('<')[0]
        biorec['Pathway'] = datum[18].split('td>')[1].split('<td')[0]
        go = datum[19].split('<td style=""')[0].split('</a>')
        biorec['Gene Ontology (GO)'] = [
            i.split('>')[-1] for i in go if len(i.split('>')[-1]) > 0
        ]
        go = datum[20].split('<td style=""')[0].split('</a>')
        biorec['Gene ontology (biological process)'] = [
            i.split('>')[-1] for i in go if len(i.split('>')[-1]) > 0
        ]
        #return entryID1, entryID2, protein_name, gene_names, organisms
        go = datum[21].split('<td style=""')[0].split('</a>')
        biorec['Gene ontology (molecular function)'] = [
            i.split('>')[-1] for i in go if len(i.split('>')[-1]) > 0
        ]
        go = datum[22].split('<td style=""')[0].split('</a>')
        biorec['Gene ontology (cellular component)'] = [
            i.split('>')[-1] for i in go if len(i.split('>')[-1]) > 0
        ]
        go = datum[23].split('<td style=""')[0].split('</a>')
        biorec['Gene ontology IDs'] = [
            i.split('>')[-1] for i in go
            if ((len(i.split('>')[-1]) > 0) and (i.split('>')[-1] != '<td '))
        ]
        return biorec
    except:
        return None
Example #12
def _send_msg(url, access_token, body):
    posturl = url + access_token
    headers = {'Content-Type': 'application/json'}
    request = urllib3.Request(url=posturl, headers=headers, data=body)
    response = urllib3.urlopen(request)
    resp = response.read()
    print(resp)
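
_send_msg builds the POST with Request/urlopen names that urllib3 does not expose. A hedged standard-library sketch of the same JSON POST, with posturl and body as assembled above (body assumed to be a JSON string), could be:

import urllib.request

def send_msg(posturl, body):
    # POST the JSON payload and return the decoded reply.
    request = urllib.request.Request(
        posturl,
        data=body.encode('utf-8'),
        headers={'Content-Type': 'application/json'},
        method='POST')
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')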
Example #13
 def find_interests(self, tweets):
     interests = {}
     interests['links'] = []
     interests['users'] = []
     interests['hashtags'] = []
     for tweet in tweets:
         text = tweet['tweet']
         links = re.compile(r'(https.*?)|(http.*?)').findall(text)
         for link in links:
             if link[0]:
                 link = link[0]
             elif link[1]:
                 link = link[1]
             else:
                 continue
             try:
                 response = urllib3.urlopen(link)
                 full_link = response.url
                 interests['links'].append(full_link)
             except Exception:
                 pass
         interests['users'] += re.compile(r'(@\w+)').findall(text)
         interests['hashtags'] += re.compile(r'(#\w+)').findall(text)
     interests['users'].sort()
     interests['hashtags'].sort()
     interests['links'].sort()
     return interests
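
The link-expansion step in find_interests only needs the final URL reached after redirects. A small sketch of that step with urllib.request, where response.geturl() reports the address actually served, could be:

import urllib.request

def expand_link(link):
    # Follow redirects and report the final URL, or None on any failure.
    try:
        with urllib.request.urlopen(link) as response:
            return response.geturl()
    except Exception:
        return None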
Example #14
def main(client, number_of_campaigns, number_of_adgroups, number_of_keywords):
  # Initialize BatchJobHelper.
  batch_job_helper = client.GetBatchJobHelper(version='v201806')

  # Create a BatchJob.
  batch_job = AddBatchJob(client)
  # Retrieve the URL used to upload the BatchJob operations.
  upload_url = batch_job['uploadUrl']['url']
  batch_job_id = batch_job['id']
  print('Created BatchJob with ID "%d", status "%s", and upload URL "%s"' % (
      batch_job['id'], batch_job['status'], upload_url))

  # Generate operations to upload.
  budget_operations = BuildBudgetOperations(batch_job_helper)
  campaign_operations = BuildCampaignOperations(
      batch_job_helper, budget_operations, number_of_campaigns)
  campaign_criterion_operations = BuildCampaignCriterionOperations(
      campaign_operations)
  adgroup_operations = BuildAdGroupOperations(
      batch_job_helper, campaign_operations, number_of_adgroups)
  adgroup_criterion_operations = BuildAdGroupCriterionOperations(
      adgroup_operations, number_of_keywords)
  adgroup_ad_operations = BuildAdGroupAdOperations(adgroup_operations)

  # Upload operations.
  batch_job_helper.UploadOperations(
      upload_url, budget_operations, campaign_operations,
      campaign_criterion_operations, adgroup_operations,
      adgroup_criterion_operations, adgroup_ad_operations)

  # Download and display results.
  download_url = GetBatchJobDownloadUrlWhenReady(client, batch_job_id)
  response = urllib3.urlopen(download_url).read()
  PrintResponse(batch_job_helper, response)
def use_simple_urllib3():
    response = urllib3.urlopen(URL_IP)
    print('>>>>Response Headers:')
    print(response.info())
    print('>>>>Response body:')
    print(''.join([line for line in response.readlines()]))
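
use_simple_urllib3 prints the response headers and body through a urlopen call that urllib3 does not define. With the published urllib3 API, the same inspection (URL_IP standing in for whatever that constant holds) could be sketched as:

import urllib3

def show_response(url):
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    print('>>>>Response Headers:')
    print(response.headers)               # header dictionary of the reply
    print('>>>>Response body:')
    print(response.data.decode('utf-8'))  # body bytes decoded to text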
Example #16
 def get(self,symbol,exchange):
     url = self.prefix+"%s:%s"%(exchange,symbol)
     u = urllib3.urlopen(url)
     content = u.read()
     
     obj = json.loads(content[3:])
     return obj[0]
Example #17
def testFunctionWeb():
    """Benchmarcking function...
	"""
    #print p
    resp = urllib3.urlopen('http://www.i3visio.com')
    html = resp.read()
    return
Example #18
def download_the_av(url):
    req = urllib.request.Request(url)
    content = urllib.request.urlopen(req).read()
    content = content.decode('utf-8')
    while len(content) < 100:
        print("try again...")
        content = urllib3.urlopen(req).read()
    print("All length:" + str(len(content)))

    title_begin = content.find("<title>")
    title_end = content.find("</title>")
    title = content[title_begin + 7:title_end - 14]
    title = title.replace('/', '_')
    title = ''.join(
        c for c in title
        if c in "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ _-")

    quality = ['720', '480', '240']
    for i in quality:
        find_position = content.find("\"quality\":\"" + i + "\"")
        if find_position > 0:
            print("Quality: " + i + "P")
            break
    to_find = content[find_position:find_position + 4000]

    pattern = re.compile(r"\"videoUrl\":\"[^\"]*\"")
    match = pattern.search(to_find)
    if match:
        the_url = match.group()
    the_url = the_url[12:-1]  #the real url
    the_url = the_url.replace("\\/", "/")
    save_file(the_url, big_path + title + ".mp4")
Example #19
def github_set_commit_status(user,
                             repo,
                             token,
                             sha1,
                             state="success",
                             description="",
                             link=""):
    #pending, success, error, failure

    description = description[0:min(len(
        description), 140)]  #github doesn't like too long description

    data = js.dumps({
        'state': state,
        'context': 'default',
        'description': description,
        'target_url': link
    })
    url = "https://api.github.com/repos/{0}/{1}/statuses/{2}".format(
        user, repo, sha1)

    req = urllib3.Request(url)
    req.add_header("Authorization", "token {0}".format(token))
    req.add_data(data)
    try:
        res = urllib3.urlopen(req)
        result = res.read()
    except urllib3.HTTPError as e:
        print("setting github status failed: HTTP error ", e.code)
    except urllib3.URLError as e:
        print("setting github status failed: failure ", e.reason)
Example #20
    def get(self, symbol, exchange):
        url = self.prefix + "%s:%s" % (exchange, symbol)
        u = urllib3.urlopen(url)
        content = u.read()

        obj = json.loads(content[3:])
        return obj[0]
Example #21
 def get_first_string():
     response = urllib3.urlopen(
         'https://gist.githubusercontent.com/jsdario/6d6c69398cb0c73111e49f1218960f79/raw/8d4fc4548d437e2a7203a5aeeace5477f598827d/el_quijote.txt'
     )
     full_text = response.read()
     text_tokenized = full_text.split(' ')
     return text_tokenized[0]
Example #22
def find_interests(tweets):
    interests = defaultdict(list)

    for tweet in tweets:
        text = tweet['tweet']
        # Regexp to grab URLs might miss certain types of URL since it's hard
        # to match all possible URLs with a regexp. But, it is sufficient for
        # this program.
        links = re.compile('(http.*?)\Z|(http.*?) ').findall(text)

        for link in links:
            if link[0]:
                link = link[0]
            elif link[1]:
                link = link[1]
            else:
                continue

            with suppress(Exception):
                response = urllib3.urlopen(link)
                full_link = response.url
                interests['links'].append(full_link)

        interests['users'] += re.compile('(@\w+)').findall(text)
        interests['hashtags'] += re.compile('(#\w+)').findall(text)

    interests['users'].sort()
    interests['hashtags'].sort()
    interests['links'].sort()

    return interests
Example #23
def analyze(url):
	with contextlib.closing(urllib.urlopen(url)) as urlf:
		url = urlf.geturl()
		content = urlf.read()
	doc = etree.HTML(content)

	title = doc.find('.//h1[@class="title"]').text
	votes = {}
	for o in OPTIONS:
		votes[o] = int(doc.find('.//div[@class="votingresults"]/div[@class="option-' + o + '"]').text.strip('()'))
	voteCount = sum(votes.values())
	decision = sum(OPTIONS[o]['weight'] * votes for o,votes in votes.items())
	nextUrl = 'http://besser-studieren.nrw.de' + doc.find('.//a[@class="navigate_next"]').attrib['href']
	authorNode = doc.find('.//div[@class="username"]')
	aNode = authorNode.find('./a')
	if aNode is not None:
		author = aNode.text
	else:
		author = authorNode.text
	if author.startswith('verfasst von: '):
		author = author[len('verfasst von: '):]
	author = author.strip()

	return {
		'author': author,
		'url': url,
		'title': title,
		'votes': votes,
		'nextUrl': nextUrl,
	}
Example #24
 def post(self):
     self.values = {"username":"******","password":"******"}
     data = urllib.urlencode(self.values)
     url = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"
     request = urllib3.Request(url, data)
     response = urllib3.urlopen(request)
     print(response.read())
Example #25
    def _get_aws_meta(scene_id: str, path: int, row: int) -> str:
        meta_url = '{}/{}/{}/{}/{}_MTL.txt'.format(AWS_LS8_URL, path, row,
                                                   scene_id, scene_id)
        # TODO update to Python3
        meta_data = urllib3.urlopen(meta_url).readlines()

        return meta_data
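
The TODO above asks for a Python 3 version. A minimal sketch with urllib.request, keeping the same AWS_LS8_URL layout, could be:

import urllib.request

def get_aws_meta(aws_ls8_url, scene_id, path, row):
    meta_url = '{}/{}/{}/{}/{}_MTL.txt'.format(aws_ls8_url, path, row,
                                               scene_id, scene_id)
    with urllib.request.urlopen(meta_url) as response:
        # Decode once and return the metadata as a list of text lines.
        return response.read().decode('utf-8').splitlines()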
Example #26
    def upcoverImage(self, url):

        image_name = 'binary'
        data_param = {
            "csrf": self.csrf_token,
            'img_name': image_name
        }  # some APIs require the file name to be passed explicitly
        # TODO fetch the remote image over the network
        image_file = io.BytesIO(urllib3.urlopen(url).read())
        image_data = Image.open(image_file)
        output = io.BytesIO()
        image_data.save(output, format='PNG')  # format=image_data.format
        print(image_data.format)  # the output is not necessarily JPEG; it may also be PNG
        image_data.close()
        data_bin = output.getvalue()
        output.close()
        file_obj = data_bin
        #fixed at 2017-05-19 10:49:57
        img_file = {image_name: file_obj}  #Very Important.
        #the key must be the filename,
        #because the requests cannot guess the correct filename from the bytes array.
        data_result = requests.post(url,
                                    data_param,
                                    files=img_file,
                                    headers=self.headers)
        if isinstance(file_obj, MsgUtil):  # load_image returns a binary stream here, not a file object.
            file_obj.close()
        data_resultJson = demjson.decode(data_result)
        if data_resultJson['code'] == 0:
            imageUrl = data_resultJson['data']['url']
        else:
            imageUrl = "https://i0.hdslb.com/bfs/album/1453def5c58b7c52041e4e076a5a853e358a53e1.jpg"
        return imageUrl
Example #27
    def query(self, address):
        lat, lng = self.address_to_latlng(address)

        query_url = 'https://en.wikipedia.org/w/api.php?action=query&list=geosearch&gsradius=5000&gscoord={0}%7C{1}&gslimit=20&format=json'.format(
            lat, lng)
        g = urllib3.urlopen(query_url)
        results = g.read()
        g.close()

        data = json.loads(results)

        places = []
        for place in data['query']['geosearch']:
            name = place['title']
            meters = place['dist']
            lat = place['lat']
            lng = place['lon']

            wiki_url = self.wiki_path(name)
            walking_time = self.meters_to_walking_time(meters)

            d = {
                'name': name,
                'url': wiki_url,
                'time': walking_time,
                'lat': lat,
                'lng': lng
            }

            places.append(d)

        return places
    def readSiteMap(self):
        pages = []
        try:
            # f = urlopen("http://www.codepool.biz/sitemap.xml")
            # Change the link when you need to crawl a different page
            url = "https://www.usfca.edu/sitemap.xml"
            maps = self.getLinks(self.getHtml(url))
            for map in maps:
                request = build_request(map)
                f = urlopen(request, timeout=3)
                xml = f.read()

                soup = BeautifulSoup(xml)
                urlTags = soup.find_all("url")
                # print(urlTags)

                print("The number of url tags in sitemap: ", str(len(urlTags)))

                for sitemap in urlTags:
                    link = sitemap.findNext("loc").text
                    pages.append(link)
                f.close()

        except (HTTPError, URLError) as e:
            print(e)

        return pages
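
readSiteMap pulls the <loc> entries out with BeautifulSoup. As an alternative sketch (not what the snippet itself uses), the same extraction can be done with the standard library's xml.etree.ElementTree, assuming the sitemap uses the usual sitemap namespace:

import urllib.request
import xml.etree.ElementTree as ET

def read_sitemap_locs(sitemap_url, timeout=3):
    # Collect every <loc> URL from a sitemap.xml document.
    with urllib.request.urlopen(sitemap_url, timeout=timeout) as response:
        root = ET.fromstring(response.read())
    namespace = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    return [loc.text for loc in root.findall('.//sm:loc', namespace)]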
Example #29
    def run(self):
        while True:
            item = self.queue.get()
            data = self._data_post(item)
            try:
                req = urllib3.Request(url=self.url, data=data)
                res = urllib3.urlopen(req)
            except urllib3.HTTPError as e:
                raise e.reason
            py_data = json.loads(res.read())
            res.close()

            item['first'] = 'false'
            item['pn'] = item['pn'] + 1
            success = py_data['success']
            if success:
                print("Get success ...")
            else:
                print('Get fail')
            print('pn is : %s' % item['pn'])
            result = py_data['content']['result']
            if len(result) != 0:
                self.queue.put(item)
            print("now queue size is : %d" % self.queue.qsize())
            self.out_queue.put(py_data['content']['result'])
            self.queue.task_done()
def get_historical_data(name, number_of_days):
    data = []
    url = "https://finance.yahoo.com/quote/" + name + "/history/"
    rows = bs(
        urllib3.urlopen(url).read()).findAll('table')[0].tbody.findAll('tr')

    for each_row in rows:
        divs = each_row.findAll('td')
        if divs[1].span.text != 'Dividend':  #Ignore this row in the table
            #I'm only interested in 'Open' price; For other values, play with divs[1 - 5]
            data.append({
                'Date': divs[0].span.text,
                'Open': float(divs[1].span.text.replace(',', ''))
            })

    return data[:number_of_days]


#Test
# print get_historical_data('amzn', 15)

# https://query1.finance.yahoo.com/v7/finance/download/WFC?period1=1561874153&period2=1593496553&interval=1d&events=history
# https://query1.finance.yahoo.com/v7/finance/download/WFC?period1=1561874369&period2=1593496769&interval=1d&events=history
# https://query1.finance.yahoo.com/v7/finance/download/AMZN?period1=1561874338&period2=1593496738&interval=1d&events=history

# max
# https://query1.finance.yahoo.com/v7/finance/download/WFC?period1=76204800&period2=1593388800&interval=1d&events=history
# https://query1.finance.yahoo.com/v7/finance/download/VBIV?period1=1031097600&period2=1593388800&interval=1d&events=history
Example #32
def get_photo_size(url):
    width = 0
    height = 0
    if url == '':
        return width, height

    try:
        file = urllib3.urlopen(url, timeout=URL_OPEN_TIME_OUT)
        p = ImageFile.Parser()
        while 1:
            data = file.read(1024)

            if not data:
                break

            p.feed(data)

            if p.image is not None:
                width = p.image.size[0]
                height = p.image.size[1]
        file.close()
    except:
        print('get_photo_size error')
    return width, height
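
get_photo_size feeds chunks to ImageFile.Parser until the header can be parsed. When downloading the whole image is acceptable anyway, a simpler hedged alternative (again, not the snippet's own approach) is to hand the bytes straight to Pillow:

import io
import urllib.request
from PIL import Image

def get_photo_size(url, timeout=10):
    # Download the image and let Pillow report (width, height).
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            with Image.open(io.BytesIO(response.read())) as image:
                return image.size
    except Exception:
        return 0, 0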
Example #33
def google(terms):  # google <search term>
    '''Returns the link and the description of the first result from a google
    search
    '''
    #query = raw_input ( 'Query: ' )
    query = terms.text.strip('/wiki').lstrip(' ')
    print "going to google %s" % query
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen(
        'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' +
        query).read()
    json = m_json.loads(response)
    results = json['responseData']['results']
    returnval = ""
    for result in results:
        title = result['title']
        url = result[
            'url']  # was URL in the original and that threw a name error exception
        #print ( title + '; ' + url )
        title = title.translate({ord(k): None for k in u'<b>'})
        title = title.translate({ord(k): None for k in u'</b>'})
        returnval += title + ' ; ' + url + '\n'

    print "returning %s" % returnval
    return returnval.encode('utf-8')
def httpRequest():
    #Function to send the POST request to ThingSpeak channel for bulk update.
    global messageBuffer
    data = json.dumps({
        'write_api_key': writeAPIkey,
        'updates': messageBuffer
    })  # Format the json data buffer
    http = ul.PoolManager()
    requestHeaders = {
        "User-Agent": "mw.doc.bulk-update (Raspberry Pi)",
        "Content-Type": "application/json",
        "Content-Length": str(len(data))
    }
    # Make the POST request to ThingSpeak with the headers and JSON body set.
    try:
        response = http.request('POST', url, body=data, headers=requestHeaders)
        print(response.status)  # A 202 indicates that the server has accepted the request
    except ul.exceptions.HTTPError as e:
        print(e)  # Print the error
    messageBuffer = []  # Reinitialize the message buffer
    global lastConnectionTime
    lastConnectionTime = time.time()  # Update the connection time
Example #35
def del1__request(url):
    #url="http://quote.eastmoney.com/stocklist.html#sz"
    i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"}

    retry =0
    MaxRetry=3

    header=True
    while True :
        try:
            if header==True:
                req = urllib3.Request(url, headers=i_headers)
                response = urllib3.urlopen(req, timeout=10)
            else:
                response = urllib3.urlopen(url, timeout=10)
                #print 'http header:\n', response.info()
                #print 'http status:\n', response.getcode()
                #print 'http url:\n', response.geturl()

            break
        # except urllib3.HTTPError,e:
        except Exception as e:
            if hasattr(e,'code'):
                print('code:{0}'.format(e.code))
            raise urllib3.HTTPError

        # except urllib3.URLError, e:
        except Exception as e:
            if hasattr(e,'reason'):
                print('reason:{0}'.format(e.reason))
            if hasattr(e,'code'):
                print('code:{0}'.format(e.code))

            retry +=1
            if retry > MaxRetry:
                print('More than max %d' % MaxRetry)
                raise urllib3.URLError
            else:
                print('Try request again ...')

    return response.read()
Example #36
def getPM25(cityname):
    site = 'http://www.pm25.com/' + cityname + '.html'
    html = urllib3.urlopen(site)
    soup = BeautifulSoup(html)

    city = soup.find(class_='bi_loaction_city')  # city name
    aqi = soup.find("a", {"class": "bi_aqiarea_num"})  # AQI index
    quality = soup.select(".bi_aqiarea_right span")  # air quality grade
    result = soup.find("div", class_='bi_aqiarea_bottom')  # air quality description

    print(city.text + u'AQI指数:' + aqi.text + u'\n空气质量:' + quality[0].text + result.text)
    print('*' * 20 + ctime() + '*' * 20)
Example #37
def get_para(wlink):
    msg = ''
    try:
        page_request = urllib3.Request(wlink)
        page_request.add_header('User-agent', 'Mozilla/5.0')
        page = urllib3.urlopen(page_request)
    except IOError:
        msg = 'No hay articulos en Wikipedia, tal vez quieras buscarlo en Google!'

    else:
        msg = wlink

    return msg
Example #38
def download_setuptools(
    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
    delay=15
):
    """Download setuptools from a specified location and return its filename

    `version` should be a valid setuptools version number that is available
    as an egg for download under the `download_base` URL (which should end
    with a '/'). `to_dir` is the directory where the egg will be downloaded.
    `delay` is the number of seconds to pause before an actual download
    attempt.
    """
    import urllib3 as urllib2
    egg_name = "setuptools-%s-py%s.egg" % (version, sys.version[:3])
    url = download_base + egg_name
    saveto = os.path.join(to_dir, egg_name)
    src = dst = None
    if not os.path.exists(saveto):  # Avoid repeated downloads
        try:
            from distutils import log
            if delay:
                log.warn("""
---------------------------------------------------------------------------
This script requires setuptools version %s to run (even to display
help).  I will attempt to download it for you (from
%s), but
you may need to enable firewall access for this script first.
I will start the download in %d seconds.

(Note: if this machine does not have network access, please obtain the file

   %s

and place it in this directory before rerunning this script.)
---------------------------------------------------------------------------""",
                         version, download_base, delay, url)
                from time import sleep
                sleep(delay)
            log.warn("Downloading %s", url)
            src = urllib2.urlopen(url)
            # Read/write all in one block, so we don't create a corrupt file
            # if the download is interrupted.
            data = _validate_md5(egg_name, src.read())
            dst = open(saveto, "wb")
            dst.write(data)
        finally:
            if src:
                src.close()
            if dst:
                dst.close()
    return os.path.realpath(saveto)
Example #39
def ipCheck():
   if ipConfig["ip"] is None:
      #initialize the ipcheck.yaml file
      pass
   http_pool = urllib3.connection_from_url(URL)
   try:
      response = http_pool.urlopen('GET', URL)
   except URLError as e:
      if hasattr(e, 'reason'):
         print('We failed to reach a server.')
         print('Reason: ', e.reason)
      elif hasattr(e, 'code'):
         print('The server couldn\'t fulfill the request.')
         print('Error code: ', e.code)
Example #40
def getURLs(url):
    try:
        fp = urllib3.urlopen(url)
    except:
        print('get url exception')
        return []
    pattern = re.compile(r'http://[\w.]+')
    urls = []
    while True:
        s = fp.read()
        if not s:
            break
        # Decode each chunk and collect every match, not just the last chunk's.
        urls += pattern.findall(s.decode('utf-8', errors='replace'))
    fp.close()
    return urls
Example #41
def downURL(url, filename):
    try:
        fp = urllib3.urlopen(url)
    except:
        print('download exception')
        return False
    op = open(filename, 'wb')
    while True:
        s = fp.read()
        if not s:
            break
        op.write(s)

    fp.close()
    op.close()
    return True
Example #42
def get_url_of_page(url, if_img=False):
    """
    获取一个页面上的所有链接。
    if_img:如果为true,则获取的是页面上的所有图片的链接
    """
    urls = []
    try:
        f = urllib3.urlopen(url, timeout=3).read()
        url_listen = URLLister()
        url_listen.feed(f)
        if if_img:
            urls.extend(url_listen.imgs)
        else:
            urls.extend(url_listen.urls)
    except Exception as e:
        print(e)
    return urls
Example #43
 def VOGetLines(self,log, source, w_range = [88000,720000]):
     #w_range is in Mhz
     c = 299792458.0
     log.write('Importing lines in range from %s to %s \n' % (w_range[0], w_range[1]))
     w_init = c / (int(w_range[0]) * 1000000.0)
     w_end = c / (int(w_range[1]) * 1000000.0)
     data = '?REQUEST=queryData&WAVELENGTH=' + \
         str(w_init) + '/' + str(w_end) + '&VERB=3'
     curl = source + data.encode('utf-8')
     log.write('  -> Downloading lines via %s:\n' % source)
     log.write('  -> ' + curl + '\n')
     req = urllib3.Request(curl)
     response = urllib3.urlopen(req)
     votable = response.read()
     location = './votables/customVOTable.xml'
     f = open(location, 'w')
     f.write(votable)
     f.close()
def download(fname, redownload=False):
    """download a file
    
    if redownload=False, the file will not be downloaded if it already exists.
    """
    dest = os.path.join(here, fname)
    if os.path.exists(dest) and not redownload:
        return
    url = 'https://raw.github.com/dpsanders/ipython_extensions/master/section_numbering/' + fname
    print("Downloading %s to %s" % (url, dest))
    
    filein  = urllib3.urlopen(url)
    fileout = open(dest, "wb")
    chunk = filein.read(1024)
    while chunk:
        fileout.write(chunk)
        chunk = filein.read(1024)
    filein.close()
    fileout.close()
Example #45
def GetTradeArea(url):
    headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"}
    req = urllib3.Request( url, headers = headers)
    try:
        content = urllib3.urlopen(req).read()
    except Exception as e:
        print(e)
        return 'error no trade'
    soup = BeautifulSoup(content)

    c = soup.findAll('div',class_='relate_stock clearfix')
    #print c
    name = soup.find('h1',class_='name').contents[1].contents[0].encode('utf-8')
    industry = c[1].findAll('li')

    industry_name = industry[0].contents[0].contents[0].encode('utf-8').strip()
    #print industry_name
    #print name
    area_name = industry[1].contents[0].contents[0].encode('utf-8').strip()

    return (industry_name, area_name)
Example #46
def google(terms): # google <search term>
    '''Returns the link and the description of the first result from a google
    search
    '''
    #query = raw_input ( 'Query: ' )
    query=terms.text.strip('/wiki').lstrip(' ')
    print "going to google %s" % query
    query = urllib.urlencode ( { 'q' : query } )
    response = urllib.urlopen ( 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query ).read()
    json = m_json.loads ( response )
    results = json [ 'responseData' ] [ 'results' ]
    returnval=""
    for result in results:
        title = result['title']
        url = result['url']   # was URL in the original and that threw a name error exception
        #print ( title + '; ' + url )
        title=title.translate({ord(k):None for k in u'<b>'})
        title=title.translate({ord(k):None for k in u'</b>'})
        returnval += title + ' ; ' + url + '\n'

    print "returning %s" %returnval
    return returnval.encode('utf-8')
Example #47
def UrlRequest(str_symbol,start,end):
    #sym=SymbolCheck(symbol)
    mainurl="http://quotes.money.163.com/service/chddata.html?"
    #http://quotes.money.163.com/service/chddata.html?code=1000593&start=19960312&end=20150623&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP
    options="TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP"
    suburl="code=%s&start=%d&end=%d&fields=%s" % (str_symbol, start, end, options)

    #print mainurl+suburl

    #header=False
    header=True
    testpost=False
    if testpost == True:
        url=mainurl
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        values = {'code' : str_symbol,
                  'start' : start,
                  'end' : end,
                  'fields' : options }
        headers = { 'User-Agent' : user_agent }

    else :
        url=mainurl+suburl
        i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"}

    Debug=False
    if Debug==True:
        httpHandler = urllib3.HTTPHandler(debuglevel=1)
        httpsHandler = urllib3.HTTPSHandler(debuglevel=1)
        opener = urllib3.build_opener(httpHandler, httpsHandler)
        urllib3.install_opener(opener)

    #useipv4=True
    useipv4=False

    retry =0
    MaxRetry=3
    while True :
        try:

            headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0'
                    }

            requests.packages.urllib3.disable_warnings()
            # print(url)
            r= requests.get(url, headers=headers, verify=False)
            r.encoding='UTF-8'
            page = r.text
            return page

            tout=120
            if useipv4==True:
                urlopen_IPv4= urllib3.build_opener(HTTPHandler_IPv4).open
                response= urlopen_IPv4(url, timeout=tout)
                break

            if header==True:
                if testpost == True:
                    data = urllib3.urlencode(values)
                    print(data)
                    req = urllib3.Request(url, data, headers)
                else:
                    req = urllib3.Request(url, headers=i_headers)

                response = urllib3.urlopen(req, timeout=tout)
            else:
                response = urllib3.urlopen(url, timeout=tout)

            break
        except Exception as e:
            if hasattr(e,'code'):
                print('code:{0}'.format(e.code))
            # raise urllib3.HTTPError

        except Exception as e:
            if hasattr(e,'reason'):
                print('reason:{0}'.format(e.reason))
            if hasattr(e,'code'):
                print('code:{0}'.format(e.code))

            retry +=1
            if retry > MaxRetry:
                print('More than max %d' % MaxRetry)
                raise urllib3.URLError
            else:
                print('Try request again ...')
        else :
            pass
            #print "Down data ok"

    return response
Example #48
from bs4 import BeautifulSoup
import urllib3
import re
import sys

from pymongo import MongoClient

client = MongoClient("mongodb://*****:*****@kahana.mongohq.com:10009/courier_db")
db = client.courier_db

base_url = "https://www.studential.com"

# url of subjects: https://www.studential.com/personal-statement-examples/subjects
page = urllib3.urlopen("https://www.studential.com/personal-statement-examples/subjects")
soup = BeautifulSoup(page.read())
chunk=soup.find("div",{"id":"content"})
content=chunk.find("div",{"class":"content"})

# regex to find valid href's: ^/personal-statement-examples/.*-personal-statements$
subjects = content.find_all("a",href = re.compile(r'^/personal-statement-examples/.*-personal-statements$'))

# subjects[i]["href"] to access href
for a in subjects[:]:

	subject = re.match(r"^/personal-statement-examples/(.*)-personal-statements$",a["href"]).group(1)
	print(subject)

	subject_page = urllib3.urlopen(base_url+a["href"])
	subject_soup = BeautifulSoup(subject_page.read())
	statement_chunk = subject_soup.find("div",{"id":"content"})
	statements = statement_chunk.find_all("p")
Example #49
def __request(url):

    #urlopen_IPv4= urllib3.build_opener(HTTPHandler_IPv4).open

    #url="http://quote.eastmoney.com/stocklist.html#sz"
    i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"}

    retry =0
    MaxRetry=3

    #useipv4=True
    useipv4=False

    header=True
    while True :
        try:

            headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0'
                    }

            requests.packages.urllib3.disable_warnings()
            print(url)
            r= requests.get(url, headers=headers, verify=False)
            r.encoding='UTF-8'
            page = r.text
            return page

            if useipv4==True:
                #response= urlopen_IPv4(url, timeout=3)
                break

            if header==True:
                req = urllib3.Request(url, headers=i_headers)
                response = urllib3.urlopen(req, timeout=10)

            else:
                response = urllib3.urlopen(url, timeout=10)
                #print 'http header:\n', response.info()
                #print 'http status:\n', response.getcode()
                #print 'http url:\n', response.geturl()

            break
        # except urllib3.HTTPError,e:
        except Exception as e:
            if hasattr(e,'code'):
                print('code:{0}'.format(e.code))
            # raise urllib3.HTTPError

        # except urllib3.URLError, e:
        except Exception as e:
            if hasattr(e,'reason'):
                print('reason:{0}'.format(e.reason))
            if hasattr(e,'code'):
                print('code:{0}'.format(e.code))

            retry +=1
            if retry > MaxRetry:
                print('More than max %d' % MaxRetry)
                raise urllib3.URLError
            else:
                print('Try request again ...')

    return response.read()
import urllib3

es_dir = '/Users/MarinaFomicheva/Dropbox/workspace/questplusplus/lang_resources/spanish/wmt15_baseline'
en_dir = '/Users/MarinaFomicheva/Dropbox/workspace/questplusplus/lang_resources/english/wmt15_baseline'

url = 'http://www.quest.dcs.shef.ac.uk/quest_files/truecase-model.es'
response = urllib3.urlopen(url)
with open(es_dir + '/' + 'truecase-model.es', 'wb') as f:
    f.write(response.read())
    def __init__(self, filename, gdalDataset, gdalMetadata, **kwargs):
        ''' Create VRT '''

        ThreddsBase = 'http://thredds.met.no/thredds/dodsC/myocean/siw-tac/siw-metno-svalbard/'
        # First check if mapper is called with keyword syntax:
        # filename = metno_hires_seaice:YYYYmmdd
        keywordBase = 'metno_hires_seaice'
        foundDataset = False
        if filename[0:len(keywordBase)] == keywordBase:
            keywordTime = filename[len(keywordBase)+1:]
            requestedTime = datetime.strptime(keywordTime, '%Y%m%d')
            # Search for nearest available file, within the closest 3 days
            for deltaDay in [0, -1, 1, -2, 2, -3, 3]:
                validTime = (requestedTime + timedelta(days=deltaDay) +
                             timedelta(hours=15))
                filename = (ThreddsBase +
                            validTime.strftime(
                                '%Y/%m/ice_conc_svalbard_%Y%m%d1500.nc'))
                try:
                    urllib.urlopen(filename + '.dds')
                    foundDataset = True
                    # Data is found for this day
                    break
                except:
                    # No data for this day
                    pass

        if not foundDataset:
            raise WrongMapperError

        # Then check if a valid OPeNDAP URL is given
        # (or has been constructed from keyword)
        if filename[0:len(ThreddsBase)] != ThreddsBase:
            AttributeError("Not Met.no Svalbard-ice Thredds URL")
        else:
            timestr = filename[-15:-3]
            validTime = datetime.strptime(timestr, '%Y%m%d%H%M')

        filename = filename + '?ice_concentration[0][y][x]'
        srcProjection = osr.SpatialReference()
        srcProjection.ImportFromProj4('+proj=stere lon_0=0.0 +lat_0=90 +datum=WGS84 +ellps=WGS84 +units=km +no_defs')
        srcProjection = srcProjection.ExportToWkt()

        # From thredds web, with manual shift
        srcGeotransform = (-1243.008 - 1, 1, 0, -3190.026 - 7, 0, 1)

        # create empty VRT dataset with geolocation only
        self._init_from_dataset_params(3812, 2980, srcGeotransform, srcProjection)

        metaDict = [{'src': {'SourceFilename': filename,
                             'sourceBand': 1},
                     'dst': {'name': 'sea_ice_area_fraction',
                             'wkv': 'sea_ice_area_fraction'}}]

        # Add band
        self.create_bands(metaDict)

        # Set time
        self.logger.info('Valid time: %s', str(validTime))
        self.dataset.SetMetadataItem('time_coverage_start',
                                     validTime.isoformat())
Example #52
__author__ = 'sereg'

#!/usr/bin/python
import string
from urllib3 import urlopen

u = urlopen("http://python.org")
words = {}

for line in u.readlines():
    line = line.decode("utf-8", errors="replace").strip(" \n")
    for word in line.split(" "):
        try:
            words[word] += 1
        except KeyError:
            words[word] = 1




pairs = sorted(words.items(), key=lambda pair: pair[1], reverse=True)

for p in pairs[:10]:
    print(p[0], p[1])
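
Example #52 imports urlopen from urllib3, which does not export it, and its counting loop was originally Python 2. A hedged Python 3 sketch of the same word count, using urllib.request and collections.Counter instead, could be:

import urllib.request
from collections import Counter

def top_words(url, n=10):
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8', errors='replace')
    # Count whitespace-separated tokens and return the n most common.
    return Counter(text.split()).most_common(n)

for word, count in top_words("http://python.org"):
    print(word, count)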