def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number of hits
    that mathematician's wikipedia page received in the last 60 days,
    as an `int`, or None when no pageview figure can be found.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/{}'
    response = simple_get(url_root.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # The pageview figure lives in an anchor whose href contains 'latest-60'.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # Strip commas (thousands separators) so int() can parse the text.
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # BUG FIX: narrowed from a bare `except:` — only the int()
                # conversion can fail here, and a bare except would also
                # swallow KeyboardInterrupt/SystemExit.
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None
def get_names():
    """Fetch the mathematician list page and return the names it contains."""
    url = "http://www.fabpedigree.com/james/mathmen.htm"
    response = simple_get(url)
    if response is None:
        # Nothing came back at all — surface the failure to the caller.
        raise Exception('Error retrieving contents at {}'.format(url))
    document = BeautifulSoup(response, 'html.parser')
    collected = set()
    # Each <li> can hold several newline-separated names; keep non-empty ones.
    for item in document.select('li'):
        collected.update(part.strip()
                         for part in item.text.split('\n')
                         if len(part) > 0)
    return list(collected)
def get_data(self, url):
    """Fetch a job-listing page and print logo URL, company name and job title for each posting."""
    page = mathematicians.simple_get(url)
    soup = BeautifulSoup(page, 'html.parser')
    info = []
    # One "job_content" <div> per posting on the page.
    for posting in soup.find_all('div', class_="job_content"):
        wrapper = posting.find('div', class_="logo-wrapper")
        image = wrapper.find('img')
        heading = posting.find('h2', class_="title").text
        print('URL logo: ', image['data-src'])
        print('Company Name:', image['alt'])
        print('Job nam', heading)
def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)
    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))
    document = BeautifulSoup(response, 'html.parser')
    found = set()
    for bullet in document.select('li'):
        # A single <li> may contain several names separated by newlines.
        for candidate in bullet.text.split('\n'):
            if len(candidate) > 0:
                found.add(candidate.strip())
    return list(found)
def get_hits_on_name(name):
    """
    Return the number of hits `name`'s Wikipedia page received in the
    last 60 days, or None when no pageview figure can be found.
    """
    # BUG FIX: the URL was hard-coded to Henri Poincaré's page
    # ('.../Henri_Poincar%C3%A9'), so url_root.format(name) had no effect
    # and every caller got the same page.  Use a real template instead.
    url_root = 'https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/{}'
    response = simple_get(url_root.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # The 60-day pageview figure is in an anchor whose href mentions 'latest-60'.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # BUG FIX: strip bare commas (thousands separators); the old code
            # removed ', ' (comma + space), which never occurs in "1,234".
            link_text = hit_link[0].text.replace(',', '')
            try:
                # convert to integer
                return int(link_text)
            except ValueError:  # narrowed from a bare except
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None
def get_hits_on_name(name):
    """
    Accepts a 'name' of a mathematician and returns the number of hits
    that mathematician's Wikipedia page received in the last 60 days,
    as an 'int'
    """
    # Using the xtools API: base endpoint for gathering page statistics.
    # BUG FIX: the original string had no '{}' placeholder, so
    # url.format(name) returned the bare base URL for every mathematician
    # (the author's own comment noted it "currently not working").
    url = 'https://xtools.wmflabs.org/api/page/prose/en.wikipedia.org/{}'
    response = simple_get(url.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # Look for the anchor linking to the 60-day pageview figure.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # strip thousands-separator commas before parsing
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # BUG FIX: narrowed from a bare except, and repaired the
                # malformed log message ("couldn't parse{} ... `int `").
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None
import requests
from mathematicians import simple_get

# Smoke-test the helper against a page that exists...
raw_html = simple_get('https://realpython.com/blog/')
len(raw_html)  # NOTE(review): bare expression — a REPL leftover with no effect in a script

# ...and against one that does not; simple_get presumably returns None on
# failure — confirm against its definition, which is outside this view.
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
no_html is None  # NOTE(review): bare expression — REPL leftover, no effect

# import sys
# print(sys.executable)
# NOTE(review): fragment — the two statements below are the tail of a
# function (presumably simple_get) whose beginning is outside this view;
# as written here they are not valid at module level.
    return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    # A good response is HTTP 200 whose Content-Type mentions 'html'.
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can make it do anything.
    """
    print(e)


# NOTE(review): the lines below are a pasted REPL transcript ('>> >'
# prompts and the output value 33878) — not valid Python as-is.
>> > from mathematicians import simple_get
>> > raw_html = simple_get('https://realpython.com/blog/')
>> > len(raw_html)
33878
import requests
import shutil
import os
import time
from SFS import g2j
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from mathematicians import simple_get

# Delete the old saves in backup/Download and refresh the file with today's date.
shutil.rmtree("/var/www/html/SFS/backup")
os.mkdir("/var/www/html/SFS/backup")
shutil.rmtree("/var/www/html/SFS/Download")
os.mkdir("/var/www/html/SFS/Download")

# Fetch the Stanford WSO polar-field page and dump every <pre> table to a
# date-stamped text file (time.strftime expands %m-%d-%Y in the path).
url = simple_get("http://wso.stanford.edu/Polar.html")
html = BeautifulSoup(url, 'html.parser')
f = open(
    time.strftime('/var/www/html/SFS/backup/Solar_Field_Strenght%m-%d-%Y.txt'),
    "w")
for i, pre in enumerate(html.select('pre')):
    f.write("%s" % (pre.text))
# The script imports the data - complete tables - and saves them with the current date.
# Drop the first row to obtain a column-based format.
lines1 = tuple(
    open(
        time.strftime(
            '/var/www/html/SFS/backup/Solar_Field_Strenght%m-%d-%Y.txt'), "r"))
# NOTE(review): the source is truncated here — this `with open(` call has no
# closing arguments or body in the visible chunk.
with open(
        time.strftime(
            '/var/www/html/SFS/backup/Solar_Field_Strenght%m-%d-%Y.txt'),
"""for the shell . ./venv/bin/activate""" from bs4 import BeautifulSoup from mathematicians import simple_get """from the blog""" raw_html = simple_get('https://realpython.com/blog/') """----len(raw_html)""" g_html = simple_get('https://soundcloud.com/theaipodcast') a_html = simple_get('https://soundcloud.com/a16z') r_html = simple_get('https://www.reddit.com/r/artificial/') t_html = simple_get('https://soundcloud.com/techemergence') ag_html = BeautifulSoup(g_html, 'html.parser') aa_html = BeautifulSoup(a_html, 'html.parser') """ar_html = BeautifulSoup(r_html, 'html.parser')""" at_html = BeautifulSoup(t_html, 'html.parser') #only sound cloud works """for i, li in enumerate(ag_html.select('li')):""" """ print (i, li.text)""" print("a16z") for i, a in enumerate(aa_html.select('a')): if i < 20: print (i, a.text) print("techemergence") for i, a in enumerate(at_html.select('a')): if i < 20: print (i, a.text) print("AIpodcast") for i, a in enumerate(at_html.select('a')): if i < 20:
# NOTE(review): fragment — the statements below are the tail of a function
# (presumably is_good_response(resp)) whose `def` line is outside this view.
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can make it do anything.
    """
    print(e)


# NOTE(review): the rest of this chunk is a pasted REPL session; the bare
# lines `33878` and `True` are interpreter OUTPUT, not statements.
from mathematicians import simple_get
raw_html = simple_get('https://realpython.com/blog/')
len(raw_html)
33878
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
no_html is None
True
from bs4 import BeautifulSoup
raw_html = open('contrived.html').read()
html = BeautifulSoup(raw_html, 'html.parser')
for p in html.select('p'):
    if p['id'] == 'walrus':
        print(p.text)
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
from mathematicians import simple_get

# Ask the user which page to archive, fetch it, and save the raw bytes.
website = input(
    "What is the URL of the degree requirments page of your university?: ")
raw_html = simple_get(website)

# BUG FIX: the file handle was opened with open(...) and written without
# ever being closed; the `with` block guarantees the data is flushed to
# disk even if the write raises.  (The no-op bare expressions
# `type(website)` and `len(raw_html)` — REPL leftovers — were dropped.)
with open("degree.html", "wb") as f:
    f.write(raw_html)
from mathematicians import simple_get

# Fetch the blog index and report how many bytes came back.
raw_html = simple_get("https://www.realpython.com/blog")
count = len(raw_html)
print(count)
from mathematicians import simple_get
from bs4 import BeautifulSoup

# BUG FIX: the page lives at 'mathmen.htm' (no trailing 'l'), as every
# other fetch of this page in the project spells it; the '.html' spelling
# does not resolve, so raw_html came back empty.
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
html = BeautifulSoup(raw_html, 'html.parser')

# Print every list item with its index — the page stores one mathematician
# (or several, newline-separated) per <li>.
for i, li in enumerate(html.select('li')):
    print(i, li.text)
# NOTE(review): fragment — `line`, `now`, `simple_get` and `BeautifulSoup`
# are all defined/imported before this view.
print(line)
stat = line.split()  # stat contains the station names
for i in range(len(stat)):
    # Build the NMDB query URL for this station, then patch the hard-coded
    # end date (4 Dec 2019) with today's date.
    NM = "http://www.nmdb.eu/nest/draw_graph.php?formchk=1&stations[]=" + stat[
        i] + "&tabchoice=1h&dtype=corr_for_efficiency&tresolution=43200&yunits=0&date_choice=bydate&start_day=1&start_month=1&start_year=1960&start_hour=0&start_min=0&end_day=4&end_month=12&end_year=2019&end_hour=23&end_min=59&output=ascii"
    year = str(now.year)
    month = str(now.month)
    day = str(now.day)
    print(year + month + day)
    NM = NM.replace("end_year=2019", "end_year=" + year)
    NM = NM.replace("end_month=12", "end_month=" + month)
    NM = NM.replace("end_day=4", "end_day=" + day)
    url = simple_get(NM)
    print(NM)
    html = BeautifulSoup(url, 'html.parser')
    pathP = "/var/www/html/Neutron/Update/" + stat[i] + "P.txt"
    path = "/var/www/html/Neutron/" + stat[i] + ".txt"
    f = open(pathP, "w")
    # NOTE(review): this inner loop reuses `i`, clobbering the outer
    # station index — the next outer iteration will use the wrong station.
    for i, pre in enumerate(html.select('pre')):
        f.write("%s" % (pre.text))
    f.close()
    # delete the useless leading rows
    lines1 = tuple(open(pathP, "r"))
    # NOTE(review): the source is truncated here — the body of `if i > 25:`
    # is missing from the visible chunk.
    with open(pathP, "w+") as file:
        for i in range(len(lines1)):
            if i > 25:
import requests
import shutil
import os
import time
from SFS import g2j
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from mathematicians import simple_get

# Delete the old save in Download and recreate it so the file carries today's date.
shutil.rmtree("Download")
os.mkdir("Download")

# Fetch the Stanford WSO tilt-angle page and dump every <pre> table into a
# date-stamped text file (time.strftime expands %m-%d-%Y in the path).
url = simple_get("http://wso.stanford.edu/Tilts.html")
html = BeautifulSoup(url, 'html.parser')
f = open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "w")
for i, pre in enumerate(html.select('pre')):
    f.write("%s" %(pre.text))
# The script imports the data - complete tables - and saves them with the current date.
# Drop the first row to obtain a column-based format.
lines1 = tuple(open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "r"))
with open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "w+") as file:
    for i in range(len(lines1)):
        if i > 1:
            file.write(lines1[i])
# The complete txt file is now in place.
# Create the SFSN.txt file, converting dates to julian dates.
lines1 = tuple(open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "r"))
fileRav = open(time.strftime('Tilt_R_av.txt'), "w+")
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from mathematicians import simple_get

# NOTE(review): `datetime` and `time` are used below without a visible
# import — presumably imported above this chunk; verify at file top.
now = datetime.datetime.now()
#------------------INVK----------
# NMDB query for the INVK station; the hard-coded end date (4 Dec 2019)
# gets patched to today's date just below.
INVK = "http://www.nmdb.eu/nest/draw_graph.php?formchk=1&stations[]=INVK&tabchoice=1h&dtype=corr_for_efficiency&tresolution=43200&yunits=0&date_choice=bydate&start_day=1&start_month=1&start_year=1960&start_hour=0&start_min=0&end_day=4&end_month=12&end_year=2019&end_hour=23&end_min=59&output=ascii"
year = str(now.year)
month = str(now.month)
day = str(now.day)
print(year + month + day)
INVK = INVK.replace("end_year=2019", "end_year=" + year)
INVK = INVK.replace("end_month=12", "end_month=" + month)
INVK = INVK.replace("end_day=4", "end_day=" + day)
url = simple_get(INVK)
print(INVK)
html = BeautifulSoup(url, 'html.parser')
# Dump every <pre> table from the response into the update file.
f = open(time.strftime('Update/INVKP.txt'), "w")
for i, pre in enumerate(html.select('pre')):
    f.write("%s" % (pre.text))
f.close()
# Delete the useless leading rows (keep only lines after index 25).
lines1 = tuple(open('Update/INVKP.txt', "r"))
with open('Update/INVKP.txt', "w+") as file:
    for i in range(len(lines1)):
        if i > 25:
            file.write(lines1[i])
# INVKP.txt now holds the freshly downloaded data; it must be compared
# against the historical file (the comment referenced Oulu.txt).
def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number of hits
    that mathematician's Wikipedia page received in the last 60 days, as
    an `int`, or None when no pageview figure can be found.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # The pageview figure is in an anchor whose href mentions 'latest-60'.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:  # BUG FIX: narrowed from a bare except
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None


# BUG FIX: in the source, `return None` and the __main__ guard were fused
# into the single token sequence `return Noneif __name__ == '__main__':`,
# a syntax error; they are separated here.
if __name__ == '__main__':
    print('Getting the list of names....')
    names = get_names()
    print('... done.\n')
    results = []
    print('Getting stats for each name....')
    for name in names:
        try:
            hits = get_hits_on_name(name)
            if hits is None:
                hits = -1
            results.append((hits, name))
        except Exception:
            # Best-effort: record a failure marker and keep going, but do
            # not use a bare except (it would swallow KeyboardInterrupt).
            results.append((-1, name))
            log_error('error encountered while processing '
                      '{}, skipping'.format(name))
    print('... done.\n')

    # Sort descending by hit count and report the top five.
    results.sort()
    results.reverse()
    if len(results) > 5:
        top_marks = results[:5]
    else:
        top_marks = results
    print('\nThe most popular mathematicians are:\n')
    for (mark, mathematician) in top_marks:
        print('{} with {} pageviews'.format(mathematician, mark))

    # Count the names for which no pageview figure could be obtained.
    no_results = len([res for res in results if res[0] == -1])
    print('\nBut we did not find results for '
          '{} mathematicians on the list'.format(no_results))
from mathematicians import simple_get
from bs4 import BeautifulSoup

# Report the size of the fetched blog index.
raw_html = simple_get('https://realpython.com/blog/')
print(len(raw_html))

# Pull the mathematician list and dump every <li> together with its index.
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
html = BeautifulSoup(raw_html, 'html.parser')
for index, bullet in enumerate(html.select('li')):
    print(index, bullet.text)
def get_used_subies(dealer):
    # Scrape a dealer's used-inventory page and extract the vehicle records
    # from the DDC tracking data layer embedded in the page, storing them
    # on dealer.carlist.
    # NOTE(review): relies on module-level names defined outside this view
    # (car_dict, car_list, list_of_car_dicts, dealership_dict,
    # car_total_cnt, save_site, re, simple_get, BeautifulSoup).
    #response = simple_get('https://www.capitolsubarusj.com/used-inventory/index.htm?compositeType=&year=&make=Subaru&model=Forester&trim=&bodyStyle=&driveLine=&internetPrice=&saveFacetState=true&lastFacetInteracted=inventory-listing1-facet-anchor-model-1')
    #response = simple_get(subieDealerAddr[dealer])
    response = simple_get(dealer.url)
    save_site(response)
    carCount = 0
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        html_str = str(html)
        names = list()
        text_file = open("out.txt", "w")
        print("DEALER: %s" % dealer.name)
        print("DBGDBG\n")
        # The vehicle data layer lives in a dedicated tracking <div>.
        dbgtest = html.findAll(
            "div", attrs={'data-widget-name': 'tracking-ddc-data-layer'})
        dbgtest_str = str(dbgtest)
        #print("\t%s\n" % dbgtest)
        print("DBGDBG\n")
        stuff = []
        test_list = []
        datalayer = []
        cnt = 0
        cnt1 = 0
        db = {}
        print(type(dbgtest_str))
        # Get Vehicle datalayer
        match = re.search(r"DDC.dataLayer\['(\w+)'\]\s+=\s+\[\n?(.*\n)+\];",
                          dbgtest_str, re.UNICODE)
        mat_tup = ""
        if match:
            print(
                "************************************************************")
            test_list = match.group(0)
            #print(test_list)
            #mat_tup = re.search(r"\{\n(.*\n)+(\}\n)",test_list, re.UNICODE)
            #mat_tup = re.findall(r"\{\n(.*,?\n)+(\}\n)",test_list, re.UNICODE)
            # Grab each car
            mat_tup = re.findall(r"\{\n((\".*\n)+\})", test_list, re.UNICODE)
            if mat_tup:
                print("cars:%d\n" % (len(mat_tup)))
                #print(*mat_tup, sep = "\n\n")
                car_list_in_dealership = []
                car_list_in_dealership = [i[0] for i in mat_tup]
                # DBG print raw text data
                # print(*car_list_in_dealership, sep = "\n\n")
                for car in car_list_in_dealership:
                    cnt1 += 1
                    print("\n\nCAR #%02d" % cnt1)
                    #print(car)
                    # One quoted "key": value pair per line of the record.
                    car_attribs = re.findall(r"\".*,?", car, re.UNICODE)
                    if car_attribs:
                        for itm in car_attribs:
                            #print("%d - %s" % (car_attribs.index(itm)+1, itm))
                            # Parse attributes and put into dict
                            something = re.search(
                                r"\"(\w+)\"\s?:\s*([\"\[]?([\w\\\.]*)[\"\]]?),?",
                                itm)
                            attr = something.group(1)
                            val = something.group(3)
                            #print("a:%s - b:%s\n" % (attr, val))
                            car_dict[attr] = val
                            #print("l:%s - r:%s\n" % (attr, car_dict[attr]))
                            print("%02d - %-23s%s" %
                                  (car_attribs.index(itm) + 1, attr,
                                   car_dict[attr]))
                    print("%d vin:%s - attribs:%d" %
                          (cnt1, car_dict["vin"], len(car_dict)))
                    # Copy Car to list
                    list_of_car_dicts.append(car_dict.copy())
                    dealership_dict[
                        car_dict["accountId"]] = car_dict.copy()
                    print("accountId:%s" % car_dict["accountId"])
            print(
                "************************************************************")
        # for item in dbgtest:
        #     cnt += 1
        #     #print("%02d:%s\n" % (cnt, item))
        #     stuff.append(item)
        # print(cnt)
        # print("%02d:%s\n" % (cnt-1, stuff[cnt-1]))
        global car_total_cnt
        car_total_cnt += cnt1
        print("Car Count %d" % cnt1)
        print(len(car_list))
        print("total Car Count %d" % car_total_cnt)
        # Add carlist to
        #subieDealer[dealer] = list(car_list)
        #subieDealer[dealer] = list(car_list)
        #subieDealer[dealer] = car_list[:]
        dealer.carlist = list_of_car_dicts[:]
        print("dealer.carlist length = %d" % len(dealer.carlist))
        del car_list[:]
        del list_of_car_dicts[:]
        return names
    # NOTE(review): this raise is only reached when response is None, and
    # `url` is undefined in this scope — it would itself raise NameError.
    raise Exception('Error retrieving contents at {}'.format(url))