def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number of hits
    that mathematician's wikipedia page received in the last 60 days,
    as an `int`, or None when no pageview figure can be found.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/{}'
    response = simple_get(url_root.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # The pageview figure lives in an anchor whose href contains 'latest-60'.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # Strip commas (thousands separators) so int() can parse the text.
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # BUG FIX: narrowed from a bare `except:` — only the int()
                # conversion can fail here, and a bare except would also
                # swallow KeyboardInterrupt/SystemExit.
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None
def get_names():
    """Fetch the mathematician list page and return the names it contains."""
    url = "http://www.fabpedigree.com/james/mathmen.htm"
    response = simple_get(url)
    if response is None:
        # Nothing came back at all — surface the failure to the caller.
        raise Exception('Error retrieving contents at {}'.format(url))
    document = BeautifulSoup(response, 'html.parser')
    collected = set()
    # Each <li> can hold several newline-separated names; keep non-empty ones.
    for item in document.select('li'):
        collected.update(part.strip()
                         for part in item.text.split('\n')
                         if len(part) > 0)
    return list(collected)
def get_data(self, url):
    """Fetch a job-listing page and print logo URL, company name and job title for each posting."""
    page = mathematicians.simple_get(url)
    soup = BeautifulSoup(page, 'html.parser')
    info = []
    # One "job_content" <div> per posting on the page.
    for posting in soup.find_all('div', class_="job_content"):
        wrapper = posting.find('div', class_="logo-wrapper")
        image = wrapper.find('img')
        heading = posting.find('h2', class_="title").text
        print('URL logo: ', image['data-src'])
        print('Company Name:', image['alt'])
        print('Job nam', heading)
def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)
    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))
    document = BeautifulSoup(response, 'html.parser')
    found = set()
    for bullet in document.select('li'):
        # A single <li> may contain several names separated by newlines.
        for candidate in bullet.text.split('\n'):
            if len(candidate) > 0:
                found.add(candidate.strip())
    return list(found)
def get_hits_on_name(name):
    """
    Return the number of hits `name`'s Wikipedia page received in the
    last 60 days, or None when no pageview figure can be found.
    """
    # BUG FIX: the URL was hard-coded to Henri Poincaré's page
    # ('.../Henri_Poincar%C3%A9'), so url_root.format(name) had no effect
    # and every caller got the same page.  Use a real template instead.
    url_root = 'https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/{}'
    response = simple_get(url_root.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # The 60-day pageview figure is in an anchor whose href mentions 'latest-60'.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # BUG FIX: strip bare commas (thousands separators); the old code
            # removed ', ' (comma + space), which never occurs in "1,234".
            link_text = hit_link[0].text.replace(',', '')
            try:
                # convert to integer
                return int(link_text)
            except ValueError:  # narrowed from a bare except
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None
def get_hits_on_name(name):
    """
    Accepts a 'name' of a mathematician and returns the number of hits
    that mathematician's Wikipedia page received in the last 60 days,
    as an 'int'
    """
    # Using the xtools API: base endpoint for gathering page statistics.
    # BUG FIX: the original string had no '{}' placeholder, so
    # url.format(name) returned the bare base URL for every mathematician
    # (the author's own comment noted it "currently not working").
    url = 'https://xtools.wmflabs.org/api/page/prose/en.wikipedia.org/{}'
    response = simple_get(url.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # Look for the anchor linking to the 60-day pageview figure.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # strip thousands-separator commas before parsing
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # BUG FIX: narrowed from a bare except, and repaired the
                # malformed log message ("couldn't parse{} ... `int `").
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None
import requests
from mathematicians import simple_get

# Smoke-test the helper against a page that exists...
raw_html = simple_get('https://realpython.com/blog/')
len(raw_html)  # NOTE(review): bare expression — a REPL leftover with no effect in a script

# ...and against one that does not; simple_get presumably returns None on
# failure — confirm against its definition, which is outside this view.
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
no_html is None  # NOTE(review): bare expression — REPL leftover, no effect

# import sys
# print(sys.executable)
# NOTE(review): fragment — the two statements below are the tail of a
# function (presumably simple_get) whose beginning is outside this view;
# as written here they are not valid at module level.
    return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    # A good response is HTTP 200 whose Content-Type mentions 'html'.
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can make it do anything.
    """
    print(e)


# NOTE(review): the lines below are a pasted REPL transcript ('>> >'
# prompts and the output value 33878) — not valid Python as-is.
>> > from mathematicians import simple_get
>> > raw_html = simple_get('https://realpython.com/blog/')
>> > len(raw_html)
33878
import requests
import shutil
import os
import time
from SFS import g2j
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from mathematicians import simple_get

# Delete the old saves in backup/Download and refresh the file with today's date.
shutil.rmtree("/var/www/html/SFS/backup")
os.mkdir("/var/www/html/SFS/backup")
shutil.rmtree("/var/www/html/SFS/Download")
os.mkdir("/var/www/html/SFS/Download")

# Fetch the Stanford WSO polar-field page and dump every <pre> table to a
# date-stamped text file (time.strftime expands %m-%d-%Y in the path).
url = simple_get("http://wso.stanford.edu/Polar.html")
html = BeautifulSoup(url, 'html.parser')
f = open(
    time.strftime('/var/www/html/SFS/backup/Solar_Field_Strenght%m-%d-%Y.txt'),
    "w")
for i, pre in enumerate(html.select('pre')):
    f.write("%s" % (pre.text))
# The script imports the data - complete tables - and saves them with the current date.
# Drop the first row to obtain a column-based format.
lines1 = tuple(
    open(
        time.strftime(
            '/var/www/html/SFS/backup/Solar_Field_Strenght%m-%d-%Y.txt'), "r"))
# NOTE(review): the source is truncated here — this `with open(` call has no
# closing arguments or body in the visible chunk.
with open(
        time.strftime(
            '/var/www/html/SFS/backup/Solar_Field_Strenght%m-%d-%Y.txt'),
"""for the shell . ./venv/bin/activate""" from bs4 import BeautifulSoup from mathematicians import simple_get """from the blog""" raw_html = simple_get('https://realpython.com/blog/') """----len(raw_html)""" g_html = simple_get('https://soundcloud.com/theaipodcast') a_html = simple_get('https://soundcloud.com/a16z') r_html = simple_get('https://www.reddit.com/r/artificial/') t_html = simple_get('https://soundcloud.com/techemergence') ag_html = BeautifulSoup(g_html, 'html.parser') aa_html = BeautifulSoup(a_html, 'html.parser') """ar_html = BeautifulSoup(r_html, 'html.parser')""" at_html = BeautifulSoup(t_html, 'html.parser') #only sound cloud works """for i, li in enumerate(ag_html.select('li')):""" """ print (i, li.text)""" print("a16z") for i, a in enumerate(aa_html.select('a')): if i < 20: print (i, a.text) print("techemergence") for i, a in enumerate(at_html.select('a')): if i < 20: print (i, a.text) print("AIpodcast") for i, a in enumerate(at_html.select('a')): if i < 20:
# NOTE(review): fragment — the statements below are the tail of a function
# (presumably is_good_response(resp)) whose `def` line is outside this view.
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can make it do anything.
    """
    print(e)


# NOTE(review): the rest of this chunk is a pasted REPL session; the bare
# lines `33878` and `True` are interpreter OUTPUT, not statements.
from mathematicians import simple_get
raw_html = simple_get('https://realpython.com/blog/')
len(raw_html)
33878
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
no_html is None
True
from bs4 import BeautifulSoup
raw_html = open('contrived.html').read()
html = BeautifulSoup(raw_html, 'html.parser')
for p in html.select('p'):
    if p['id'] == 'walrus':
        print(p.text)
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
from mathematicians import simple_get

# Ask the user which page to archive, fetch it, and save the raw bytes.
website = input(
    "What is the URL of the degree requirments page of your university?: ")
raw_html = simple_get(website)

# BUG FIX: the file handle was opened with open(...) and written without
# ever being closed; the `with` block guarantees the data is flushed to
# disk even if the write raises.  (The no-op bare expressions
# `type(website)` and `len(raw_html)` — REPL leftovers — were dropped.)
with open("degree.html", "wb") as f:
    f.write(raw_html)
from mathematicians import simple_get

# Fetch the blog index and report how many bytes came back.
raw_html = simple_get("https://www.realpython.com/blog")
count = len(raw_html)
print(count)
from mathematicians import simple_get
from bs4 import BeautifulSoup

# BUG FIX: the page lives at 'mathmen.htm' (no trailing 'l'), as every
# other fetch of this page in the project spells it; the '.html' spelling
# does not resolve, so raw_html came back empty.
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
html = BeautifulSoup(raw_html, 'html.parser')

# Print every list item with its index — the page stores one mathematician
# (or several, newline-separated) per <li>.
for i, li in enumerate(html.select('li')):
    print(i, li.text)
# NOTE(review): fragment — `line`, `now`, `simple_get` and `BeautifulSoup`
# are all defined/imported before this view.
print(line)
stat = line.split()  # stat contains the station names
for i in range(len(stat)):
    # Build the NMDB query URL for this station, then patch the hard-coded
    # end date (4 Dec 2019) with today's date.
    NM = "http://www.nmdb.eu/nest/draw_graph.php?formchk=1&stations[]=" + stat[
        i] + "&tabchoice=1h&dtype=corr_for_efficiency&tresolution=43200&yunits=0&date_choice=bydate&start_day=1&start_month=1&start_year=1960&start_hour=0&start_min=0&end_day=4&end_month=12&end_year=2019&end_hour=23&end_min=59&output=ascii"
    year = str(now.year)
    month = str(now.month)
    day = str(now.day)
    print(year + month + day)
    NM = NM.replace("end_year=2019", "end_year=" + year)
    NM = NM.replace("end_month=12", "end_month=" + month)
    NM = NM.replace("end_day=4", "end_day=" + day)
    url = simple_get(NM)
    print(NM)
    html = BeautifulSoup(url, 'html.parser')
    pathP = "/var/www/html/Neutron/Update/" + stat[i] + "P.txt"
    path = "/var/www/html/Neutron/" + stat[i] + ".txt"
    f = open(pathP, "w")
    # NOTE(review): this inner loop reuses `i`, clobbering the outer
    # station index — the next outer iteration will use the wrong station.
    for i, pre in enumerate(html.select('pre')):
        f.write("%s" % (pre.text))
    f.close()
    # delete the useless leading rows
    lines1 = tuple(open(pathP, "r"))
    # NOTE(review): the source is truncated here — the body of `if i > 25:`
    # is missing from the visible chunk.
    with open(pathP, "w+") as file:
        for i in range(len(lines1)):
            if i > 25:
import requests
import shutil
import os
import time
from SFS import g2j
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from mathematicians import simple_get

# Delete the old save in Download and recreate it so the file carries today's date.
shutil.rmtree("Download")
os.mkdir("Download")

# Fetch the Stanford WSO tilt-angle page and dump every <pre> table into a
# date-stamped text file (time.strftime expands %m-%d-%Y in the path).
url = simple_get("http://wso.stanford.edu/Tilts.html")
html = BeautifulSoup(url, 'html.parser')
f = open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "w")
for i, pre in enumerate(html.select('pre')):
    f.write("%s" %(pre.text))
# The script imports the data - complete tables - and saves them with the current date.
# Drop the first row to obtain a column-based format.
lines1 = tuple(open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "r"))
with open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "w+") as file:
    for i in range(len(lines1)):
        if i > 1:
            file.write(lines1[i])
# The complete txt file is now in place.
# Create the SFSN.txt file, converting dates to julian dates.
lines1 = tuple(open(time.strftime('Download/Solar-TILT_ANGLE%m-%d-%Y.txt'), "r"))
fileRav = open(time.strftime('Tilt_R_av.txt'), "w+")
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from mathematicians import simple_get

# NOTE(review): `datetime` and `time` are used below without a visible
# import — presumably imported above this chunk; verify at file top.
now = datetime.datetime.now()
#------------------INVK----------
# NMDB query for the INVK station; the hard-coded end date (4 Dec 2019)
# gets patched to today's date just below.
INVK = "http://www.nmdb.eu/nest/draw_graph.php?formchk=1&stations[]=INVK&tabchoice=1h&dtype=corr_for_efficiency&tresolution=43200&yunits=0&date_choice=bydate&start_day=1&start_month=1&start_year=1960&start_hour=0&start_min=0&end_day=4&end_month=12&end_year=2019&end_hour=23&end_min=59&output=ascii"
year = str(now.year)
month = str(now.month)
day = str(now.day)
print(year + month + day)
INVK = INVK.replace("end_year=2019", "end_year=" + year)
INVK = INVK.replace("end_month=12", "end_month=" + month)
INVK = INVK.replace("end_day=4", "end_day=" + day)
url = simple_get(INVK)
print(INVK)
html = BeautifulSoup(url, 'html.parser')
# Dump every <pre> table from the response into the update file.
f = open(time.strftime('Update/INVKP.txt'), "w")
for i, pre in enumerate(html.select('pre')):
    f.write("%s" % (pre.text))
f.close()
# Delete the useless leading rows (keep only lines after index 25).
lines1 = tuple(open('Update/INVKP.txt', "r"))
with open('Update/INVKP.txt', "w+") as file:
    for i in range(len(lines1)):
        if i > 25:
            file.write(lines1[i])
# INVKP.txt now holds the freshly downloaded data; it must be compared
# against the historical file (the comment referenced Oulu.txt).
def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number of hits
    that mathematician's Wikipedia page received in the last 60 days, as
    an `int`, or None when no pageview figure can be found.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        # The pageview figure is in an anchor whose href mentions 'latest-60'.
        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]
        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:  # BUG FIX: narrowed from a bare except
                log_error("couldn't parse {} as an `int`".format(link_text))
    log_error('No pageviews found for {}'.format(name))
    return None


# BUG FIX: in the source, `return None` and the __main__ guard were fused
# into the single token sequence `return Noneif __name__ == '__main__':`,
# a syntax error; they are separated here.
if __name__ == '__main__':
    print('Getting the list of names....')
    names = get_names()
    print('... done.\n')
    results = []
    print('Getting stats for each name....')
    for name in names:
        try:
            hits = get_hits_on_name(name)
            if hits is None:
                hits = -1
            results.append((hits, name))
        except Exception:
            # Best-effort: record a failure marker and keep going, but do
            # not use a bare except (it would swallow KeyboardInterrupt).
            results.append((-1, name))
            log_error('error encountered while processing '
                      '{}, skipping'.format(name))
    print('... done.\n')

    # Sort descending by hit count and report the top five.
    results.sort()
    results.reverse()
    if len(results) > 5:
        top_marks = results[:5]
    else:
        top_marks = results
    print('\nThe most popular mathematicians are:\n')
    for (mark, mathematician) in top_marks:
        print('{} with {} pageviews'.format(mathematician, mark))

    # Count the names for which no pageview figure could be obtained.
    no_results = len([res for res in results if res[0] == -1])
    print('\nBut we did not find results for '
          '{} mathematicians on the list'.format(no_results))
from mathematicians import simple_get
from bs4 import BeautifulSoup

# Report the size of the fetched blog index.
raw_html = simple_get('https://realpython.com/blog/')
print(len(raw_html))

# Pull the mathematician list and dump every <li> together with its index.
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
html = BeautifulSoup(raw_html, 'html.parser')
for index, bullet in enumerate(html.select('li')):
    print(index, bullet.text)
def get_used_subies(dealer):
    # Scrape a dealer's used-inventory page and extract the vehicle records
    # from the DDC tracking data layer embedded in the page, storing them
    # on dealer.carlist.
    # NOTE(review): relies on module-level names defined outside this view
    # (car_dict, car_list, list_of_car_dicts, dealership_dict,
    # car_total_cnt, save_site, re, simple_get, BeautifulSoup).
    #response = simple_get('https://www.capitolsubarusj.com/used-inventory/index.htm?compositeType=&year=&make=Subaru&model=Forester&trim=&bodyStyle=&driveLine=&internetPrice=&saveFacetState=true&lastFacetInteracted=inventory-listing1-facet-anchor-model-1')
    #response = simple_get(subieDealerAddr[dealer])
    response = simple_get(dealer.url)
    save_site(response)
    carCount = 0
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        html_str = str(html)
        names = list()
        text_file = open("out.txt", "w")
        print("DEALER: %s" % dealer.name)
        print("DBGDBG\n")
        # The vehicle data layer lives in a dedicated tracking <div>.
        dbgtest = html.findAll(
            "div", attrs={'data-widget-name': 'tracking-ddc-data-layer'})
        dbgtest_str = str(dbgtest)
        #print("\t%s\n" % dbgtest)
        print("DBGDBG\n")
        stuff = []
        test_list = []
        datalayer = []
        cnt = 0
        cnt1 = 0
        db = {}
        print(type(dbgtest_str))
        # Get Vehicle datalayer
        match = re.search(r"DDC.dataLayer\['(\w+)'\]\s+=\s+\[\n?(.*\n)+\];",
                          dbgtest_str, re.UNICODE)
        mat_tup = ""
        if match:
            print(
                "************************************************************")
            test_list = match.group(0)
            #print(test_list)
            #mat_tup = re.search(r"\{\n(.*\n)+(\}\n)",test_list, re.UNICODE)
            #mat_tup = re.findall(r"\{\n(.*,?\n)+(\}\n)",test_list, re.UNICODE)
            # Grab each car
            mat_tup = re.findall(r"\{\n((\".*\n)+\})", test_list, re.UNICODE)
            if mat_tup:
                print("cars:%d\n" % (len(mat_tup)))
                #print(*mat_tup, sep = "\n\n")
                car_list_in_dealership = []
                car_list_in_dealership = [i[0] for i in mat_tup]
                # DBG print raw text data
                # print(*car_list_in_dealership, sep = "\n\n")
                for car in car_list_in_dealership:
                    cnt1 += 1
                    print("\n\nCAR #%02d" % cnt1)
                    #print(car)
                    # One quoted "key": value pair per line of the record.
                    car_attribs = re.findall(r"\".*,?", car, re.UNICODE)
                    if car_attribs:
                        for itm in car_attribs:
                            #print("%d - %s" % (car_attribs.index(itm)+1, itm))
                            # Parse attributes and put into dict
                            something = re.search(
                                r"\"(\w+)\"\s?:\s*([\"\[]?([\w\\\.]*)[\"\]]?),?",
                                itm)
                            attr = something.group(1)
                            val = something.group(3)
                            #print("a:%s - b:%s\n" % (attr, val))
                            car_dict[attr] = val
                            #print("l:%s - r:%s\n" % (attr, car_dict[attr]))
                            print("%02d - %-23s%s" %
                                  (car_attribs.index(itm) + 1, attr,
                                   car_dict[attr]))
                    print("%d vin:%s - attribs:%d" %
                          (cnt1, car_dict["vin"], len(car_dict)))
                    # Copy Car to list
                    list_of_car_dicts.append(car_dict.copy())
                    dealership_dict[
                        car_dict["accountId"]] = car_dict.copy()
                    print("accountId:%s" % car_dict["accountId"])
            print(
                "************************************************************")
        # for item in dbgtest:
        #     cnt += 1
        #     #print("%02d:%s\n" % (cnt, item))
        #     stuff.append(item)
        # print(cnt)
        # print("%02d:%s\n" % (cnt-1, stuff[cnt-1]))
        global car_total_cnt
        car_total_cnt += cnt1
        print("Car Count %d" % cnt1)
        print(len(car_list))
        print("total Car Count %d" % car_total_cnt)
        # Add carlist to
        #subieDealer[dealer] = list(car_list)
        #subieDealer[dealer] = list(car_list)
        #subieDealer[dealer] = car_list[:]
        dealer.carlist = list_of_car_dicts[:]
        print("dealer.carlist length = %d" % len(dealer.carlist))
        del car_list[:]
        del list_of_car_dicts[:]
        return names
    # NOTE(review): this raise is only reached when response is None, and
    # `url` is undefined in this scope — it would itself raise NameError.
    raise Exception('Error retrieving contents at {}'.format(url))