# CLI dispatch: the first argument selects 'scrape' or 'convert'; the second
# picks which CSV files to build from the scraped pages.
def main():
    args = sys.argv
    if args[1] == 'scrape':
        patients_url, querents_url, inspections_url = get_url()
        if args[2] in {'inspections', 'all'}:
            create_inspections_csv(inspections_url)
        if args[2] in {'patients', 'all'}:
            create_patients_csv(patients_url)
        if args[2] in {'querents', 'all'}:
            create_querents_csv(querents_url)
        if args[2] not in {'all', 'inspections', 'patients', 'querents'}:
            print('invalid second argument!')
    elif args[1] == 'convert':
        convert()
    else:
        print('invalid first argument!')
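# Assumed entry point, not shown in the snippet above: main() relies on 'sys'
# being imported and on get_url, create_*_csv and convert coming from the
# project's scraping/conversion modules. A standard guard so the script runs
# only when executed directly:
if __name__ == '__main__':
    main()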
# local module imports
from config import urls
import data_processor as dp
import scraper as scrape

# external module imports
from bs4 import BeautifulSoup

# Fetch the Titanic dataset page, parse it, and run the analysis.
page = scrape.get_url(urls['titanic_data']).text
soup = BeautifulSoup(page, 'html.parser')
titanic = dp.populate(soup)
dp.analyse(titanic.passengers)
import data_processor as dp
import scraper as scrape
import custom_exception
from bs4 import BeautifulSoup

# Dictionary of websites to access
urls = {
    # 'Web Scraper Test Site': 'http://webscraper.io',
    'StarLadder': 'https://starladder.com/en/starseries-i-league-pubg',
    # 'Google': 'http://www.google.com',
    # 'Reddit': 'https://www.reddit.com'
}

# Scrape each site, parse the historic standings and stats, and print the
# aggregated results; sites that disallow scraping raise DisallowedException.
for key, value in urls.items():
    try:
        page = BeautifulSoup(scrape.get_url(value).text, 'html.parser')
        team_standings = dp.parse_historic(page)
        team_stats = dp.parse_historic_stats(page)
        final_results = dp.aggregate_historic_results(team_standings, team_stats)
        print(final_results)
    except custom_exception.DisallowedException as e:
        print(
            'Connection to {} not permitted with HTTP code {}. '
            'Does its robots.txt allow access?'.format(value, e.status))
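# custom_exception is one of the project's own modules and its source is not
# shown here. Below is only a minimal sketch of what it likely provides,
# inferred from the except-clause above (an exception named DisallowedException
# that carries the HTTP status in a 'status' attribute); the real definition
# may differ.
class DisallowedException(Exception):
    """Raised when a site's robots.txt or server response forbids scraping."""

    def __init__(self, status):
        super().__init__('access disallowed (HTTP status {})'.format(status))
        self.status = status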
import scraper
from bs4 import BeautifulSoup

# Fetch the Kijiji search results page for Ninja 300 listings with a price filter.
html = scraper.get_url(
    "https://www.kijiji.ca/b-motorcycles/canada/ninja-300/k0c30l0?price=1000__&dc=true"
)
soup = scraper.ad(html)
# print(soup)
# for i in range(5):
#     print("---------------------------------------")
# print(soup[2])

for i in range(5):
    print("---------------------------------------")
for i in soup:
    print(scraper.link(i))
# print(soup.prettify())
from scrapeFlowsheet import get_flowsheet_info
from scraper import get_url
from app import db, UBClasses, UBRecitation, Degree
import re

base_url = 'https://www.eng.buffalo.edu/undergrad/advisement/flowsheets/'
path = '//*[@id="div_content"]/ul/li/a'
links = get_url(base_url, path)

# Keep only the flowsheet links at these positions. A membership test with
# '==' semantics replaces the original chain of 'is' comparisons, which only
# worked because CPython interns small integers.
modified_links = []
for i in range(len(links)):
    if i in {1, 4, 5, 6, 7, 9, 11, 14, 15, 17}:
        modified_links.append(links[i])
# (continuation of a column-by-column parser: 'position' tracks which of the
#  11 fields is being read and curr_data accumulates the current field's text)
                else:
                    instructor.append(curr_data)
            elif position == 11:
                status.append(curr_data)
            # advance to the next column, wrapping back to 1 after column 11
            if position >= 11:
                position = 1
            else:
                position += 1
            curr_data = ''
        else:
            # not a delimiter: keep accumulating the current field's text
            curr_data += data[i]
    return data


index = 0
semester_links = get_url(base_url, '//*[@id="content_internal"]/ul/li/a')

# drop the leading links, which are not semester pages
semester_links.remove(semester_links[1])
semester_links.remove(semester_links[0])
semester_links.remove(semester_links[0])
semester_links.remove(semester_links[0])
semester_links.remove(semester_links[0])
semester_links.remove(semester_links[0])

for semm in semester_links:
    print(semm)

broken_departments = []
for semester in semester_links:
    print(semester)
    # print link
    department_links = get_url(
        semester, '/html/body/table[4]/tr/td[1]/table/tr/td[1]/a')
    for department in department_links: