def scrape_bbc_home(uReq, soup, keyword_list):
    logging = None
    base_url = 'http://www.bbc.co.uk'  # url to scrape
    init_path = "/news"  # base url extension
    page_html = access_url(base_url + init_path, uReq)  # make request for page
    if page_html is not None:
        page_soup = soup(page_html, "html.parser")  # convert the html to a soup object
        tag_array = page_soup.findAll("div", {"class": "gs-c-promo"})  # find promo tags in the soup object
        if len(tag_array) > 0:  # only execute if tags have been found
            beef_objects = []
            # load saved urls
            saved_urls = get_saved_urls(base_url)
            percent_per_scrape = 100 / len(tag_array)
            for x in range(0, len(tag_array)):  # for each tag
                print(str(round(x * percent_per_scrape)) + "% complete.")
                if tag_array[x].a:  # ensure the element has an anchor tag
                    # check if the a href is an absolute url or an absolute path
                    if "http://" in tag_array[x].a["href"]:
                        sub_page_url = tag_array[x].a["href"]
                    else:
                        sub_page_url = base_url + tag_array[x].a["href"]
                    path_split_1 = sub_page_url.split("/")  # split path by /
                    path_split_2 = path_split_1[len(path_split_1) - 1].split("-")  # get final field in path_split_1 and split by -
                    if path_split_2[0] != "blogs":  # ensure we are not scraping a blog page
                        # check through pre-loaded urls to ensure url has not already been scraped
                        if any(url_obj["url"] == sub_page_url for url_obj in saved_urls):
                            if logging:
                                print("preloaded url found, aborting scrape.")
                        else:
                            if logging:
                                print("preloaded url not found, initiating scrape.")
                            # url must be saved under these conditions:
                            # 1. it has not been previously scraped,
                            # 2. it may not be relevant to beef and therefore may not be added to selected events
                            save_url(base_url, sub_page_url)
                            beef_object = scrape_article(sub_page_url, uReq, soup, keyword_list)  # scrape this article
                            if beef_object is not None:
                                beef_objects.append(beef_object)
                                # beef_object.print_beef()
            return beef_objects
        else:
            return []
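
# Hedged illustration (not part of the original module): scrape_bbc_home skips blog
# pages by splitting the final path segment on "-" and checking its first field.
# The stand-alone helper below shows that same check under a hypothetical name;
# it is a sketch for clarity, not code the scraper actually calls.
def _looks_like_bbc_blog(url):
    """Return True if the final path segment starts with 'blogs' (same test as scrape_bbc_home)."""
    last_segment = url.split("/")[-1]
    return last_segment.split("-")[0] == "blogs"

# Example: _looks_like_bbc_blog("http://www.bbc.co.uk/news/blogs-trending-12345")  -> True
# Example: _looks_like_bbc_blog("http://www.bbc.co.uk/news/world-europe-12345")    -> False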
def scrape_hiphopdx_home(uReq, soup, keyword_list):
    logging = None
    base_url = 'https://hiphopdx.com'  # url to scrape
    initial_suffix = "/news"
    raw_page_html = access_url(base_url + initial_suffix, uReq)  # make request for page
    if raw_page_html is not None:
        page_soup = soup(raw_page_html, "html.parser")  # convert the html to a soup object
        news_tag = page_soup.find("div", {"class": "wire"})  # find the news wire container in the soup object
        beef_objects = []
        # load saved urls
        saved_urls = get_saved_urls(base_url)
        percent_per_scrape = 100 / len(news_tag.findAll("a"))
        if len(news_tag) > 0:  # only execute if tags have been found
            for x, a in enumerate(news_tag.findAll("a")):
                print(str(round(x * percent_per_scrape)) + "% complete.")
                if a and a["href"] and a["class"][0] != "next":  # skip the pagination link
                    sub_page_url = base_url + a["href"]
                    # check through pre-loaded urls to ensure url has not already been scraped
                    if any(url_obj["url"] == sub_page_url for url_obj in saved_urls):
                        if logging:
                            print("preloaded url found, aborting scrape.")
                    else:
                        if logging:
                            print("preloaded url not found, initiating scrape.")
                        # url must be saved under these conditions:
                        # 1. it has not been previously scraped,
                        # 2. it may not be relevant to beef and therefore may not be added to selected events
                        save_url(base_url, sub_page_url)
                        beef_object = scrape_article(sub_page_url, uReq, soup, keyword_list)
                        if beef_object is not None:
                            beef_objects.append(beef_object)
            return beef_objects
        else:
            return []
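
# Hedged sketch (hypothetical refactor, not in the original code): every scraper in
# this module repeats the same membership test over the records returned by
# get_saved_urls. A small helper like the one below could factor that out; the
# name is illustrative only and nothing above depends on it.
def _already_scraped(saved_urls, sub_page_url):
    """True if sub_page_url already appears in the pre-loaded url records."""
    return any(url_obj["url"] == sub_page_url for url_obj in saved_urls)

# Example:
# _already_scraped([{"url": "https://hiphopdx.com/news/example"}],
#                  "https://hiphopdx.com/news/example")  -> True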
def scrape_cnn_home(uReq, soup, keyword_list):
    logging = None
    base_url = 'http://edition.cnn.com'  # url to scrape
    raw_page_html = access_url(base_url, uReq)  # make request for page
    if raw_page_html is not None:
        print(raw_page_html)
        page_soup = soup(raw_page_html, "html.parser")  # convert the html to a soup object
        tag_array = page_soup.findAll("script")  # find script tags in the soup object
        if len(tag_array) > 0:  # only execute if tags have been found
            if tag_array[10].text:  # ensure the script tag has text content
                beef_objects = []
                # load saved urls
                saved_urls = get_saved_urls(base_url)
                script_text = tag_array[10].text
                print(script_text)
                result = re.search('CNN.contentModel = (.*);', script_text)
                if result:  # only continue if the content model was found in the script text
                    script_json = demjson.decode(result.group(1))
                    percent_per_scrape = 100 / len(script_json['siblings']['articleList'])
                    for x in range(0, len(script_json['siblings']['articleList'])):  # for each article entry
                        print(str(round(x * percent_per_scrape)) + "% complete.")
                        sub_page_url = base_url + script_json['siblings']['articleList'][x]['uri']
                        # check through pre-loaded urls to ensure url has not already been scraped
                        if any(url_obj["url"] == sub_page_url for url_obj in saved_urls):
                            if logging:
                                print("preloaded url found, aborting scrape.")
                        else:
                            if logging:
                                print("preloaded url not found, initiating scrape.")
                            # url must be saved under these conditions:
                            # 1. it has not been previously scraped,
                            # 2. it may not be relevant to beef and therefore may not be added to selected events
                            save_url(base_url, sub_page_url)
                            beef_object = scrape_article(sub_page_url, uReq, soup, keyword_list)
                            if beef_object is not None:
                                beef_objects.append(beef_object)
                    return beef_objects
                else:
                    return []
            else:
                return []
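
# Hedged illustration (not part of the original module): scrape_cnn_home pulls its
# article list out of an inline <script> tag by matching "CNN.contentModel = ...;"
# and decoding the JavaScript object literal with demjson, which (unlike the
# standard json module) tolerates single quotes and unquoted keys. The script_text
# below is a made-up toy payload, and the function name is hypothetical.
def _demo_cnn_content_model():
    import re
    import demjson
    script_text = "CNN.contentModel = {'siblings': {'articleList': [{'uri': '/news/example-story'}]}};"
    match = re.search('CNN.contentModel = (.*);', script_text)
    if match:
        model = demjson.decode(match.group(1))
        return [entry['uri'] for entry in model['siblings']['articleList']]
    return []

# Example: _demo_cnn_content_model()  -> ['/news/example-story']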
def scrape_give_me_sport_home(uReq, soup, keyword_list):
    logging = None
    base_url = 'http://www.givemesport.com/'  # url to scrape
    raw_page_html = access_url(base_url, uReq)  # make request for page
    if raw_page_html is not None:
        page_soup = soup(raw_page_html, "html.parser")  # convert the html to a soup object
        news_tag_array = page_soup.find("section", {"id": "gms-trending"})
        if news_tag_array:
            news_tag_array = news_tag_array.findAll("article", {"class": "gms-feature"})  # find feature article tags in the trending section
            beef_objects = []
            # load saved urls
            saved_urls = get_saved_urls(base_url)
            percent_per_scrape = 100 / len(news_tag_array)
            if len(news_tag_array) > 0:  # only execute if tags have been found
                for x, news_tag in enumerate(news_tag_array):
                    print(str(round(x * percent_per_scrape)) + "% complete.")
                    if news_tag and news_tag.a and news_tag.a["href"]:
                        sub_page_url = base_url + news_tag.a["href"]
                        # check through pre-loaded urls to ensure url has not already been scraped
                        if any(url_obj["url"] == sub_page_url for url_obj in saved_urls):
                            if logging:
                                print("preloaded url found, aborting scrape.")
                        else:
                            if logging:
                                print("preloaded url not found, initiating scrape.")
                            # url must be saved under these conditions:
                            # 1. it has not been previously scraped,
                            # 2. it may not be relevant to beef and therefore may not be added to selected events
                            save_url(base_url, sub_page_url)
                            beef_object = scrape_article(sub_page_url, uReq, soup, keyword_list)
                            if beef_object is not None:
                                beef_objects.append(beef_object)
                return beef_objects
            else:
                return []
        else:
            return []
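
# Hedged usage sketch (not part of the original module): one way the four home
# scrapers above might be driven. The import aliases uReq/soup and the keyword
# list are assumptions for illustration; the project's real entry point may differ.
if __name__ == "__main__":
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup

    keyword_list = ["beef", "feud", "diss"]  # illustrative keywords only
    results = []
    for scraper in (scrape_bbc_home, scrape_hiphopdx_home,
                    scrape_cnn_home, scrape_give_me_sport_home):
        found = scraper(uReq, soup, keyword_list)
        if found:  # a scraper may return None or [] when nothing usable was found
            results.extend(found)
    print(str(len(results)) + " beef objects scraped.")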