def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject from it.

    path         -- URL of the article (also stored as the data source)
    uReq         -- request opener passed through to access_url
    soup         -- BeautifulSoup constructor (injected, e.g. bs4.BeautifulSoup)
    keyword_list -- if non-empty, the story is kept only when some paragraph
                    contains one of the keywords; if empty, every story passes

    Returns the BeefObject, or None when the page cannot be fetched, has no
    "page-content" div, or fails the keyword filter.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is not None:
        sub_page_soup = soup(sub_page_html, "html.parser")
        # Article body lives in the first div.page-content
        content_tag_array = sub_page_soup.findAll("div", {"class": "page-content"})
        relevant_story = None
        if len(content_tag_array) > 0:
            content_string = ""  # init content string
            img_link = ""  # first inline image found, if any
            # Gather paragraph text; paragraphs wrapping an <img> supply the
            # article image instead of text.
            for p in content_tag_array[0].findAll('p'):
                if p.a is None:
                    content_string += p.text
                # FIX: use .get("src") — subscripting a Tag raises KeyError
                # when the attribute is missing, defeating the None check.
                elif p.a.img is not None and p.a.img.get("src") is not None:
                    img_link = p.a.img["src"]
                # Keyword filter: an empty keyword list auto-approves the story
                if len(keyword_list) > 0:
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True
                else:
                    relevant_story = True
            # FIX: capture the cleaned text — str is immutable, so the bare
            # call was a no-op. NOTE(review): assumes scrub_content_text
            # returns the scrubbed string — confirm against its definition.
            content_string = globals.scrub_content_text(content_string)
            title = sub_page_soup.findAll("div", {"class": "page_header"})[0].h2.text
            # Article is relevant, build a beef record
            if relevant_story:
                # Classify event and store the classification for later use
                store_event_classification(title, content_string)
                media_tag_aray = sub_page_soup.findAll("iframe")
                # Date appears as "Posted D/M/Y"; reorder to M/D/Y-style slots
                date_string = sub_page_soup.find("span", {"class": "date"}).text.replace("Posted ", "")
                date_split = date_string.split("/")
                date_string = date_split[1] + "/" + date_split[0] + "/" + date_split[2]
                actors_list = extract_names(content_string)  # extract actors from content_string
                highlights = extract_quotes(content_string)  # extract quotes from content_string
                categories = [1]
                media_link = {"link": "", "type": ""}
                if len(media_tag_aray) > 0:
                    # Classify the first embedded iframe by its host
                    link = media_tag_aray[0]["src"]
                    link_type = ""
                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    media_link = {"link": link, "type": link_type}
                # BeefObject(title, relevant_actors, content, date, highlights,
                #            data_source, categories, img_title, media_link)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      date_string, highlights, path,
                                      categories, img_link, media_link)
                return beef_obj
            else:
                return None
        else:
            return None
    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject from it.

    path         -- URL of the article (also stored as the data source)
    uReq         -- request opener passed through to access_url
    soup         -- BeautifulSoup constructor (injected, e.g. bs4.BeautifulSoup)
    keyword_list -- if non-empty, the story is kept only when some paragraph
                    contains one of the keywords; if empty, every story passes

    Returns the BeefObject, or None when the page cannot be fetched, has no
    "article-content-container" div, or fails the keyword filter.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is not None:
        sub_page_soup = soup(sub_page_html, "html.parser")
        body_tag = sub_page_soup.find("div", {"class": "article-content-container"})
        relevant_story = None
        if body_tag:
            content_string = ""  # init content string
            # Accumulate paragraph text and apply the keyword filter
            for p in body_tag.section.findAll("p"):
                content_string += p.text
                if p is not None and len(keyword_list) > 0:
                    # Stop scanning as soon as one keyword matches
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True
                        break
                else:
                    # Empty keyword list auto-approves the story
                    relevant_story = True
            # FIX: capture the cleaned text — str is immutable, so the bare
            # call was a no-op. NOTE(review): assumes scrub_content_text
            # returns the scrubbed string — confirm against its definition.
            content_string = globals.scrub_content_text(content_string)
            title = body_tag.h2.text.strip()  # beef object title
            # Article is relevant, build a beef record
            if relevant_story:
                # Classify event and store the classification for later use
                store_event_classification(title, content_string)
                # FIX: img_link was unbound (NameError at BeefObject(...))
                # whenever no gallery-cover image exists; default to "".
                img_link = ""
                img_tag_array = sub_page_soup.findAll("img", {"class": "article-gallery-cover"})
                if len(img_tag_array) > 0 and img_tag_array[0]["src"]:
                    img_link = img_tag_array[0]["src"]
                # Date text looks like "Month DD, YYYY ..."; rebuild as
                # DD/<month number>/YYYY
                date_string = sub_page_soup.find("div", {"class": "editorBlock-date"}).text.replace("\n", "")
                date_split = date_string.lstrip().split(", ")  # [0]=month+day, [1]=year+rest
                secondary_date_split = date_split[0].split(" ")  # separate month and day
                tertiary_date_split = date_split[1].split(" ")  # separate year from rest
                final_date_string = (secondary_date_split[1] + "/" +
                                     str(globals.get_month_number(secondary_date_split[0])) +
                                     "/" + tertiary_date_split[0])
                actors_list = extract_names(content_string)  # extract actors from content_string
                highlights = extract_quotes(content_string)  # extract quotes from content_string
                categories = [1]
                link_raw = body_tag.findAll("iframe")
                link = ""
                link_type = ""
                media_link = {"link": "", "type": ""}
                if len(link_raw) > 0:
                    # Classify the first embedded iframe by its host
                    link = link_raw[0]["src"]
                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    else:
                        link_type = "video_embed"
                    media_link = {"link": link, "type": link_type}
                # BeefObject(title, relevant_actors, content, date, highlights,
                #            data_source, categories, img_title, media_link)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      final_date_string, highlights, path,
                                      categories, img_link, media_link)
                return beef_obj
            else:
                return None
        else:
            return None
    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject from it.

    path         -- URL of the article (also stored as the data source)
    uReq         -- request opener passed through to access_url
    soup         -- BeautifulSoup constructor (injected, e.g. bs4.BeautifulSoup)
    keyword_list -- if non-empty, the story is kept only when some paragraph
                    contains one of the keywords; if empty, every story passes

    Returns the BeefObject, or None when the page cannot be fetched, has no
    title or usable image, or fails the keyword filter.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is not None:
        sub_page_soup = soup(sub_page_html, "html.parser")
        relevant_story = None
        if sub_page_soup:
            content_string = ""  # init content string
            for p in sub_page_soup.findAll("p"):
                # Keep paragraphs that have no link or only internal
                # ("bossip") links, and drop boilerplate/newsletter blurbs.
                if (p is not None and
                        (p.a is None or "bossip" in p.a["href"]) and
                        "Bossip Newsletter" not in p.text and
                        "WENN" not in p.text):
                    content_string += p.text
                # Keyword filter: an empty keyword list auto-approves the
                # story. (Dead "or len(keyword_list) == 0" clause removed —
                # this branch is only reached when the list is non-empty.)
                if p is not None and len(keyword_list) > 0:
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True
                else:
                    relevant_story = True
            # FIX: capture the cleaned text — str is immutable, so the bare
            # call was a no-op. NOTE(review): assumes scrub_content_text
            # returns the scrubbed string — confirm against its definition.
            content_string = globals.scrub_content_text(content_string)
            title_tag = sub_page_soup.find("h1")
            if title_tag and title_tag.text:
                title = title_tag.text.split("[")[0]  # drop "[...]" suffixes
            else:
                # FIX: title was unbound (NameError) below when the page has
                # no <h1>; bail out instead.
                return None
            # Article is relevant, build a beef record
            if relevant_story:
                # Classify event and store the classification for later use
                store_event_classification(title, content_string)
                img_tag_array = sub_page_soup.findAll("img", {"class": ["size-large", "size-full"]})
                if len(img_tag_array) > 0 and img_tag_array[0]["src"]:
                    img_link = img_tag_array[0]["src"]
                else:
                    return None  # an image is mandatory for this source
                # datetime attribute is "YYYY-MM-DD ..."; rebuild as DD/MM/YYYY
                date_string = sub_page_soup.find("time", {"class": "date"})["datetime"]
                date_split = date_string.lstrip().split("-")
                final_date_string = (date_split[2].split(" ")[0] + "/" +
                                     date_split[1] + "/" + date_split[0])
                actors_list = extract_names(content_string)  # extract actors from content_string
                highlights = extract_quotes(content_string)  # extract quotes from content_string
                categories = [1]
                link_raw = sub_page_soup.findAll("iframe")
                link = ""
                link_type = ""
                media_link = {"link": "", "type": ""}
                if len(link_raw) > 0:
                    # Classify the first embedded iframe by its host
                    link = link_raw[0]["src"]
                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    else:
                        link_type = "video_embed"
                    media_link = {"link": link, "type": link_type}
                # BeefObject(title, relevant_actors, content, date, highlights,
                #            data_source, categories, img_title, media_link)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      final_date_string, highlights, path,
                                      categories, img_link, media_link)
                return beef_obj
            else:
                return None
        else:
            return None
    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject from it.

    path         -- URL of the article (also stored as the data source)
    uReq         -- request opener passed through to access_url
    soup         -- BeautifulSoup constructor (injected, e.g. bs4.BeautifulSoup)
    keyword_list -- if non-empty, the story is kept only when some paragraph
                    contains one of the keywords; if empty, every story passes

    Returns the BeefObject, or None when the page cannot be fetched, has no
    "OutputBody" div, or fails the keyword filter.
    """
    # FIX: removed the local `logging = None` flag — it shadowed the stdlib
    # module name and was always falsy, so `if logging: print(...)` was dead.
    sub_page_html = access_url(path, uReq)
    if sub_page_html is not None:
        sub_page_soup = soup(sub_page_html, "html.parser")
        body_tag = sub_page_soup.find("div", {"id": "OutputBody"})
        relevant_story = None
        if body_tag:
            content_string = ""  # init content string
            for p in body_tag.findAll("p"):
                # Skip site boilerplate paragraphs
                if (p is not None and
                        "Do YOU want to write for GiveMeSport?" not in p.text and
                        "Have your say in the comments section below." not in p.text):
                    content_string += p.text
                # Keyword filter: an empty keyword list auto-approves the story
                if len(keyword_list) > 0:
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True
                else:
                    relevant_story = True
            # FIX: capture the cleaned text — str is immutable, so the bare
            # call was a no-op. NOTE(review): assumes scrub_content_text
            # returns the scrubbed string — confirm against its definition.
            content_string = globals.scrub_content_text(content_string)
            title = sub_page_soup.find("h1", {"class": "gms-article-title"}).text
            # Article is relevant, build a beef record
            if relevant_story:
                # Classify event and store the classification for later use
                store_event_classification(title, content_string)
                img_link = sub_page_soup.find("img", {"id": "EdImg"})["src"]
                # datetime attribute is "YYYY-MM-DDT..."; rebuild as DD/MM/YYYY
                date_string = sub_page_soup.find("p", {"class": "gms-article-data"}).span.time["datetime"]
                date_split = date_string.lstrip().split("-")
                final_date_string = (date_split[2].split("T")[0] + "/" +
                                     date_split[1] + "/" + date_split[0])
                actors_list = extract_names(content_string)  # extract actors from content_string
                highlights = extract_quotes(content_string)  # extract quotes from content_string
                categories = [4]  # this source is always filed under sport
                link_raw = body_tag.findAll("iframe")
                link = ""
                link_type = ""
                media_link = {"link": "", "type": ""}
                if len(link_raw) > 0:
                    # Classify the first embedded iframe by its host; note this
                    # source stores the URL in "data-url", not "src".
                    link = link_raw[0]["data-url"]
                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    else:
                        link_type = "video_embed"
                    media_link = {"link": link, "type": link_type}
                # BeefObject(title, relevant_actors, content, date, highlights,
                #            data_source, categories, img_title, media_link)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      final_date_string, highlights, path,
                                      categories, img_link, media_link)
                return beef_obj
            else:
                return None
        else:
            return None
    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject from it.

    path         -- URL of the article (also stored as the data source)
    uReq         -- request opener passed through to access_url
    soup         -- BeautifulSoup constructor (injected, e.g. bs4.BeautifulSoup)
    keyword_list -- if non-empty, the story is kept only when some paragraph
                    contains one of the keywords; if empty, every story passes

    Returns the BeefObject, or None when the page cannot be fetched, has no
    "story-body__inner" div, or fails the keyword filter.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is not None:
        sub_page_soup = soup(sub_page_html, "html.parser")
        content_tag_array = sub_page_soup.findAll("div", {"class": "story-body__inner"})
        relevant_story = None
        if len(content_tag_array) > 0:
            content_string = ""  # init content string
            for p in content_tag_array[0].findAll('p'):
                content_string += p.text
                # Keyword filter: an empty keyword list auto-approves the story
                if len(keyword_list) > 0:
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True
                        #break
                else:
                    relevant_story = True
            # FIX: capture the cleaned text — str is immutable, so the bare
            # call was a no-op. NOTE(review): assumes scrub_content_text
            # returns the scrubbed string — confirm against its definition.
            content_string = globals.scrub_content_text(content_string)
            title = sub_page_soup.findAll("h1", {"class": "story-body__h1"})[0].text
            # Article is relevant, build a beef record
            if relevant_story:
                # NOTE: classification deliberately disabled for this source
                #store_event_classification(title, content_string)
                # First mini-info item carries "D Month YYYY" in data-datetime;
                # rebuild as D/<month number>/YYYY
                mini_info_panel_tag_array = sub_page_soup.findAll("li", {"class": "mini-info-list__item"})
                date_string_split = mini_info_panel_tag_array[0].div["data-datetime"].split(" ")
                date_string = (str(date_string_split[0]) + "/" +
                               str(globals.get_month_number(date_string_split[1])) +
                               "/" + str(date_string_split[2]))
                actors_list = extract_names(content_string)  # extract actors from content_string
                highlights = extract_quotes(content_string)  # extract quotes from content_string
                # Map the section label in the second mini-info item onto
                # internal category ids (2=politics, 4=sport, 6=technology)
                categories = []
                if (len(mini_info_panel_tag_array) > 1 and
                        mini_info_panel_tag_array[1].a is not None and
                        mini_info_panel_tag_array[1].a.text is not None):
                    category = mini_info_panel_tag_array[1].a.text
                    if "politics" in category.lower():
                        categories.append(2)
                    if "sport" in category.lower():
                        categories.append(4)
                    if "technology" in category.lower():
                        categories.append(6)
                # If the article contains references to images, extract the
                # first one (lazy-loaded div vs plain img)
                img_tag_array = sub_page_soup.findAll("span", {"class": "image-and-copyright-container"})
                img_link = ""
                if len(img_tag_array) > 0:
                    if img_tag_array[0].div:
                        img_link = img_tag_array[0].div["data-src"]
                    elif img_tag_array[0].img:
                        img_link = img_tag_array[0].img["src"]
                media_link = {"link": "", "type": ""}
                media_tag_array = sub_page_soup.findAll("figure", {"class": "media-player"})
                if len(media_tag_array) == 1:
                    # data-playable holds relaxed JSON with the embed URL
                    link_json = demjson.decode(media_tag_array[0]["data-playable"])
                    link = link_json["settings"]["externalEmbedUrl"]
                    link_type = ""
                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    elif "twitter" in link:
                        link_type = "twitter"
                    elif "bbc" in link:
                        link_type = "bbc_embed"
                    media_link = {"link": link, "type": link_type}
                # BeefObject(title, relevant_actors, content, date, highlights,
                #            data_source, categories, img_title, media_link)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      date_string, highlights, path,
                                      categories, img_link, media_link)
                return beef_obj
            else:
                return None
        else:
            return None
    else:
        return None
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject from it.

    path         -- URL of the article (also stored as the data source; its
                    text is also used to infer categories)
    uReq         -- request opener passed through to access_url
    soup         -- BeautifulSoup constructor (injected, e.g. bs4.BeautifulSoup)
    keyword_list -- if non-empty, the story is kept only when some paragraph
                    contains one of the keywords; if empty, every story passes

    Returns the BeefObject, or None when the page cannot be fetched, has no
    "zn-body__paragraph" divs, or fails the keyword filter.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is not None:
        sub_page_soup = soup(sub_page_html, "html.parser")
        content_tag_array = sub_page_soup.findAll("div", {"class": "zn-body__paragraph"})
        relevant_story = None
        if len(content_tag_array) > 0:
            content_string = ""  # init content string
            for p in content_tag_array:
                content_string += p.text + " "
                # Keyword filter: an empty keyword list auto-approves the story
                if len(keyword_list) > 0:
                    if any(keyword in p.text for keyword in keyword_list):
                        relevant_story = True
                else:
                    relevant_story = True
            # FIX: capture the cleaned text — str is immutable, so the bare
            # call was a no-op. NOTE(review): assumes scrub_content_text
            # returns the scrubbed string — confirm against its definition.
            content_string = globals.scrub_content_text(content_string)
            title = sub_page_soup.findAll("h1", {"class": "pg-headline"})[0].text
            # Article is relevant, build a beef record
            if relevant_story:
                # Classify event and store the classification for later use
                store_event_classification(title, content_string)
                # "Updated ..." line: words 5-7 hold month, day, year; rebuild
                # as day/<month number>/year
                date_tag_array = sub_page_soup.findAll("p", {"class": "update-time"})
                split_date = date_tag_array[0].text.split(" ")
                date_string = (split_date[6].split(",")[0] + "/" +
                               str(globals.get_month_number(split_date[5])) +
                               "/" + split_date[7])
                actors_list = extract_names(content_string)  # extract actors from content_string
                highlights = extract_quotes(content_string)  # extract quotes from content_string
                # Infer category ids from the URL path
                # (2=politics, 4=sport, 6=technology)
                categories = []
                if "politics" in path:
                    categories.append(2)
                if "sport" in path:
                    categories.append(4)
                if "technology" in path:
                    categories.append(6)
                img_tag_array = sub_page_soup.findAll("div", {"class": "el__image--fullwidth"})
                img_link = ""
                # FIX: guard with .get('data-src-large') — subscripting a Tag
                # raises KeyError when the attribute is missing, so the old
                # "truthiness check" could crash instead of skipping.
                if (img_tag_array is not None and len(img_tag_array) > 0
                        and img_tag_array[0].div and img_tag_array[0].div.img
                        and img_tag_array[0].div.img.get('data-src-large')):
                    # Article contains references to images; take the first
                    img_link = img_tag_array[0].div.img['data-src-large']
                media_tag_array = sub_page_soup.findAll("div", {"class": "media__video--thumbnail-wrapper"})
                media_link = {"link": "", "type": ""}
                if (len(media_tag_array) > 0 and media_tag_array[0]
                        and media_tag_array[0].script
                        and media_tag_array[0].script.text):
                    # Inline <script> holds relaxed JSON with the embed URL
                    json_video_data = demjson.decode(media_tag_array[0].script.text)
                    link = json_video_data["embedUrl"]
                    link_type = ""
                    if "youtube" in link:
                        link_type = "youtube"
                    elif "spotify" in link:
                        link_type = "spotify"
                    elif "soundcloud" in link:
                        link_type = "soundcloud"
                    media_link = {"link": link, "type": link_type}
                # BeefObject(title, relevant_actors, content, date, highlights,
                #            data_source, categories, img_title, media_link)
                beef_obj = BeefObject(title, actors_list, content_string,
                                      date_string, highlights, path,
                                      categories, img_link, media_link)
                return beef_obj
            else:
                return None
        else:
            return None
    else:
        # Made explicit for consistency with the sibling scrapers
        return None