def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject when the story is relevant.

    The page body is read from the first ``div.page-content`` element. Text of
    paragraphs without an anchor is concatenated into the article content; a
    paragraph whose anchor wraps an image supplies the image link (the last
    such image wins).

    Args:
        path: Article URL. Passed to ``access_url`` and stored on the
            resulting BeefObject as its data source.
        uReq: Request helper forwarded unchanged to ``access_url``.
        soup: Parser factory called as ``soup(html, "html.parser")``
            (presumably ``BeautifulSoup`` - confirm against the caller).
        keyword_list: Substrings used to filter stories. When empty, every
            story that has at least one paragraph is accepted.

    Returns:
        A BeefObject for a relevant story, or None when the page could not be
        fetched, has no content container, is not relevant, or lacks the
        title/date markup required to build a record.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is None:
        return None

    sub_page_soup = soup(sub_page_html, "html.parser")

    content_tag_array = sub_page_soup.findAll("div", {"class": "page-content"})
    if not content_tag_array:
        return None

    paragraphs = []
    img_link = ""
    relevant_story = False

    for p in content_tag_array[0].findAll("p"):
        text = p.text

        if p.a is None:
            paragraphs.append(text)
        elif p.a.img is not None and p.a.img.get("src") is not None:
            # .get() instead of ["src"]: an <img> without a src attribute
            # must not abort the whole scrape with a KeyError.
            img_link = p.a.img["src"]

        # An empty keyword list approves the story automatically; otherwise a
        # single keyword hit in any paragraph is enough.
        if not keyword_list or any(keyword in text for keyword in keyword_list):
            relevant_story = True

    content_string = "".join(paragraphs)

    # NOTE(review): the return value is discarded. If scrub_content_text
    # returns the cleaned text instead of mutating shared state, this call has
    # no effect on content_string - confirm against the helper.
    globals.scrub_content_text(content_string)

    if not relevant_story:
        return None

    # Validate the markup we depend on before storing any classification, so
    # a malformed page is rejected without leaving side effects behind.
    header_tag = sub_page_soup.find("div", {"class": "page_header"})
    if header_tag is None or header_tag.h2 is None:
        return None
    title = header_tag.h2.text

    date_tag = sub_page_soup.find("span", {"class": "date"})
    if date_tag is None:
        return None
    date_split = date_tag.text.replace("Posted ", "").split("/")
    if len(date_split) < 3:
        return None
    # Swap the first two fields of the page's date (presumably MM/DD/YYYY to
    # DD/MM/YYYY - confirm against the site).
    date_string = date_split[1] + "/" + date_split[0] + "/" + date_split[2]

    # Classify the event and store the classification for later use.
    store_event_classification(title, content_string)

    actors_list = extract_names(content_string)
    highlights = extract_quotes(content_string)
    categories = [1]

    media_link = {"link": "", "type": ""}
    iframe_tags = sub_page_soup.findAll("iframe")
    link = iframe_tags[0].get("src") if iframe_tags else None
    if link is not None:
        # First provider name found in the URL wins; unknown hosts keep "".
        providers = ("youtube", "spotify", "soundcloud", "twitter")
        link_type = next((name for name in providers if name in link), "")
        media_link = {"link": link, "type": link_type}

    # BeefObject(title, relevant_actors, content, date, highlights,
    #            data_source, categories, img_title, media_link)
    return BeefObject(title, actors_list, content_string, date_string,
                      highlights, path, categories, img_link, media_link)
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject when the story is relevant.

    The article is read from the ``<section>`` inside the first
    ``div.article-content-container``; the text of every paragraph in it is
    concatenated into the article content.

    Args:
        path: Article URL. Passed to ``access_url`` and stored on the
            resulting BeefObject as its data source.
        uReq: Request helper forwarded unchanged to ``access_url``.
        soup: Parser factory called as ``soup(html, "html.parser")``
            (presumably ``BeautifulSoup`` - confirm against the caller).
        keyword_list: Substrings used to filter stories. When empty, every
            story that has at least one paragraph is accepted.

    Returns:
        A BeefObject for a relevant story, or None when the page could not be
        fetched, lacks the expected container/title/date markup, or is not
        relevant.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is None:
        return None

    sub_page_soup = soup(sub_page_html, "html.parser")

    body_tag = sub_page_soup.find("div", {"class": "article-content-container"})
    if body_tag is None or body_tag.section is None:
        return None

    paragraphs = []
    relevant_story = False

    for p in body_tag.section.findAll("p"):
        text = p.text
        paragraphs.append(text)

        # No early exit on the first keyword hit: stopping the loop there
        # would truncate the collected article text (and therefore the
        # stored content, names and quotes) at the matching paragraph.
        if not keyword_list or any(keyword in text for keyword in keyword_list):
            relevant_story = True

    content_string = "".join(paragraphs)

    # NOTE(review): the return value is discarded. If scrub_content_text
    # returns the cleaned text instead of mutating shared state, this call has
    # no effect on content_string - confirm against the helper.
    globals.scrub_content_text(content_string)

    if not relevant_story:
        return None

    # Validate the markup we depend on before storing any classification, so
    # a malformed page is rejected without leaving side effects behind.
    if body_tag.h2 is None:
        return None
    title = body_tag.h2.text.strip()

    date_tag = sub_page_soup.find("div", {"class": "editorBlock-date"})
    if date_tag is None:
        return None
    # The code expects "<month> <day>, <year> ..." (format inferred from the
    # splits below - confirm against the site).
    date_split = date_tag.text.replace("\n", "").lstrip().split(", ")
    if len(date_split) < 2:
        return None
    month_day = date_split[0].split(" ")
    if len(month_day) < 2:
        return None
    year = date_split[1].split(" ")[0]
    final_date_string = "/".join([
        month_day[1],
        str(globals.get_month_number(month_day[0])),
        year,
    ])

    # Classify the event and store the classification for later use.
    store_event_classification(title, content_string)

    # Default to an empty link so the BeefObject can still be built when the
    # article has no gallery cover image.
    img_link = ""
    img_tag_array = sub_page_soup.findAll(
        "img", {"class": "article-gallery-cover"})
    if img_tag_array and img_tag_array[0].get("src"):
        img_link = img_tag_array[0]["src"]

    actors_list = extract_names(content_string)
    highlights = extract_quotes(content_string)
    categories = [1]

    media_link = {"link": "", "type": ""}
    iframe_tags = body_tag.findAll("iframe")
    link = iframe_tags[0].get("src") if iframe_tags else None
    if link is not None:
        # First provider name found in the URL wins; anything else is
        # treated as a generic video embed.
        providers = ("youtube", "spotify", "soundcloud", "twitter")
        link_type = next((name for name in providers if name in link),
                         "video_embed")
        media_link = {"link": link, "type": link_type}

    # BeefObject(title, relevant_actors, content, date, highlights,
    #            data_source, categories, img_title, media_link)
    return BeefObject(title, actors_list, content_string, final_date_string,
                      highlights, path, categories, img_link, media_link)
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject when the story is relevant.

    Every ``<p>`` on the page is inspected. A paragraph contributes to the
    article content when it has no anchor (or its first anchor's href contains
    "bossip") and its text mentions neither "Bossip Newsletter" nor "WENN".

    Args:
        path: Article URL. Passed to ``access_url`` and stored on the
            resulting BeefObject as its data source.
        uReq: Request helper forwarded unchanged to ``access_url``.
        soup: Parser factory called as ``soup(html, "html.parser")``
            (presumably ``BeautifulSoup`` - confirm against the caller).
        keyword_list: Substrings used to filter stories. When empty, every
            story that has at least one paragraph is accepted.

    Returns:
        A BeefObject for a relevant story, or None when the page could not be
        fetched, is not relevant, has no usable title/date markup, or has no
        ``size-large``/``size-full`` image with a src.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is None:
        return None

    sub_page_soup = soup(sub_page_html, "html.parser")

    paragraphs = []
    relevant_story = False

    for p in sub_page_soup.findAll("p"):
        text = p.text

        # .get() instead of ["href"]: anchors without an href must not abort
        # the scrape with a KeyError.
        anchor_ok = p.a is None or "bossip" in p.a.get("href", "")
        if anchor_ok and "Bossip Newsletter" not in text and "WENN" not in text:
            paragraphs.append(text)

        # An empty keyword list approves the story automatically; otherwise a
        # single keyword hit in any paragraph is enough.
        if not keyword_list or any(keyword in text for keyword in keyword_list):
            relevant_story = True

    content_string = "".join(paragraphs)

    # NOTE(review): the return value is discarded. If scrub_content_text
    # returns the cleaned text instead of mutating shared state, this call has
    # no effect on content_string - confirm against the helper.
    globals.scrub_content_text(content_string)

    if not relevant_story:
        return None

    # Without an <h1> there is no title to classify or store; previously this
    # path failed with an unbound-variable error.
    title_tag = sub_page_soup.find("h1")
    if title_tag is None or not title_tag.text:
        return None
    title = title_tag.text.split("[")[0]

    time_tag = sub_page_soup.find("time", {"class": "date"})
    raw_date = time_tag.get("datetime") if time_tag is not None else None
    if raw_date is None:
        return None
    # The code expects "<year>-<month>-<day> ..." (format inferred from the
    # splits below - confirm against the site).
    date_split = raw_date.lstrip().split("-")
    if len(date_split) < 3:
        return None
    final_date_string = "/".join([
        date_split[2].split(" ")[0],
        date_split[1],
        date_split[0],
    ])

    # Classify the event and store the classification for later use.
    store_event_classification(title, content_string)

    # NOTE(review): as in the original, the classification above is stored
    # even when the article is dropped here for lacking an image - confirm
    # that this ordering is intended.
    img_tag_array = sub_page_soup.findAll(
        "img", {"class": ["size-large", "size-full"]})
    if not img_tag_array or not img_tag_array[0].get("src"):
        return None
    img_link = img_tag_array[0]["src"]

    actors_list = extract_names(content_string)
    highlights = extract_quotes(content_string)
    categories = [1]

    media_link = {"link": "", "type": ""}
    iframe_tags = sub_page_soup.findAll("iframe")
    link = iframe_tags[0].get("src") if iframe_tags else None
    if link is not None:
        # First provider name found in the URL wins; anything else is
        # treated as a generic video embed.
        providers = ("youtube", "spotify", "soundcloud", "twitter")
        link_type = next((name for name in providers if name in link),
                         "video_embed")
        media_link = {"link": link, "type": link_type}

    # BeefObject(title, relevant_actors, content, date, highlights,
    #            data_source, categories, img_title, media_link)
    return BeefObject(title, actors_list, content_string, final_date_string,
                      highlights, path, categories, img_link, media_link)
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject when the story is relevant.

    The article is read from the element with id ``OutputBody``. Paragraphs
    containing the site's "Do YOU want to write for GiveMeSport?" or "Have
    your say in the comments section below." boilerplate are skipped entirely:
    they contribute neither content nor keyword matches.

    Args:
        path: Article URL. Passed to ``access_url`` and stored on the
            resulting BeefObject as its data source.
        uReq: Request helper forwarded unchanged to ``access_url``.
        soup: Parser factory called as ``soup(html, "html.parser")``
            (presumably ``BeautifulSoup`` - confirm against the caller).
        keyword_list: Substrings used to filter stories. When empty, every
            story with at least one non-boilerplate paragraph is accepted.

    Returns:
        A BeefObject for a relevant story, or None when the page could not be
        fetched, lacks the expected body/title/date markup, or is not
        relevant.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is None:
        return None

    sub_page_soup = soup(sub_page_html, "html.parser")

    body_tag = sub_page_soup.find("div", {"id": "OutputBody"})
    if body_tag is None:
        return None

    boilerplate = (
        "Do YOU want to write for GiveMeSport?",
        "Have your say in the comments section below.",
    )
    paragraphs = []
    relevant_story = False

    for p in body_tag.findAll("p"):
        text = p.text
        if any(phrase in text for phrase in boilerplate):
            continue

        paragraphs.append(text)

        # An empty keyword list approves the story automatically; otherwise a
        # single keyword hit in any kept paragraph is enough.
        if not keyword_list or any(keyword in text for keyword in keyword_list):
            relevant_story = True

    content_string = "".join(paragraphs)

    # NOTE(review): the return value is discarded. If scrub_content_text
    # returns the cleaned text instead of mutating shared state, this call has
    # no effect on content_string - confirm against the helper.
    globals.scrub_content_text(content_string)

    if not relevant_story:
        return None

    # Validate the markup we depend on before storing any classification, so
    # a malformed page is rejected without leaving side effects behind.
    title_tag = sub_page_soup.find("h1", {"class": "gms-article-title"})
    if title_tag is None:
        return None
    title = title_tag.text

    date_holder = sub_page_soup.find("p", {"class": "gms-article-data"})
    time_tag = None
    if date_holder is not None and date_holder.span is not None:
        time_tag = date_holder.span.time
    raw_date = time_tag.get("datetime") if time_tag is not None else None
    if raw_date is None:
        return None
    # The code expects "<year>-<month>-<day>T..." (format inferred from the
    # splits below - confirm against the site).
    date_split = raw_date.lstrip().split("-")
    if len(date_split) < 3:
        return None
    final_date_string = "/".join([
        date_split[2].split("T")[0],
        date_split[1],
        date_split[0],
    ])

    # Classify the event and store the classification for later use.
    store_event_classification(title, content_string)

    # A missing lead image is not fatal; fall back to an empty link.
    img_tag = sub_page_soup.find("img", {"id": "EdImg"})
    img_link = img_tag.get("src", "") if img_tag is not None else ""

    actors_list = extract_names(content_string)
    highlights = extract_quotes(content_string)
    categories = [4]

    media_link = {"link": "", "type": ""}
    iframe_tags = body_tag.findAll("iframe")
    # This site keeps the embed URL in "data-url" rather than "src".
    link = iframe_tags[0].get("data-url") if iframe_tags else None
    if link is not None:
        # First provider name found in the URL wins; anything else is
        # treated as a generic video embed.
        providers = ("youtube", "spotify", "soundcloud", "twitter")
        link_type = next((name for name in providers if name in link),
                         "video_embed")
        media_link = {"link": link, "type": link_type}

    # BeefObject(title, relevant_actors, content, date, highlights,
    #            data_source, categories, img_title, media_link)
    return BeefObject(title, actors_list, content_string, final_date_string,
                      highlights, path, categories, img_link, media_link)
# Example no. 5
# 0
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject when the story is relevant.

    The article is read from the first ``div.story-body__inner``; the text of
    every paragraph in it is concatenated into the article content. Date and
    category come from the page's ``li.mini-info-list__item`` entries.

    Args:
        path: Article URL. Passed to ``access_url`` and stored on the
            resulting BeefObject as its data source.
        uReq: Request helper forwarded unchanged to ``access_url``.
        soup: Parser factory called as ``soup(html, "html.parser")``
            (presumably ``BeautifulSoup`` - confirm against the caller).
        keyword_list: Substrings used to filter stories. When empty, every
            story that has at least one paragraph is accepted.

    Returns:
        A BeefObject for a relevant story, or None when the page could not be
        fetched, lacks the expected body/title/date markup, or is not
        relevant.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is None:
        return None

    sub_page_soup = soup(sub_page_html, "html.parser")

    content_tag_array = sub_page_soup.findAll(
        "div", {"class": "story-body__inner"})
    if not content_tag_array:
        return None

    paragraphs = []
    relevant_story = False

    for p in content_tag_array[0].findAll("p"):
        text = p.text
        paragraphs.append(text)

        # An empty keyword list approves the story automatically; otherwise a
        # single keyword hit in any paragraph is enough.
        if not keyword_list or any(keyword in text for keyword in keyword_list):
            relevant_story = True

    content_string = "".join(paragraphs)

    # NOTE(review): the return value is discarded. If scrub_content_text
    # returns the cleaned text instead of mutating shared state, this call has
    # no effect on content_string - confirm against the helper.
    globals.scrub_content_text(content_string)

    if not relevant_story:
        return None

    title_tag = sub_page_soup.find("h1", {"class": "story-body__h1"})
    if title_tag is None:
        return None
    title = title_tag.text

    # NOTE: unlike sibling scrapers, this one does not call
    # store_event_classification (the call was disabled in the original).

    info_items = sub_page_soup.findAll("li", {"class": "mini-info-list__item"})
    if not info_items or info_items[0].div is None:
        return None
    raw_date = info_items[0].div.get("data-datetime")
    if raw_date is None:
        return None
    # The code expects "<day> <month name> <year>" (format inferred from the
    # split below - confirm against the site).
    date_parts = raw_date.split(" ")
    if len(date_parts) < 3:
        return None
    date_string = "/".join([
        str(date_parts[0]),
        str(globals.get_month_number(date_parts[1])),
        str(date_parts[2]),
    ])

    actors_list = extract_names(content_string)
    highlights = extract_quotes(content_string)

    # Map the section label in the second info item onto category ids.
    categories = []
    if len(info_items) > 1 and info_items[1].a is not None:
        category = info_items[1].a.text.lower()
        if "politics" in category:
            categories.append(2)
        if "sport" in category:
            categories.append(4)
        if "technology" in category:
            categories.append(6)

    img_link = ""
    img_tag_array = sub_page_soup.findAll(
        "span", {"class": "image-and-copyright-container"})
    if img_tag_array:
        first_image = img_tag_array[0]
        # A nested <div> carries the URL in data-src; otherwise use a plain
        # <img> src.
        if first_image.div:
            img_link = first_image.div.get("data-src", "")
        elif first_image.img:
            img_link = first_image.img.get("src", "")

    media_link = {"link": "", "type": ""}
    media_tag_array = sub_page_soup.findAll(
        "figure", {"class": "media-player"})
    # Only pages with exactly one media player are given a media link.
    if len(media_tag_array) == 1 and media_tag_array[0].get("data-playable"):
        link_json = demjson.decode(media_tag_array[0]["data-playable"])
        settings = link_json.get("settings") or {}
        link = settings.get("externalEmbedUrl")
        if link is not None:
            # First provider name found in the URL wins; the "bbc" match maps
            # to "bbc_embed", unknown hosts keep "".
            link_type = ""
            providers = (
                ("youtube", "youtube"),
                ("spotify", "spotify"),
                ("soundcloud", "soundcloud"),
                ("twitter", "twitter"),
                ("bbc", "bbc_embed"),
            )
            for needle, label in providers:
                if needle in link:
                    link_type = label
                    break
            media_link = {"link": link, "type": link_type}

    # BeefObject(title, relevant_actors, content, date, highlights,
    #            data_source, categories, img_title, media_link)
    return BeefObject(title, actors_list, content_string, date_string,
                      highlights, path, categories, img_link, media_link)
def scrape_article(path, uReq, soup, keyword_list):
    """Scrape one article page and build a BeefObject when the story is relevant.

    The article content is the text of every ``div.zn-body__paragraph`` on the
    page, each followed by a single space.

    Args:
        path: Article URL. Passed to ``access_url``, stored on the resulting
            BeefObject as its data source, and searched for "politics",
            "sport" and "technology" to choose category ids.
        uReq: Request helper forwarded unchanged to ``access_url``.
        soup: Parser factory called as ``soup(html, "html.parser")``
            (presumably ``BeautifulSoup`` - confirm against the caller).
        keyword_list: Substrings used to filter stories. When empty, every
            story that has at least one body paragraph is accepted.

    Returns:
        A BeefObject for a relevant story, or None when the page could not be
        fetched, has no body paragraphs, lacks the expected title/date markup,
        or is not relevant.
    """
    sub_page_html = access_url(path, uReq)
    if sub_page_html is None:
        return None

    sub_page_soup = soup(sub_page_html, "html.parser")

    content_tag_array = sub_page_soup.findAll(
        "div", {"class": "zn-body__paragraph"})
    if not content_tag_array:
        return None

    paragraphs = []
    # Previously the "approve automatically" branch was attached to the for
    # loop (for/else) rather than to the keyword check, so it always ran and
    # every story was approved regardless of keywords. An empty keyword list
    # still approves; a non-empty one now requires an actual match.
    relevant_story = not keyword_list

    for p in content_tag_array:
        text = p.text
        paragraphs.append(text + " ")

        if keyword_list and any(keyword in text for keyword in keyword_list):
            relevant_story = True

    content_string = "".join(paragraphs)

    # NOTE(review): the return value is discarded. If scrub_content_text
    # returns the cleaned text instead of mutating shared state, this call has
    # no effect on content_string - confirm against the helper.
    globals.scrub_content_text(content_string)

    if not relevant_story:
        return None

    # Validate the markup we depend on before storing any classification, so
    # a malformed page is rejected without leaving side effects behind.
    title_tag = sub_page_soup.find("h1", {"class": "pg-headline"})
    if title_tag is None:
        return None
    title = title_tag.text

    date_tag = sub_page_soup.find("p", {"class": "update-time"})
    if date_tag is None:
        return None
    # Month name, "<day>," and year are read from the 6th, 7th and 8th
    # space-separated tokens of the update-time text.
    split_date = date_tag.text.split(" ")
    if len(split_date) < 8:
        return None
    date_string = "/".join([
        split_date[6].split(",")[0],
        str(globals.get_month_number(split_date[5])),
        split_date[7],
    ])

    # Classify the event and store the classification for later use.
    store_event_classification(title, content_string)

    actors_list = extract_names(content_string)
    highlights = extract_quotes(content_string)

    categories = []
    if "politics" in path:
        categories.append(2)
    if "sport" in path:
        categories.append(4)
    if "technology" in path:
        categories.append(6)

    img_link = ""
    img_tag_array = sub_page_soup.findAll(
        "div", {"class": "el__image--fullwidth"})
    if img_tag_array and img_tag_array[0].div and img_tag_array[0].div.img:
        # .get() instead of ['data-src-large']: an <img> without that
        # attribute leaves the link empty rather than raising KeyError.
        img_link = img_tag_array[0].div.img.get("data-src-large") or ""

    media_link = {"link": "", "type": ""}
    media_tag_array = sub_page_soup.findAll(
        "div", {"class": "media__video--thumbnail-wrapper"})
    if (media_tag_array and media_tag_array[0].script
            and media_tag_array[0].script.text):
        json_video_data = demjson.decode(media_tag_array[0].script.text)
        link = json_video_data.get("embedUrl")
        if link is not None:
            # First provider name found in the URL wins; unknown hosts keep "".
            providers = ("youtube", "spotify", "soundcloud")
            link_type = next((name for name in providers if name in link), "")
            media_link = {"link": link, "type": link_type}

    # BeefObject(title, relevant_actors, content, date, highlights,
    #            data_source, categories, img_title, media_link)
    return BeefObject(title, actors_list, content_string, date_string,
                      highlights, path, categories, img_link, media_link)