Code Example #1
def scrape_bbc_home(uReq, soup, keyword_list):
    
    logging = False #set to True to print debug messages
    
    base_url = 'http://www.bbc.co.uk' #url to scrape
    init_path = "/news" #base url extension
    
    page_html = access_url(base_url + init_path, uReq) #make request for page
    
    if page_html is not None:
    
        page_soup = soup(page_html, "html.parser") #convert the html to a soup object
        tag_array = page_soup.findAll("div", {"class" : "gs-c-promo"}) #find tags in the soup object

        if len(tag_array) > 0: #only execute if tags have been found

            beef_objects = []
            
            #load saved urls
            saved_urls = get_saved_urls(base_url)
            
            percent_per_scrape = 100/len(tag_array)

            for x in range(0, len(tag_array)): #for each tag

                print(str(round(x * percent_per_scrape)) + "% complete.")

                if tag_array[x].a and tag_array[x].a.get("href"): #ensure the element has an anchor tag with an href

                    href = tag_array[x].a["href"]

                    if href.startswith(("http://", "https://")): #check if the href is an absolute url or a relative path
                        sub_page_url = href

                    else:
                        sub_page_url = base_url + href

                    path_split_1 = sub_page_url.split("/") #split path by /
                    path_split_2 = path_split_1[-1].split("-") #get final field in path_split_1 and split by -
                    
                    if path_split_2[0] != "blogs": #ensure we are not scraping a blog page
                        
                        if any(url_obj["url"] == sub_page_url for url_obj in saved_urls): #check through pre loaded urls to ensure url has not already been scraped
                            if logging:
                                print("preloaded url found, aborting scrape.")
                        
                        else:
                            if logging:
                                print("preloaded url not found, initiating scrape.")
                                
                            #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events, 
                            save_url(base_url, sub_page_url)
                            
                            beef_object = scrape_article(sub_page_url, uReq, soup, keyword_list) #scrape this article
                            
                            if beef_object is not None:
                                beef_objects.append(beef_object)
                                #beef_object.print_beef()

            return beef_objects

        return [] #no matching tags were found on the page
    else:
        return []
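
All four examples in this section share one calling convention: the HTTP opener (uReq) and the BeautifulSoup constructor (soup) are supplied by the caller, and the helpers access_url, get_saved_urls, save_url and scrape_article are assumed to be defined elsewhere in the same module. A minimal driver sketch, assuming urllib and bs4 are the libraries behind those two parameters (the keyword list is made up):

#hypothetical driver; access_url, get_saved_urls, save_url and scrape_article
#must already be defined in this module for the call below to run
from urllib.request import urlopen as uReq #HTTP opener passed through as uReq
from bs4 import BeautifulSoup as soup #parser constructor passed through as soup

keyword_list = ["beef", "feud", "diss"] #made-up keywords of interest

beef_objects = scrape_bbc_home(uReq, soup, keyword_list)
print(str(len(beef_objects)) + " candidate articles scraped.")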
Code Example #2
def scrape_hiphopdx_home(uReq, soup, keyword_list):

    logging = False #set to True to print debug messages

    base_url = 'https://hiphopdx.com'  #url to scrape
    initial_suffix = "/news"

    raw_page_html = access_url(base_url + initial_suffix,
                               uReq)  #make request for page

    if raw_page_html is not None:

        page_soup = soup(raw_page_html,
                         "html.parser")  #convert the html to a soup object

        news_tag = page_soup.find("div", {"class": "wire"})  #find the news wire container in the soup object

        beef_objects = []

        #load saved urls
        saved_urls = get_saved_urls(base_url)

        anchor_tags = news_tag.findAll("a") if news_tag is not None else []

        if len(anchor_tags) > 0:  #only execute if anchor tags have been found

            percent_per_scrape = 100 / len(anchor_tags)

            for x, a in enumerate(anchor_tags):

                print(str(round(x * percent_per_scrape)) + "% complete.")

                if a and a["href"] and a["class"][0] != "next":

                    sub_page_url = base_url + a["href"]

                    if any(
                            url_obj["url"] == sub_page_url
                            for url_obj in saved_urls
                    ):  #check through pre loaded urls to ensure url has not already been scraped
                        if logging:
                            print("preloaded url found, aborting scrape.")

                    else:
                        if logging:
                            print(
                                "preloaded url not found, initiating scrape.")

                        #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events,
                        save_url(base_url, sub_page_url)

                        beef_object = scrape_article(sub_page_url, uReq, soup,
                                                     keyword_list)

                        if beef_object is not None:
                            beef_objects.append(beef_object)

        return beef_objects
    else:
        return []
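
A note on the attribute filter fixed above: BeautifulSoup expects attrs as a dict mapping attribute names to values (or the class_ keyword argument); a set literal like {"class", "wire"} is not a valid attribute filter. A self-contained illustration with made-up markup:

from bs4 import BeautifulSoup

html = '<div class="wire"><a href="/news/example-story">story</a></div>' #hypothetical markup
page_soup = BeautifulSoup(html, "html.parser")

#both calls below locate the same element by its css class
wire_div = page_soup.find("div", {"class": "wire"})
wire_div_alt = page_soup.find("div", class_="wire")
print(wire_div is wire_div_alt) #True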
Code Example #3
import re #needed to extract the embedded content model from the script tag
import demjson #third-party decoder for JS-style object literals (assumed installed)


def scrape_cnn_home(uReq, soup, keyword_list):

    logging = False #set to True to print debug messages

    base_url = 'http://edition.cnn.com'  #url to scrape

    raw_page_html = access_url(base_url, uReq)  #make request for page

    if raw_page_html is not None:

        if logging:
            print(raw_page_html) #dump the raw html for debugging

        page_soup = soup(raw_page_html,
                         "html.parser")  #convert the html to a soup object
        tag_array = page_soup.findAll(
            "script")  #, text=pattern) #find tags in the soup object

        if len(tag_array) > 0:  #only execute if tags have been found

            if tag_array[10].text:  #ensure the target script element has text content (index 10 is hard coded for this page layout)

                beef_objects = []

                #load saved urls
                saved_urls = get_saved_urls(base_url)

                script_text = tag_array[10].text
                if logging:
                    print(script_text) #dump the script contents for debugging
                result = re.search('CNN.contentModel = (.*);', script_text)  #extract the embedded JS object literal
                if result:  #only proceed if the pattern matched
                    script_json = demjson.decode(result.group(1))  #parse the JS object into a python dict

                    article_list = script_json['siblings']['articleList']

                    percent_per_scrape = 100 / len(article_list)

                    for x in range(0, len(article_list)):  #for each article entry

                        print(
                            str(round(x * percent_per_scrape)) + "% complete.")

                        sub_page_url = base_url + article_list[x]['uri']

                        if any(
                                url_obj["url"] == sub_page_url
                                for url_obj in saved_urls
                        ):  #check through pre loaded urls to ensure url has not already been scraped
                            if logging:
                                print("preloaded url found, aborting scrape.")

                        else:
                            if logging:
                                print(
                                    "preloaded url not found, initiating scrape."
                                )

                            #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events,
                            save_url(base_url, sub_page_url)

                            beef_object = scrape_article(
                                sub_page_url, uReq, soup, keyword_list)

                            if beef_object is not None:
                                beef_objects.append(beef_object)

                    return beef_objects
                else:
                    return []
        return [] #no usable embedded content model was found
    else:
        return []
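
The CNN example works by pulling a JavaScript object literal (CNN.contentModel = {...};) out of an inline script tag with a regular expression and decoding it with demjson, which tolerates JS-style syntax (unquoted keys, single quotes) that the standard json module rejects. A stripped-down sketch of that technique on a made-up script body:

import re
import demjson

#hypothetical script contents standing in for the real CNN home page markup
script_text = "CNN.contentModel = {siblings: {articleList: [{uri: '/2020/01/01/example'}]}};"

result = re.search('CNN.contentModel = (.*);', script_text)
if result:
    content_model = demjson.decode(result.group(1)) #demjson accepts unquoted keys and single-quoted strings
    for article in content_model['siblings']['articleList']:
        print(article['uri'])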
Code Example #4
def scrape_give_me_sport_home(uReq, soup, keyword_list):

    logging = False #set to True to print debug messages

    base_url = 'http://www.givemesport.com/'  #url to scrape

    raw_page_html = access_url(base_url, uReq)  #make request for page

    if raw_page_html is not None:

        page_soup = soup(raw_page_html,
                         "html.parser")  #convert the html to a soup object

        news_tag_array = page_soup.find("section", {"id": "gms-trending"})

        if news_tag_array:
            news_tag_array = news_tag_array.findAll(
                "article", {"class": "gms-feature"})  #find the featured article tags in the trending section

            beef_objects = []

            #load saved urls
            saved_urls = get_saved_urls(base_url)

            if len(news_tag_array) > 0:  #only execute if tags have been found

                percent_per_scrape = 100 / len(news_tag_array)

                for x, news_tag in enumerate(news_tag_array):

                    print(str(round(x * percent_per_scrape)) + "% complete.")

                    if news_tag and news_tag.a and news_tag.a.get("href"):

                        sub_page_url = base_url + news_tag.a["href"]

                        if any(
                                url_obj["url"] == sub_page_url
                                for url_obj in saved_urls
                        ):  #check through pre loaded urls to ensure url has not already been scraped
                            if logging:
                                print("preloaded url found, aborting scrape.")

                        else:
                            if logging:
                                print(
                                    "preloaded url not found, initiating scrape."
                                )

                            #url must be saved under these conditions: 1. it has not been previously scraped, 2. it may not be relevant to beef and therefore may not be added to selected events,
                            save_url(base_url, sub_page_url)

                            beef_object = scrape_article(
                                sub_page_url, uReq, soup, keyword_list)
                            if beef_object is not None:
                                beef_objects.append(beef_object)

            return beef_objects
        else:
            return []
    else:
        return []
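
Every scraper above deduplicates with the same check, which assumes get_saved_urls returns a list of dicts each carrying a "url" key. An equivalent standalone version of that check, with made-up data:

#standalone version of the duplicate-url check used in each scraper
saved_urls = [{"url": "http://www.bbc.co.uk/news/world-12345678"}] #hypothetical saved entries
sub_page_url = "http://www.bbc.co.uk/news/world-12345678"

already_scraped = any(url_obj["url"] == sub_page_url for url_obj in saved_urls)
print(already_scraped) #True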