Code Example #1
            # optionally let the driver wait in between clicks
            # driver.implicitly_wait(2)

    except Exception as e:
        print(
            f'Exception {e}\noccurred when loading more articles in the politics section of Kathimerini.'
        )

        continue

    news_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # pull the href out of every link inside the article thumbnail wrappers
    article_links = [
        link.split('"')[0] for link in str(
            news_soup.find_all(
                'div', class_='article_thumbnail_wrapper')).split('href="')[1:]
    ]

    return article_links


if __name__ == "__main__":

    article_links = kathimerini_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/kathimerini_links.csv', index=False)

    save_articles_in_parts(links_df,
                           article_parser=kathimerini_article_parser,
                           media_name='kathimerini')
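
The Kathimerini fragment above (and the ProtoThema one further below) drives a Selenium browser through a driver object that is created before the code shown here. A minimal sketch of that setup, assuming a Chrome WebDriver and an illustrative politics-section URL rather than the project's actual values:

# Assumed Selenium setup for the browser-driven scrapers; the browser choice,
# the wait time and the start URL are illustrative, not taken from the original script.
from selenium import webdriver

driver = webdriver.Chrome()
driver.implicitly_wait(2)  # matches the commented-out wait in the fragment above
driver.get('https://www.kathimerini.gr/politics/')  # assumed politics-section URL
screen_height = driver.execute_script('return window.screen.height;')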
Code Example #2
    # search a predefined range of pages to collect links from up to a year back
    for page_id in tqdm(range(360), total=360):

        try:

            news_link = 'https://kontranews.gr/politiki?page=' + str(page_id)
            response = get(news_link)

            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                article_links += [
                    'https://kontranews.gr/' + str(link).split('href=')[1].split('"')[1]
                    for link in news_soup.find_all('div', class_='post-link hidden')
                ]
            
            else:
                break

        except Exception as e:
            print(e)

    return article_links


if __name__ == "__main__":

    article_links = kontra_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/kontra_links.csv', index=False)

    save_articles_in_parts(links_df, article_parser=kontra_article_parser, media_name='kontra')
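
The requests-based collectors (Kontra above, plus Skai, To Vima, Ethnos, Ta Nea and Efsyn below) all rely on the same handful of imports, which the fragments do not show. They are assumed to be roughly the following:

# Imports assumed by the requests-based link collectors; the exact import style
# in the original scripts is a guess, but these are the obvious candidates.
import pandas as pd
from bs4 import BeautifulSoup
from requests import get   # so that get(url) returns a requests.Response
from tqdm import tqdm      # progress bar over the page range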
Code Example #3
    try:
        
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
        i += 1
        time.sleep(scroll_pause_time)
        # update the scroll height after each scroll, since it can change as more content loads
        # scroll_height = driver.execute_script("return document.body.scrollHeight;")

    # practically, keep scrolling until no more scrolling is possible
    except Exception as e:
        print(f'Exception {e}\noccurred when loading more articles in the politics section of ProtoThema.')
        continue
    
    # build a BeautifulSoup object from the articles loaded on the page
    news_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # keep the href of each 'update_well' span whose title attribute dates it in 2020
    article_links = [
        str(link).split('href="')[1].split('">')[0]
        for link in news_soup.find_all('span', class_='update_well')
        if '2020' in str(link).split('" title="')[1].split(',')[0] and 'href' in str(link)
    ]

    return article_links


if __name__ == "__main__":

    article_links = protothema_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/protothema_links.csv', index=False)

    save_articles_in_parts(links_df, article_parser=protothema_article_parser, media_name='protothema')
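
The ProtoThema fragment begins inside its scroll loop, so i, screen_height and scroll_pause_time are defined by code that is not shown. A sketch of the assumed preamble, with the pause length as an illustrative value:

# Assumed preamble of the ProtoThema infinite-scroll loop; the pause value is
# illustrative, not taken from the original script.
import time

scroll_pause_time = 2  # assumed pause so newly loaded articles have time to render
screen_height = driver.execute_script('return window.screen.height;')
i = 1

while True:
    # the try/except block shown above forms the loop body: scroll one screen
    # height further, increment i, sleep, and handle the exception raised once
    # no more content can be loaded
    ...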
    
Code Example #4
        try:
            # the link points to the site's search page for the URL-encoded term 'ΠΟΛΙΤΙΚΗ' (politics)
            news_link = 'https://www.skai.gr/s/%CE%A0%CE%9F%CE%9B%CE%99%CE%A4%CE%99%CE%9A%CE%97?page=' + str(page_id)
            response = get(news_link)

            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                
                # not all returned articles are about politics; keep only those whose URL contains 'politics'
                article_links += [link['href'] for link in news_soup.find_all('a', class_='title mainLink', href=True)
                                    if 'politics' in link['href']]
            
            else:
                break

        except Exception as e:
            print(e)

    return article_links


if __name__ == "__main__":

    article_links = skai_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/skai_links.csv', index=False)

    save_articles_in_parts(links_df, article_parser=skai_article_parser, media_name='skai')
    
Code Example #5
    article_links = []

    # hard-coded page range; we only want links from roughly the past year
    for page_id in tqdm(range(360), total=360):

        try:
            news_link = 'https://www.tovima.gr/category/politics/page/' + str(page_id)
            response = get(news_link)

            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                article_links += [
                    link['href'] for link in news_soup.find_all(
                        'a', class_='zonabold twenty black-c article-main', href=True)
                ]
            
            else:
                break

        except Exception as e:
            print(e)

    return article_links


if __name__ == "__main__":

    article_links = tobhma_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/tobhma_links.csv', index=False)

    save_articles_in_parts(links_df, article_parser=tobhma_article_parser, media_name='tobhma')
Code Example #6
            response = get(ethnos_news_by_page)
            if response.status_code == 503:
                # the page is broken: remember it and move on to the next one
                print(f'Broken page: {page_id}')
                broken_pages += [page_id]
                continue
            news_soup = BeautifulSoup(response.text, 'html.parser')
            ethnos_links = [
                'https://www.ethnos.gr' + link['href'] for link in
                news_soup.find_all('a', class_='full-link', href=True)[1:]
            ]

            if ethnos_links != []:
                article_links += ethnos_links
            else:
                break
        except Exception as e:
            print(e)

    return article_links, broken_pages


if __name__ == "__main__":
    article_links, _ = ethnos_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/ethnos_links.csv', index=False)

    save_articles_in_parts(links_df,
                           article_parser=ethnos_article_parser,
                           media_name='ethnos')
Code Example #7
            try:
                response = get(tanea_news_link)
                if response.status_code == 200:
                    news_soup = BeautifulSoup(response.text, 'html.parser')
                    article_links += [
                        link['href'] for link in news_soup.find_all(
                            'a',
                            class_='article-title-18 dark-c firamedium nodecor',
                            href=True)
                    ]

                    page_id += 1

                else:
                    break

            except Exception as e:
                print(e)

    return article_links


if __name__ == "__main__":
    article_links = tanea_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/tanea_links.csv', index=False)

    save_articles_in_parts(links_df,
                           article_parser=tanea_article_parser,
                           media_name='tanea')
Code Example #8
        try:
            efsyn_news_link = 'https://www.efsyn.gr/politiki?page=' + str(page_id)
            response = get(efsyn_news_link)

            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                # keep only the first 12 matching links on each page
                article_links += [
                    'https://www.efsyn.gr' + link['href'] for link in
                    news_soup.find_all('a', class_='full-link', href=True)[:12]
                ]

            else:
                break

        except Exception as e:
            print(e)

    return article_links


if __name__ == "__main__":

    article_links = efsyn_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/efsyn_links.csv', index=False)

    save_articles_in_parts(links_df,
                           article_parser=efsyn_article_parser,
                           media_name='efsyn')
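
Every script ends by handing its link DataFrame to save_articles_in_parts together with a per-site parser; both helpers are defined elsewhere in the project. A minimal sketch of the interface they appear to share, with the chunk size, the output naming and a hypothetical generic parser as assumptions:

# Assumed shape of the shared helpers; the chunk size, the file naming and the
# parser's return value are illustrative guesses, not the project's actual code.
import pandas as pd
from bs4 import BeautifulSoup
from requests import get


def save_articles_in_parts(links_df, article_parser, media_name, part_size=500):
    """Parse articles in chunks and write each chunk to its own CSV."""
    links = links_df[0].tolist()
    for part, start in enumerate(range(0, len(links), part_size)):
        articles = [article_parser(url) for url in links[start:start + part_size]]
        pd.DataFrame(articles).to_csv(
            f'data/{media_name}_articles_part{part}.csv', index=False)


def example_article_parser(url):
    """Hypothetical per-site parser: fetch one article and return its fields."""
    soup = BeautifulSoup(get(url).text, 'html.parser')
    title = soup.find('h1')
    return {
        'url': url,
        'title': title.get_text(strip=True) if title else '',
        'text': ' '.join(p.get_text(strip=True) for p in soup.find_all('p')),
    }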