Example #1
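All of the snippets below lean on scaffolding that the examples themselves never show: a module-level Selenium driver, pandas/numpy, and a handful of small helpers (sleep_r, csv_dir_common, full_driver, ornone; topics_links is site-specific and not sketched here). They also use the Selenium 3 element API (find_element_by_*) and the pre-2.0 pandas DataFrame.append, so older library versions are assumed. A minimal sketch of that assumed scaffolding, so the functions can be run roughly as written:

# Assumed shared scaffolding (a sketch, not the original helpers).
# Selenium < 4 (find_element_by_* API) and pandas < 2.0 (DataFrame.append) are assumed.
import os
import re
import time
import random
import pickle

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()  # the examples share one module-level driver


def full_driver():
    """Assumed helper: return a fresh browser instance."""
    return webdriver.Firefox()


def sleep_r(size):
    """Assumed helper: randomised pause; 's', 'm', 'l' mean short/medium/long."""
    bounds = {'s': (1, 3), 'm': (3, 7), 'l': (7, 15)}[size]
    time.sleep(random.uniform(*bounds))


def csv_dir_common():
    """Assumed helper: directory the scraped CSV files are written to."""
    return './csv/'


def ornone(value):
    """Assumed helper: return the value unchanged, or None if it is empty."""
    return value if value else None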
def article_links(arts):
    sleep_r('m')
    articles_all = driver.find_element_by_class_name(
        'all-feed').find_elements_by_tag_name('article')
    if not articles_all:
        driver.refresh()
        sleep_r('l')
        articles_all = driver.find_element_by_class_name(
            'all-feed').find_elements_by_tag_name('article')
    for article in articles_all:
        article_link = article.find_element_by_tag_name('a').get_attribute(
            'href')
        article_title = article.find_element_by_tag_name('a').get_attribute(
            'title')
        try:
            article_abstract = article.find_element_by_tag_name('h3').text
        except NoSuchElementException:
            article_abstract = "NaN"
        if 'the-only-hawaiian-shirt-you-should-wear-this-summer' in article_link:
            continue
        arts.append({
            'link': article_link,
            'title_outside': article_title,
            'abstract_outside': article_abstract
        })
    return arts
def download_articles(all_articles, time_border, df_all, csv_dir, i):
    any_in = False
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])

        link_dict = {}

        driver.get(art['link'])
        sleep_r('m')
        link_dict['link'] = art['link']
        link_dict['title'] = driver.find_element_by_xpath(
            '//h1[@itemprop="name"]').text
        link_dict['author'] = driver.find_element_by_xpath(
            '//li[@itemprop="author"]').text
        try:
            link_dict['date'] = driver.find_element_by_xpath(
                '//time[@itemprop="datePublished"]').get_attribute('datetime')
            link_dict['date'] = pd.to_datetime(link_dict['date'])
        except Exception:
            link_dict['date'] = pd.NaT  # NaT keeps the time_border comparison below valid
        article_p_list = []
        for p in driver.find_elements_by_xpath(
                '//div[@itemprop="articleBody"]//p'):
            article_p_list.append(p.text)
        link_dict['text'] = '\n\n'.join(article_p_list)

        print(link_dict['date'], time_border)
        topic_list.append(link_dict)
        if link_dict['date'] > time_border:
            any_in = True

    articles_df_inside = pd.DataFrame(topic_list)
    df = pd.DataFrame(all_articles)
    df_temp = pd.merge(df, articles_df_inside, on='link', how='right')
    df_all = df_all.append(df_temp)

    if not any_in:
        df_all.to_csv(csv_dir + 'conversation' + '.csv')

    else:

        i = i + 1
        next_page = 'https://theconversation.com/uk/technology/articles' + '?page=' + str(
            i)
        driver.get(next_page)
        print(next_page)
        arts = []
        article_links(arts)

        return download_articles(all_articles=arts,
                                 time_border=time_border,
                                 df_all=df_all,
                                 csv_dir=csv_dir,
                                 i=i)


    return df_all
def article_links(time_border, art_df, category='//'):
    stop = False
    article_category = category.split('/')[-2]
    articles = driver.find_elements_by_xpath('//article')
    for article in articles:
        article_link = article.find_element_by_xpath('.//a').get_attribute(
            'href')

        # 2a
        article_title = article.find_element_by_xpath('.//h2').text

        # 2b
        article_abstract = article.find_element_by_xpath(
            './/div[@class="summary"]').text

        # 2c
        article_author = article.find_element_by_xpath(
            './div[@class="meta_list"]/h4/a').text

        # 2d
        date = article.find_element_by_xpath(
            './div[@class="meta_list"]/h4').text

        date = re.search(r'[0-9]+ (.+) [0-9]+,', date).group()[:-1]
        article_date = pd.to_datetime(date)

        # 2e
        article_number_of_comments = article.find_element_by_xpath(
            './div[@class="meta_list"]/h4').text.split(',')[-1]

        art_df.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'abstract_outside': article_abstract,  # 2b
            'author_outside': article_author,  # 2c
            'date_outside': article_date,  # 2d
            'comments_count_outside': article_number_of_comments  # 2e
        })

        if article_date < time_border:
            stop = True

    if not stop:
        sleep_r('m')
        next_page = driver.find_elements_by_xpath(
            '//ul[@class="pagination"]/li[contains(@class, "arrow")]/a'
        )[-1].get_attribute('href')

        curr_page = driver.current_url
        if curr_page == next_page:
            stop = True
            return art_df, stop
        driver.get(next_page)
        return article_links(time_border=time_border,
                             art_df=art_df,
                             category=category)

    return art_df, stop
Example #4
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-2]
    print('category', article_category)
#     articles_all = driver.find_element_by_class_name('headlines').find_elements_by_class_name('story_link')
    articles_all = driver.find_elements_by_xpath('//div[contains(@class, "rt-1")]/article')
    if len(articles_all) == 0:
        articles_all = driver.find_elements_by_xpath('//div[contains(@class, "rt-")]/article')
#         articles_all = driver.find_elements_by_xpath('//div[contains(@class, "one_story")]')

    for article in articles_all:
        article_link = article.get_attribute('href')
        if article_link is None:
            article_link = article.find_element_by_xpath('./a').get_attribute('href')

        try:
            article_title = article.find_element_by_tag_name('h4').text  # 2a
        except NoSuchElementException:
            article_title = article.find_element_by_tag_name('h3').text

        article_abstract = article.find_element_by_class_name('standfirst').text  # 2b

        try:
            date = article.find_element_by_class_name('time_stamp').get_attribute('data-epoch')
        except NoSuchElementException:
            date = np.nan

        article_date = pd.to_datetime(date, unit='s')
        print(date, article_date, article_link)
        arts.append({'link': article_link,
                     'category': article_category,
                     'title_outside': article_title,  # 2a
                     'abstract_outside': article_abstract,
                     'date_outside': article_date  # 2d
                     })

        if article_date < time_border:
            stop_categories = True

    if not stop_categories:
        sleep_r('m')
        try:
            next_page = driver.find_element_by_xpath('//div[contains(@class, "more_content")]//a')
            driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            next_page = driver.find_element_by_class_name('earlier_pages').find_elements_by_tag_name('a')
            driver.get(next_page[-1].get_attribute('href'))
        except Exception as e:
            print(e)
            return arts

        return categories_links(time_border=time_border, arts=arts, category=category)

    return arts
Example #5
def download_articles(all_articles, time_border, csv_dir):
    print(all_articles)
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art)
        link_dict = {}
        try:
            driver.get(art['link'])

            sleep_r('m')

            link_dict['link'] = art['link']
            try:
                link_dict['title'] = driver.find_element_by_xpath('//h1[@itemprop="headline"]').text
            except NoSuchElementException:
                try:
                    link_dict['title'] = driver.find_element_by_xpath('//h1[@articleprop="headline"]').text
                except NoSuchElementException:
                    print(art['link'], 'no title')
                    continue
            try:
                link_dict['author'] = driver.find_element_by_xpath('//span[@itemprop="name"]').text
            except NoSuchElementException:
                link_dict['author'] = ''

            try:
                link_dict['description'] = driver.find_element_by_xpath('//div[@class="gs-container"]//p').text
            except NoSuchElementException:
                link_dict['description'] = ''

            try:
                link_dict['date'] = driver.find_element_by_xpath(
                    '//div[contains(@class,"content__meta-container")]//time').get_attribute('datetime')  # 5a
            except NoSuchElementException:
                continue  # we don't need articles if we're not certain about date
            if pd.to_datetime(link_dict['date']) < time_border:
                continue

            article_p_list = []
            for p in [x.text for x in driver.find_elements_by_xpath('//div[(@itemprop="articleBody") or '
                      '(@itemprop="reviewBody")]/p')]:
                article_p_list.append(p)

            link_dict['text'] = '\n\n'.join(article_p_list)

            art.update(link_dict)
            all_articles[i_art] = art

        except TimeoutException:
            print('timeout', art['link'])
            all_articles.append(art)
            continue

    pd.DataFrame(all_articles).to_csv(csv_dir + 'guardian.csv')
Example #6
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby(
            'category'):  # only one category at a time
        print(cat, df_grp.shape)
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            if link is None:
                print('no', link)
                continue
            print(i, link)
            sleep_r('m')
            driver.get(link)
            sleep_r('s')

            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_class_name(
                'article_head').find_element_by_tag_name('h1').text  # 4a
            link_dict['description'] = driver.find_element_by_class_name(
                'article_head').find_element_by_tag_name('h2').text  # 2b/4a
            link_dict['author'] = driver.find_element_by_class_name(
                'byline').find_element_by_tag_name('a').text
            link_dict['date'] = driver.find_element_by_class_name(
                'dateline').text

            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            print(pd.to_datetime(link_dict['date']),
                  pd.to_datetime(row['date_outside']), time_border)
            if pd.to_datetime(
                    link_dict['date']) < time_border or pd.to_datetime(
                        row['date_outside']) < time_border:
                continue

            article_p_list = []
            for p in driver.find_element_by_id(
                    'body').find_elements_by_tag_name('p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a

            topic_list.append(link_dict)

        articles_df_inside = pd.DataFrame(
            topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape[0])

        # merging outside with inside
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'register_' + cat + '.csv')
Example #7
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-2]
    articles_all = driver.find_elements_by_xpath(
        '//li[contains(@class, "article")]')
    for article in articles_all:
        article_link = article.find_element_by_xpath('.//a').get_attribute(
            'href')
        article_title = article.find_element_by_xpath(
            './/header/h2').text  # 2a
        article_abstract = article.find_element_by_xpath(
            './/header/p[@class="excerpt"]').text  # 2b
        article_author = article.find_element_by_xpath(
            './/p[@class="byline"]//span[@itemprop="name"]').text  # 2c
        date = article.find_element_by_xpath('.//time')
        article_date = pd.to_datetime(date.get_attribute('datetime'))  # 2d
        article_number_of_comments = article.find_element_by_xpath(
            './/footer//span[@class="comment-count-number"]').text  # 2e

        arts.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'abstract_outside': article_abstract,  # 2b
            'author_outside': article_author,  # 2c
            'date_outside': article_date,  # 2d
            'comments_count_outside': article_number_of_comments
        }  # 2e
                    )
        print(article_date, time_border)
        if article_date < time_border:
            stop_categories = True

    if not stop_categories:
        sleep_r('m')
        next_page_div = driver.find_elements_by_xpath(
            '//div[contains(@class, "prev-next-links")]/a')
        if len(next_page_div) == 1:  # only "load more stories" is available on the first page
            next_page = next_page_div[0].get_attribute('href')
        elif len(next_page_div) > 1:  # older stories / newer stories
            next_page = next_page_div[0].find_element_by_xpath(
                './../a[@rel="prev"]').get_attribute('href')
        else:
            return arts
        driver.get(next_page)
        return categories_links(time_border=time_border,
                                arts=arts,
                                category=category)

    return arts
Example #8
def categories_links(time_border, arts, category):
    any_in = False
    print('---')
    article_category = category.split('/')[-2]
    articles_all = driver.find_elements_by_xpath('//article[not(contains(@class, "sub-post"))]')
    for article in articles_all:
        article_link = article.find_element_by_tag_name('a').get_attribute('href')
        article_title = article.find_element_by_class_name('entry-title').text
        try:
            article_author = article.find_element_by_class_name('article-header').find_element_by_css_selector(
                '.author').text
        except NoSuchElementException:
            article_author = "NaN"
        try:
            article_date = article.find_element_by_class_name('article-header').find_element_by_tag_name('time').text
            article_date = pd.to_datetime(article_date)
        except Exception:
            article_date = "NaN"
        print(article_date)
        try:
            # sponsor=article.find_element_by_class_name('article-header').find_element_by_class_name('sponsored-by ').find_element_by_tag_name('a').get_attribute('href')
            sponsored_div = article.find_elements_by_xpath('./header/div[contains(@class, "sponsored-by")]//a')
            article_sponsor = ''
            if len(sponsored_div) > 0:
                article_sponsor = sponsored_div[0].get_attribute('href')

        except Exception:
            article_sponsor = "NaN"  # keep article_sponsor defined for the append below

        if article_date == 'NaN' or article_date > time_border:
            any_in = True

            arts.append({'link': article_link,
                         'title_outside': article_title,
                         'author_outside': article_author,
                         'date_outside': article_date,
                         'sponsor_outside': article_sponsor,
                         'category': article_category}
                        )
        else:
            return arts

    if not any_in:
        return arts
    else:
        sleep_r('m')
        next_page = driver.find_element_by_class_name('page-numbers').find_element_by_class_name('next').get_attribute(
            'href')
        driver.get(next_page)
        return categories_links(time_border=time_border, arts=arts, category=category)
def download_articles(all_articles, csv_dir):
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        sleep_r('s')
        link = art['link']
        driver.get(link)
        sleep_r('m')

        link_dict['link'] = link
        link_dict['title'] = driver.find_element_by_xpath(
            '//h1[contains(@class, "headline")]').text
        link_dict['author'] = driver.find_element_by_xpath(
            '//div[contains(@class, "first-container")]/div/div').text
        if 'MIN READ' in link_dict['author']:
            try:
                link_dict['author'] = driver.find_element_by_xpath(
                    '//div[contains(@class, "first-container")]/div/p').text
            except NoSuchElementException:
                try:
                    link_dict['author'] = driver.find_element_by_xpath(
                        '//p[@class="Attribution_content"]').text
                except NoSuchElementException:
                    link_dict['author'] = 'NaN'
        try:
            link_dict['date'] = driver.find_element_by_xpath(
                '//div[contains(@class, "date")]').text.split('/')[0]
            link_dict['date'] = pd.to_datetime(link_dict['date'])
        except Exception:
            link_dict['date'] = 'NaN'

        article_p_list = []
        for p in driver.find_elements_by_xpath(
                '//div[contains(@class, "body")]/p'):
            try:
                article_p_list.append(p.text)
            except Exception:
                pass
        link_dict['text'] = '\n\n'.join(article_p_list)

        topic_list.append(link_dict)

    articles_df_inside = pd.DataFrame(topic_list)
    print(articles_df_inside.shape[0])
    df = pd.DataFrame(all_articles)

    df_all = pd.merge(df, articles_df_inside, on='link', how='right')
    df_all.to_csv(csv_dir + 'reuters' + '.csv')
def download_articles(all_articles, csv_dir, category_name, driver):
    for i_art, art in enumerate(all_articles):
        link_dict = {}
        print(i_art, len(all_articles), category_name, art['link'])

        try:
            driver.get(art['link'])

            sleep_r('m')
            link_dict['link'] = art['link']

            try:
                link_dict['title'] = driver.find_element_by_xpath('//header/h1[@itemprop = "headline"]').text

            except NoSuchElementException:
                link_dict['title'] = ''

            try:
                link_dict['author'] = driver.find_element_by_xpath('//p/a[@itemprop="author"]').text
            except NoSuchElementException:
                link_dict['author'] = ''

            try:
                link_dict['description'] = driver.find_element_by_xpath(
                    '//header[@class="storyHeader article"]/p[@itemprop="description alternativeHeadline"]').text
            except NoSuchElementException:
                link_dict['description'] = ''

            try:
                link_dict['date'] = driver.find_element_by_xpath(
                    '//header[@class="storyHeader article"]/div/p/time').get_attribute('datetime')  # 5a
            except NoSuchElementException:
                continue

            article_p_list = []
            for p in [x.text for x in driver.find_elements_by_xpath('//article/div/p')]:
                article_p_list.append(p)

            link_dict['text'] = '\n\n'.join(article_p_list)

            art.update(link_dict)
            all_articles[i_art] = art


        except TimeoutException:
            print('timeout', art['link'])
            all_articles.append(art)
            continue

    pd.DataFrame(all_articles).to_csv(csv_dir + 'zdnet_' + category_name + '.csv')
def article_links(time_border, arts):
    any_in = False
    articles = driver.find_elements_by_xpath(
        '//section[@class="main"]//div[contains(@class, "post-wrapper")]')
    for article in articles:
        article_link = article.find_element_by_xpath(
            './/h1[contains(@class, "headline")]/a').get_attribute('href')
        if ('gizmodo' not in article_link) or ('io9.gizmodo' in article_link):
            continue

        article_title = article.find_element_by_xpath(
            './/h1[(contains(@class, "headline")) or (contains(@class, "title"))]'
        ).text
        try:
            article_date = article.find_element_by_xpath(
                './/div[contains(@class, "meta__container")]/time'
            ).get_attribute('datetime')
            article_date = pd.to_datetime(article_date)
        except Exception:
            article_date = 'NaN'
        try:
            article_author = article.find_element_by_xpath(
                './/div[contains(@class, "author")]').text
        except NoSuchElementException:
            article_author = 'NaN'

        if article_date == 'NaN' or article_date > time_border:
            print(article_date)
            any_in = True
            arts.append({
                'link': article_link,
                'title_outside': article_title,
                'date_outside': article_date,
                'author_outside': article_author,
            })
        #else:
        #    return arts

    if not any_in:
        return arts

    else:
        sleep_r('m')
        next_page = driver.find_element_by_xpath(
            '//div[@class="load-more__button"]/a').get_attribute('href')
        driver.get(next_page)
        return article_links(time_border=time_border, arts=arts)

Example #12
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-2]
    articles_all = driver.find_element_by_class_name(
        'skin-wrapper').find_elements_by_xpath(
            '//a[@class="article"] | //div/article')
    try:
        date = articles_all[-1].find_element_by_class_name(
            'the-time').get_attribute('title')
    except (NoSuchElementException, IndexError):
        date = None
    if date is None:
        article_date = pd.NaT  # a missing date never triggers the stop condition
    else:
        article_date = pd.to_datetime(str(date)[:11])
    if article_date < time_border:
        stop_categories = True

    if stop_categories:
        for article in articles_all:
            article_link = article.find_element_by_tag_name('a').get_attribute(
                'href')
            article_title = article.text  # 2a
            article_date = article.find_element_by_class_name(
                'the-time').get_attribute('title')
            print(date, article_date)
            arts.append({
                'link': article_link,
                'category': article_category,
                'title_outside': article_title,  # 2a
                'date_outside': article_date  # 2d
            })
        print(len(arts), article_category, category)
        return arts
    else:
        try:
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            sleep_r('l')
            # sleep_r('m')
            driver.find_element_by_xpath(
                '//div[@class="load-more"]/button').click()
            # driver.execute_script('arguments[0].click();', driver.find_element_by_xpath('//div[@class="load-more"]/button'))
        except Exception as e:
            print(e)

        return categories_links(time_border=time_border,
                                arts=arts,
                                category=category)
Example #13
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby(
            'category'):  # only one category at a time
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            if 'newsletters-signup' not in row['link']:
                link = row['link']
                print(i, link)
            else:
                continue
            sleep_r('m')
            driver.get(link)
            sleep_r('s')

            link_dict['link'] = link

            link_dict['title'] = driver.find_element_by_class_name(
                'article-main-title').text  # 4a
            link_dict['description'] = driver.find_element_by_class_name(
                'article-dek').text  # 2b/4a
            link_dict['author'] = driver.find_element_by_class_name(
                'author-name').text  # 2c
            link_dict['date'] = driver.find_element_by_tag_name('label').text

            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            if pd.to_datetime(
                    link_dict['date'][:11]) < time_border or pd.to_datetime(
                        row['date_outside']) < time_border:
                continue

            article_p_list = []
            for p in driver.find_elements_by_xpath(
                    '//div[contains(@class, "articleBody")]//p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a

            topic_list.append(link_dict)

        articles_df_inside = pd.DataFrame(
            topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape[0])

        # merging outside with inside
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'ieee_' + cat + '.csv')
Example #14
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby(
            'category'):  # only one category at a time
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            print(i, link)

            sleep_r('m')
            driver.get(link)
            sleep_r('s')

            link_dict['link'] = link

            link_dict['title'] = driver.find_element_by_xpath(
                '//h1[@itemprop="headline"]').text  # 4a
            link_dict['description'] = driver.find_element_by_xpath(
                '//h2[@itemprop="description"]').text  # 2b/4a
            link_dict['author'] = driver.find_element_by_xpath(
                '//span[@itemprop="name"]').text  # 2c
            link_dict['date'] = driver.find_element_by_xpath(
                '//section[contains(@class, "post-meta")]//time'
            ).get_attribute('datetime')

            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            if pd.to_datetime(
                    link_dict['date']) < time_border or pd.to_datetime(
                        row['date_outside']) < time_border:
                continue

            article_p_list = []
            for p in driver.find_elements_by_xpath(
                    '//div[@itemprop="articleBody"]/p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a

            topic_list.append(link_dict)

        articles_df_inside = pd.DataFrame(
            topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape[0])

        # merging outside with inside
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'arstechnica_' + cat + '.csv')
Example #15
def categories_links(time_border, art_links):
    temp_art_links = []
    articles = driver.find_elements_by_xpath('//div[@class="fc-item__container"]')
    next_page = driver.find_element_by_xpath('//a[contains(@aria-label, " next page")]').get_attribute('href')
    any_in = False

    for article in articles:
        article_link = article.find_element_by_xpath('.//a').get_attribute('href')
        article_title = article.find_element_by_xpath('.//*[contains(@class,"fc-item__title")]').text
        article_date = article.find_element_by_xpath('.//div[contains(@class, "fc-item__meta")]/time').get_attribute('datetime')

        try:
            article_author = article.find_element_by_xpath('.//div[@class="fc-item__byline"]').text
        except NoSuchElementException:
            article_author = ''

        try:
            article_comment_count = article.find_element_by_xpath('.//a[@data-link-name="Comment count"]').text
        except NoSuchElementException:
            article_comment_count = ''

        if pd.to_datetime(article_date) > time_border:
            any_in = True
            temp_art_links.append({'link': article_link,
                                   'title_outside': article_title,
                                   'date_outside': article_date,
                                   'author_outside': article_author,
                                   'comment_count_outside': article_comment_count,
                                   }
                                  )
    print(art_links)
    if not any_in:
        pickle.dump(art_links, open('guardian_links.pickle', 'wb'))  # in case something goes wrong with articles
        return art_links
    else:
        art_links += temp_art_links
        sleep_r('m')
        try:
            driver.get(next_page)
        except Exception as ex:
            print(ex)
            return art_links
        return categories_links(time_border=time_border, art_links=art_links)
def download_articles(all_articles, csv_dir):
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        sleep_r('s')
        link = art['link']
        driver.get(link)
        sleep_r('l')

        try:
            link_dict['link'] = link

            link_dict['title'] = driver.find_element_by_xpath('//h1').text
            try:
                link_dict['author'] = driver.find_element_by_xpath(
                    '//div[contains(@class, "author")]').text
            except NoSuchElementException:
                link_dict['author'] = 'NaN'

            link_dict['date'] = driver.find_element_by_xpath(
                '//div[contains(@class,"meta__container")]/time'
            ).get_attribute('datetime')

            article_p_list = []
            for p in [
                    x.text for x in driver.find_elements_by_xpath(
                        '//div[contains(@class, "entry-content")]//p')
            ]:
                article_p_list.append(p)

            link_dict['text'] = '\n\n'.join(article_p_list)
            topic_list.append(link_dict)
            # print(topic_list)
        except Exception as e:
            print('problem', link, e)

    articles_df_inside = pd.DataFrame(topic_list)
    df = pd.DataFrame(all_articles)

    # df_all=pd.merge(df, articles_df_inside, on='link', how='right')
    pd.merge(df, articles_df_inside, on='link',
             how='right').to_csv(csv_dir + 'gizmodo_' + '.csv')
Example #17
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby('category'):
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            print(i, link)
            sleep_r('m')
            driver.get(link)
            sleep_r('s')

            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_id(
                'inner-content').find_element_by_tag_name('h1').text
            try:
                link_dict['author'] = driver.find_element_by_class_name('article-header').find_element_by_css_selector(
                    '.author').text
            except NoSuchElementException:
                link_dict['author'] = "NaN"
            try:
                link_dict['date'] = driver.find_element_by_class_name('entry-time').text
            except NoSuchElementException:
                link_dict['date'] = "NaN"

            # if pd.to_datetime(link_dict['date']) < time_border:
            # continue

            ## It may contain sponsor information at the beginning
            article_p_list = []
            for p in driver.find_element_by_id('main').find_element_by_class_name(
                    'entry-content').find_elements_by_tag_name('p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)

            topic_list.append(link_dict)

        articles_df_inside = pd.DataFrame(topic_list)
        print(articles_df_inside.shape[0])

        df_all = pd.merge(df_grp, articles_df_inside, on='link', how='right')
        df_all.to_csv(csv_dir + 'gigaom_' + cat + '.csv')
def download_articles(articles_df, time_border, csv_dir):
    for cat, df_grp in articles_df.groupby('category'):  # only one category at a time
        topic_list = []
        for i, row in df_grp.iterrows():

            link_dict = {}
            link = row['link']
            print(i, link)

            sleep_r('m')
            driver.get(link)
            sleep_r('s')

            link_dict['link'] = link

            link_dict['title'] = driver.find_element_by_xpath('//header/h1').text
            link_dict['author'] = driver.find_element_by_xpath('//span[@itemprop="name"]').text
            link_dict['date'] = driver.find_element_by_xpath('//time[@itemprop="datePublished"]').get_attribute(
                'datetime')

            # here time_border necessary, no date on the outside
            if pd.to_datetime(link_dict['date']) < time_border:
                continue

            # 4a
            article_p_list = []
            for p in driver.find_elements_by_xpath('//div[@id="content-main"]//p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)

            topic_list.append(link_dict)

        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to dataframe
        print(articles_df_inside.shape[0])

        # merging outside with inside
        if articles_df_inside.shape[0] != 0:  # it may happen that articles_df_inside is empty
            pd.merge(df_grp, articles_df_inside, on='link', how='right').to_csv(
                csv_dir + 'techforge_blockchain_' + cat + '.csv')
def articles_links(time_border, topics, arts, driver):
    for topic in topics:

        first_date = pd.to_datetime('now')
        i = 1
        while first_date > time_border:

            driver.get(topic + '/' + str(i))
            sleep_r('m')

            first_date = pd.to_datetime(
                driver.find_element_by_xpath('//article[@class = "item"]/div/div/p/span').get_attribute('data-date'))
            if first_date is None:
                first_date = pd.to_datetime('now')
            i = i + 1

            # category = driver.find_element_by_xpath('//header/h1[@itemprop = "headline"]').text
            category = topic.split('/')[-2]  # use the current topic, not always the first one
            sleep_r('m')

            articles_all = driver.find_elements_by_xpath('//section[@id="topic-river-latest"]/div/div/div/article')
            if not len(articles_all):
                break
            elif first_date < time_border:  # first_date already holds this page's parsed data-date
                break

            else:
                for article in articles_all:

                    article_link = article.find_element_by_xpath('.//h3/a').get_attribute('href')
                    article_title = article.find_element_by_xpath('.//h3/a').text  # 2a
                    article_abstract = article.find_element_by_xpath('.//p[@class = "summary"]').text  # 2b

                    try:
                        article_author = article.find_element_by_xpath('.//p[@class="meta"]/a').text  # 2c
                    except NoSuchElementException:
                        article_author = ''

                    try:
                        date = article.find_element_by_xpath('.//p[@class="meta"]/span')
                        article_date = pd.to_datetime(date.get_attribute('data-date'))  # 2d
                    except NoSuchElementException:
                        article_date = ''

                    arts.append({'link': article_link,
                                 'category': category,
                                 'title_outside': article_title,  # 2a
                                 'abstract_outside': article_abstract,  # 2b
                                 'author_outside': article_author,  # 2c
                                 'date_outside': article_date  # 2d
                                 })
                sleep_r('l')

    return arts
Example #20
def download_articles(all_articles, time_border, csv_dir):
    all_articles.drop_duplicates(subset=['link'], inplace=True)
    all_links = []
    for i in os.listdir(csv_dir):
        if 'venturebeat' in i:
            all_links.extend(pd.read_csv(csv_dir + i)['link'].unique().tolist())
    for cat, df_grp in all_articles.groupby('category'):  # only one category at a time
        print(cat, df_grp.shape)
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            if row['link'] is not None and row['link'] not in all_links:
                link = row['link']
                print(row['link'], all_links[0] if all_links else None)
            else:
                continue
            if pd.to_datetime(row['date_outside']) < time_border:
                continue
            print(i, link)
            sleep_r('m')
            while True:
                try:
                    driver.get(link)
                    break
                except TimeoutException:
                    sleep_r('l')
            sleep_r('s')

            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_class_name('article-header').find_element_by_class_name(
                'article-title').text  # 4a
            link_dict['author'] = driver.find_element_by_xpath('//a[contains(@class, "author")]').text
            link_dict['date'] = driver.find_element_by_class_name('the-time').get_attribute('title')[:11]
            print(link_dict['date'], pd.to_datetime(link_dict['date']), row['date_outside'], time_border)
            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            if pd.to_datetime(link_dict['date']) < time_border or pd.to_datetime(row['date_outside']) < time_border:
                continue

            article_p_list = []
            for p in driver.find_element_by_class_name('article-content').find_elements_by_tag_name('p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a

            topic_list.append(link_dict)
            print(len(topic_list))
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape)

        # merging outside with inside
        print('save', cat)
        pd.merge(df_grp, articles_df_inside, on='link', how='right').to_csv(csv_dir + 'venturebeat_' + cat + '.csv')
Example #21
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-1]
    try:
        driver.execute_script('splashpage.closeit()')
    except Exception:
        pass
    try:
        driver.find_element_by_class_name('cc-compliance').click()
        sleep_r('l')
    except Exception:
        print('no cc compliance')
        pass

    articles_all = driver.find_element_by_class_name(
        'topic-wrap').find_elements_by_tag_name('article')

    for article in articles_all:

        try:
            article_link = article.find_element_by_tag_name('a').get_attribute(
                'href')
        except NoSuchElementException:
            print(article.text, 'no link')
            continue
        article_title = article.find_element_by_tag_name('h3').text  # 2a
        try:
            article_author = article.find_element_by_class_name(
                'author-name').text  # 2c
        except NoSuchElementException:
            article_author = np.nan
        try:
            date = article.find_element_by_tag_name('label').text
        except NoSuchElementException:
            try:
                date = article.find_element_by_tag_name('time').text
            except NoSuchElementException:
                date = np.nan

        if len(str(date)) > 10:
            article_date = pd.to_datetime(date[:11])
        elif 4 < len(str(date)) < 10:
            # the date string has no year (e.g. "12 Mar"); assume the current year
            date = str(pd.to_datetime('now').year) + ' ' + str(date)
            article_date = pd.to_datetime(date, format="%Y %d %b")
        else:
            article_date = pd.to_datetime('2100-01-01')

        arts.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'author_outside': article_author,  # 2c
            'date_outside': article_date  # 2d
        })

        if article_date < time_border:
            stop_categories = True

    if not stop_categories:
        sleep_r('m')
        WebDriverWait(driver, 100).until(
            lambda driver: driver.find_element_by_id('blog-load-more'))
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        sleep_r('s')
        loadMoreButton = driver.find_element_by_id('blog-load-more')
        try:
            loadMoreButton.click()
        except Exception as e:
            print(e)
            return arts

        return categories_links(time_border=time_border,
                                arts=arts,
                                category=category)

    return arts


articles_df = []  # keep all articles in this list

## [:2] - default first two topics
## remember to pass a list in for i in topics[begin:end], otherwise it will not work
## you may freely change the parameters in two lines below
time_border = pd.to_datetime('now') - pd.Timedelta('190 days')  # 3
topics = list(set(topics_links()))
for i in topics:
    print(i)
    driver.get(i)
    stop = False

    while True:
        sleep_r('m')
        inf_more = driver.find_element_by_xpath('//a[@class="inf-more-but"]')
        if 'display: none' in inf_more.get_attribute('style'):
            break
        else:
            driver.execute_script('arguments[0].click();', inf_more)
            sleep_r('m')

    articles_df = article_links(art_df=articles_df, category=i)

articles_df = pd.DataFrame(articles_df)  # finally convert to dataframe

download_articles(articles_df, time_border, csv_dir_common())
Example #23


csvs = csv_dir_common()
topics = topics_links()
arts = []

# remember to pass a list in for i (even if downloading a single topic), otherwise it will not work
# you may freely change the parameters in two lines below
time_limit = pd.to_datetime('now') - pd.Timedelta('190 days')  # 3
articles = []  # keep all articles in this list
print(topics)
for topic in topics:
    articles = []  # keep all articles in this list
    print(topic)
    driver.get(topic)
    print('topic', topic, 'opened')
    sleep_r('l')
    articles = categories_links(time_border=time_limit, arts=articles, category=topic)

    articles_df = pd.DataFrame(articles)
    download_articles(articles_df, time_border=time_limit, csv_dir=csvs)
def tf_site(site):
    driver = full_driver()
    sleep_r('m')
    driver.get(site)

    # find links to categories (BIZ & IT, TECH etc.)
    while True:
        topics_a = driver.find_elements_by_xpath('//ul[@class="right"]/li[contains(@class,'
                                                 '"has-dropdown")]/ul[@class="dropdown"]/li/a')
        if len(topics_a) > 0:
            break
        else:
            print('no topics')
            sleep_r('m')
            driver.get(site)
            sleep_r('m')

    topics = []

    for i in topics_a:
        if '/categories/' in i.get_attribute('href'):
            # if 'cloudcomputing' in site and 'case-studies' in i.get_attribute(
            #         'href') or 'data-analytics' in i.get_attribute('href'):
            #     topics.append(i.get_attribute('href'))
            # if 'cloudcomputing' not in site:
            if 'developer' not in site:
                topics.append(i.get_attribute('href'))
            elif 'Gaming' not in i.get_attribute('href'):
                topics.append(i.get_attribute('href'))

    # topic numbers, if you want to choose only some
    for i, topic in enumerate(topics):
        print(site, i, topic)

    def article_links(time_border, art_df, category='//'):
        stop = False
        article_category = category.split('/')[-2]
        articles = driver.find_elements_by_xpath('//article')
        for article in articles:
            article_link = article.find_element_by_xpath('.//a').get_attribute('href')

            # 2a
            article_title = article.find_element_by_xpath('.//h2').text

            # 2b
            article_abstract = article.find_element_by_xpath('.//div[@class="summary"]').text

            # 2c
            article_author = article.find_element_by_xpath('./div[@class="meta_list"]/h4/a').text

            # 2d
            date = article.find_element_by_xpath('./div[@class="meta_list"]/h4').text

            date = re.search(r'[0-9]+ (.+) [0-9]+,', date).group()[:-1]
            article_date = pd.to_datetime(date)

            # 2e
            article_number_of_comments = article.find_element_by_xpath('./div[@class="meta_list"]/h4').text.split(',')[
                -1]

            art_df.append({'link': article_link,
                           'category': article_category,
                           'title_outside': article_title,  # 2a
                           'abstract_outside': article_abstract,  # 2b
                           'author_outside': article_author,  # 2c
                           'date_outside': article_date,  # 2d
                           'comments_count_outside': article_number_of_comments}  # 2d
                          )

            if article_date < time_border:
                stop = True

        if not stop:
            sleep_r('m')
            next_page = driver.find_elements_by_xpath('//ul[@class="pagination"]/li[contains(@class, "arrow")]/a')[
                -1].get_attribute('href')

            curr_page = driver.current_url
            if curr_page == next_page:
                stop = True
                return art_df, stop
            driver.get(next_page)
            return article_links(time_border=time_border, art_df=art_df, category=category)

        return art_df, stop

    articles_df = []  # keep all articles in this list

    ## 30 days - default timelimit
    ## [:2] - default first two topics
    ## remember to pass a list in for i in topics[begin:end], otherwise it will not work
    ## you may freely change the parameters in two lines below
    timeborder = pd.to_datetime('now') - pd.Timedelta('190 days')  # 3
    print(topics)
    for i in topics:
        print(i)
        driver.get(i)
        stop = False

        while not stop:
            sleep_r('m')
            articles_df, stop = article_links(time_border=timeborder, art_df=articles_df, category=i)

    articles_df = pd.DataFrame(articles_df)  # finally convert to dataframe

    for cat, df_grp in articles_df.groupby('category'):  # only one category at a time
        print(cat, df_grp.shape)
        topic_list = []
        for i, row in df_grp.iterrows():

            link_dict = {}
            link = row['link']
            # print(i, link)

            # articles_df may contain too many articles
            # timeborder is checked only with regards to page
            # no reasonable change in speed, current solution avoids breaking a for loop
            if pd.to_datetime(row['date_outside']) < timeborder:
                continue

            sleep_r('s')
            while True:
                try:
                    driver.get(link)
                    break
                except TimeoutException:
                    sleep_r('l')
            sleep_r('s')

            link_dict['link'] = link

            link_dict['title'] = driver.find_element_by_xpath('//h2').text
            if len(driver.find_elements_by_xpath('//div[@class="meta"]')) > 0:
                link_dict['author'] = ornone(
                    driver.find_element_by_xpath('//div[@class="meta"]/h4/a[@rel="author"]').text)
                link_dict['date'] = ornone(driver.find_element_by_xpath('//div[@class="meta"]/h4').text.split('\n')[-2])
                # as above, but with date inside (double-check)
                # doesn't seem necessary, but doesn't make code slower
                # print(link_dict['date'], link_dict['date'].split(',')[0], pd.to_datetime(link_dict['date'].split(',')[0]))
                if pd.to_datetime(link_dict['date'].split(',')[0]) < timeborder:
                    continue
            link_dict['categories'] = ', '.join(
                [x.text for x in driver.find_elements_by_xpath('//div[@class="meta"]/a[@id="categories"]')])

            # 4a
            article_p_list = []
            for p in driver.find_elements_by_xpath('//div[@class="content"]//p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)

            topic_list.append(link_dict)

        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to dataframe
        print(articles_df_inside.shape[0])

        # merging outside with inside
        if articles_df_inside.shape[0] != 0:  # it may happen that articles_df_inside is empty
            pd.merge(df_grp, articles_df_inside, on='link', how='right').to_csv(csv_dir_common() +
                'techforge_' + site.split('.')[-2].replace('http://','') + '_' + cat + '.csv')
Example #25
def download_articles(all_articles, time_border, df_all, i, csv_dir):
    stop_categories = False
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        sleep_r('m')
        driver.get(art['link'])
        sleep_r('s')
        link_dict['link'] = art['link']
        link_dict['title'] = driver.find_element_by_class_name(
            'post__title').find_element_by_tag_name('a').text
        try:
            link_dict['author'] = driver.find_element_by_class_name(
                'post__by').text
        except NoSuchElementException:
            link_dict['author'] = "NaN"
        try:
            link_dict['date'] = driver.find_element_by_class_name(
                'eyebrow__item').find_element_by_tag_name(
                    'time').get_attribute('datetime')
            article_date = pd.to_datetime(link_dict['date'])
        except Exception:
            article_date = pd.to_datetime('2050-01-01')

        article_p_list = []
        retry_count = 0
        while retry_count < 5:
            try:
                for p in driver.find_element_by_class_name(
                        'post__article').find_elements_by_tag_name('p'):
                    article_p_list.append(p.text)
                link_dict['text'] = '\n\n'.join(article_p_list)
                break
            except Exception:
                retry_count += 1
                driver.refresh()
                sleep_r('m')

        topic_list.append(link_dict)

        if article_date < time_border:
            stop_categories = True

    articles_df_inside = pd.DataFrame(topic_list)
    df = pd.DataFrame(all_articles)
    df_temp = pd.merge(df, articles_df_inside, on='link', how='right')
    df_all.append(df_temp)

    if not stop_categories:
        i = i + 1
        next_page = 'https://www.fastcompany.com/category/technology/' + str(i)
        driver.get(next_page)
        print(next_page)
        arts = []
        article_links(arts)

        return download_articles(all_articles=arts,
                                 time_border=time_border,
                                 df_all=df_all,
                                 i=i,
                                 csv_dir=csv_dir)

    pd.DataFrame(pd.concat(df_all)).to_csv(csv_dir + 'fastcompany' + '.csv')

    return df_all