def article_links(arts):
    sleep_r('m')
    articles_all = driver.find_element_by_class_name(
        'all-feed').find_elements_by_tag_name('article')
    if not articles_all:
        driver.refresh()
        sleep_r('l')
        articles_all = driver.find_element_by_class_name(
            'all-feed').find_elements_by_tag_name('article')
    for article in articles_all:
        article_link = article.find_element_by_tag_name('a').get_attribute('href')
        article_title = article.find_element_by_tag_name('a').get_attribute('title')
        try:
            article_abstract = article.find_element_by_tag_name('h3').text
        except NoSuchElementException:
            article_abstract = 'NaN'
        if 'the-only-hawaiian-shirt-you-should-wear-this-summer' in article_link:
            continue
        arts.append({
            'link': article_link,
            'title_outside': article_title,
            'abstract_outside': article_abstract
        })
    return arts
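# ---------------------------------------------------------------------------
# Shared environment assumed by every scraper in this file but never shown in
# it: module-level imports, a Selenium `driver`, the randomized-delay helper
# `sleep_r`, plus `csv_dir_common`, `ornone` and `full_driver`. The sketch
# below is a minimal assumption (the delay ranges, headless Chrome and the
# './csvs/' path are guesses, not the original implementation).
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
                                        StaleElementReferenceException,
                                        TimeoutException)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait


def sleep_r(size):
    """Sleep for a randomized interval: 's'hort, 'm'edium or 'l'ong."""
    ranges = {'s': (1, 3), 'm': (3, 7), 'l': (7, 15)}
    time.sleep(random.uniform(*ranges[size]))


def csv_dir_common():
    """Directory where the per-site CSV files are written (assumed path)."""
    return './csvs/'


def ornone(value):
    """Return the value unchanged, or None for empty strings (assumed)."""
    return value if value else None


def full_driver():
    """Build the shared Selenium driver (headless Chrome assumed)."""
    options = Options()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)


driver = full_driver()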
def download_articles(all_articles, time_border, df_all, csv_dir, i):
    any_in = False
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        driver.get(art['link'])
        sleep_r('m')
        link_dict['link'] = art['link']
        link_dict['title'] = driver.find_element_by_xpath(
            '//h1[@itemprop="name"]').text
        link_dict['author'] = driver.find_element_by_xpath(
            '//li[@itemprop="author"]').text
        try:
            link_dict['date'] = driver.find_element_by_xpath(
                '//time[@itemprop="datePublished"]').get_attribute('datetime')
            link_dict['date'] = pd.to_datetime(link_dict['date'])
        except (NoSuchElementException, ValueError):
            link_dict['date'] = 'NaN'
        article_p_list = []
        for p in driver.find_elements_by_xpath('//div[@itemprop="articleBody"]//p'):
            article_p_list.append(p.text)
        link_dict['text'] = '\n\n'.join(article_p_list)
        print(link_dict['date'], time_border)
        topic_list.append(link_dict)
        # treat an unknown date as recent, so pagination continues
        if link_dict['date'] == 'NaN' or link_dict['date'] > time_border:
            any_in = True
    articles_df_inside = pd.DataFrame(topic_list)
    df = pd.DataFrame(all_articles)
    df_temp = pd.merge(df, articles_df_inside, on='link', how='right')
    df_all = df_all.append(df_temp)
    if any_in:
        i = i + 1
        next_page = ('https://theconversation.com/uk/technology/articles'
                     '?page=' + str(i))
        driver.get(next_page)
        print(next_page)
        arts = []
        article_links(arts)
        return download_articles(all_articles=arts, time_border=time_border,
                                 df_all=df_all, csv_dir=csv_dir, i=i)
    df_all.to_csv(csv_dir + 'conversation' + '.csv')
    return df_all
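# DataFrame.append(), used above to grow df_all, was deprecated in pandas 1.4
# and removed in pandas 2.0. A minimal equivalent for current pandas:
def accumulate(df_all, df_temp):
    # pd.concat without ignore_index matches append()'s default behaviour
    return pd.concat([df_all, df_temp])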
def article_links(time_border, art_df, category='//'):
    stop = False
    article_category = category.split('/')[-2]
    articles = driver.find_elements_by_xpath('//article')
    for article in articles:
        article_link = article.find_element_by_xpath('.//a').get_attribute('href')
        # 2a
        article_title = article.find_element_by_xpath('.//h2').text
        # 2b
        article_abstract = article.find_element_by_xpath(
            './/div[@class="summary"]').text
        # 2c
        article_author = article.find_element_by_xpath(
            './div[@class="meta_list"]/h4/a').text
        # 2d
        date = article.find_element_by_xpath('./div[@class="meta_list"]/h4').text
        date = re.search(r'[0-9]+ (.+) [0-9]+,', date).group()[:-1]
        article_date = pd.to_datetime(date)
        # 2e
        article_number_of_comments = article.find_element_by_xpath(
            './div[@class="meta_list"]/h4').text.split(',')[-1]
        art_df.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'abstract_outside': article_abstract,  # 2b
            'author_outside': article_author,  # 2c
            'date_outside': article_date,  # 2d
            'comments_count_outside': article_number_of_comments  # 2e
        })
        if article_date < time_border:
            stop = True
    if not stop:
        sleep_r('m')
        next_page = driver.find_elements_by_xpath(
            '//ul[@class="pagination"]/li[contains(@class, "arrow")]/a'
        )[-1].get_attribute('href')
        curr_page = driver.current_url
        if curr_page == next_page:
            stop = True
            return art_df, stop
        driver.get(next_page)
        return article_links(time_border=time_border, art_df=art_df,
                             category=category)
    return art_df, stop
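# The crawlers above recurse once per results page, so a deep backfill can
# exhaust Python's default recursion limit (sys.getrecursionlimit(), usually
# 1000 frames). A minimal iterative sketch of the same pagination walk;
# scrape_page() is a hypothetical helper holding the per-page body above:
def article_links_iterative(time_border, art_df, category='//'):
    while True:
        art_df, stop = scrape_page(time_border, art_df, category)  # hypothetical
        if stop:
            return art_df
        sleep_r('m')
        next_page = driver.find_elements_by_xpath(
            '//ul[@class="pagination"]/li[contains(@class, "arrow")]/a'
        )[-1].get_attribute('href')
        if driver.current_url == next_page:  # last page links to itself
            return art_df
        driver.get(next_page)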
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-2]
    print('category', article_category)
    # articles_all = driver.find_element_by_class_name('headlines').find_elements_by_class_name('story_link')
    articles_all = driver.find_elements_by_xpath(
        '//div[contains(@class, "rt-1")]/article')
    if len(articles_all) == 0:
        articles_all = driver.find_elements_by_xpath(
            '//div[contains(@class, "rt-")]/article')
        # articles_all = driver.find_elements_by_xpath('//div[contains(@class, "one_story")]')
    for article in articles_all:
        article_link = article.get_attribute('href')
        if article_link is None:
            article_link = article.find_element_by_xpath('./a').get_attribute('href')
        try:
            article_title = article.find_element_by_tag_name('h4').text  # 2a
        except NoSuchElementException:
            article_title = article.find_element_by_tag_name('h3').text
        article_abstract = article.find_element_by_class_name('standfirst').text  # 2b
        try:
            date = article.find_element_by_class_name('time_stamp').get_attribute('data-epoch')
        except NoSuchElementException:
            date = np.nan
        article_date = pd.to_datetime(date, unit='s')
        print(date, article_date, article_link)
        arts.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'abstract_outside': article_abstract,
            'date_outside': article_date  # 2d
        })
        if article_date < time_border:
            stop_categories = True
    if not stop_categories:
        sleep_r('m')
        try:
            next_page = driver.find_element_by_xpath(
                '//div[contains(@class, "more_content")]//a')
            driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            next_page = driver.find_element_by_class_name(
                'earlier_pages').find_elements_by_tag_name('a')
            driver.get(next_page[-1].get_attribute('href'))
        except Exception as e:
            print(e)
            return arts
        return categories_links(time_border=time_border, arts=arts,
                                category=category)
    return arts
def download_articles(all_articles, time_border, csv_dir):
    print(all_articles)
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art)
        link_dict = {}
        try:
            driver.get(art['link'])
            sleep_r('m')
            link_dict['link'] = art['link']
            try:
                link_dict['title'] = driver.find_element_by_xpath(
                    '//h1[@itemprop="headline"]').text
            except NoSuchElementException:
                try:
                    link_dict['title'] = driver.find_element_by_xpath(
                        '//h1[@articleprop="headline"]').text
                except NoSuchElementException:
                    print(art['link'], 'no title')
                    continue
            try:
                link_dict['author'] = driver.find_element_by_xpath(
                    '//span[@itemprop="name"]').text
            except NoSuchElementException:
                link_dict['author'] = ''
            try:
                link_dict['description'] = driver.find_element_by_xpath(
                    '//div[@class="gs-container"]//p').text
            except NoSuchElementException:
                link_dict['description'] = ''
            try:
                link_dict['date'] = driver.find_element_by_xpath(
                    '//div[contains(@class,"content__meta-container")]//time'
                ).get_attribute('datetime')  # 5a
            except NoSuchElementException:
                continue  # we don't need articles if we're not certain about the date
            if pd.to_datetime(link_dict['date']) < time_border:
                continue
            article_p_list = []
            for p in [x.text for x in driver.find_elements_by_xpath(
                    '//div[(@itemprop="articleBody") or (@itemprop="reviewBody")]/p')]:
                article_p_list.append(p)
            link_dict['text'] = '\n\n'.join(article_p_list)
            art.update(link_dict)
            all_articles[i_art] = art
        except TimeoutException:
            print('timeout', art['link'])
            all_articles.append(art)  # re-queue the article for a retry at the end
            continue
    pd.DataFrame(all_articles).to_csv(csv_dir + 'guardian.csv')
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby('category'):  # only one category at a time
        print(cat, df_grp.shape)
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            if row['link'] is not None:
                link = row['link']
            else:
                print('no link for row', i)
                continue
            print(i, link)
            sleep_r('m')
            driver.get(link)
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_class_name(
                'article_head').find_element_by_tag_name('h1').text  # 4a
            link_dict['description'] = driver.find_element_by_class_name(
                'article_head').find_element_by_tag_name('h2').text  # 2b/4a
            link_dict['author'] = driver.find_element_by_class_name(
                'byline').find_element_by_tag_name('a').text
            link_dict['date'] = driver.find_element_by_class_name('dateline').text
            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            print(pd.to_datetime(link_dict['date']),
                  pd.to_datetime(row['date_outside']), time_border)
            if (pd.to_datetime(link_dict['date']) < time_border
                    or pd.to_datetime(row['date_outside']) < time_border):
                continue
            article_p_list = []
            for p in driver.find_element_by_id('body').find_elements_by_tag_name('p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a
            topic_list.append(link_dict)
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape[0])
        # merging outside with inside
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'register_' + cat + '.csv')
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-2]
    articles_all = driver.find_elements_by_xpath('//li[contains(@class, "article")]')
    for article in articles_all:
        article_link = article.find_element_by_xpath('.//a').get_attribute('href')
        article_title = article.find_element_by_xpath('.//header/h2').text  # 2a
        article_abstract = article.find_element_by_xpath(
            './/header/p[@class="excerpt"]').text  # 2b
        article_author = article.find_element_by_xpath(
            './/p[@class="byline"]//span[@itemprop="name"]').text  # 2c
        date = article.find_element_by_xpath('.//time')
        article_date = pd.to_datetime(date.get_attribute('datetime'))  # 2d
        article_number_of_comments = article.find_element_by_xpath(
            './/footer//span[@class="comment-count-number"]').text  # 2e
        arts.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'abstract_outside': article_abstract,  # 2b
            'author_outside': article_author,  # 2c
            'date_outside': article_date,  # 2d
            'comments_count_outside': article_number_of_comments  # 2e
        })
        print(article_date, time_border)
        if article_date < time_border:
            stop_categories = True
    if not stop_categories:
        sleep_r('m')
        next_page_div = driver.find_elements_by_xpath(
            '//div[contains(@class, "prev-next-links")]/a')
        if len(next_page_div) == 1:  # only "load more stories" available on the first page
            next_page = next_page_div[0].get_attribute('href')
        elif len(next_page_div) > 1:  # older stories / newer stories
            next_page = next_page_div[0].find_element_by_xpath(
                './../a[@rel="prev"]').get_attribute('href')
        else:
            return arts
        driver.get(next_page)
        return categories_links(time_border=time_border, arts=arts,
                                category=category)
    return arts
def categories_links(time_border, arts, category):
    any_in = False
    print('---')
    article_category = category.split('/')[-2]
    articles_all = driver.find_elements_by_xpath(
        '//article[not(contains(@class, "sub-post"))]')
    for article in articles_all:
        article_link = article.find_element_by_tag_name('a').get_attribute('href')
        article_title = article.find_element_by_class_name('entry-title').text
        try:
            article_author = article.find_element_by_class_name(
                'article-header').find_element_by_css_selector('.author').text
        except NoSuchElementException:
            article_author = "NaN"
        try:
            article_date = article.find_element_by_class_name(
                'article-header').find_element_by_tag_name('time').text
            article_date = pd.to_datetime(article_date)
        except (NoSuchElementException, ValueError):
            article_date = "NaN"
        print(article_date)
        # sponsor = article.find_element_by_class_name('article-header').find_element_by_class_name('sponsored-by ').find_element_by_tag_name('a').get_attribute('href')
        article_sponsor = ''
        sponsored_div = article.find_elements_by_xpath(
            './header/div[contains(@class, "sponsored-by")]//a')
        if len(sponsored_div) > 0:
            article_sponsor = sponsored_div[0].get_attribute('href')
        if article_date == 'NaN' or article_date > time_border:
            any_in = True
            arts.append({
                'link': article_link,
                'title_outside': article_title,
                'author_outside': article_author,
                'date_outside': article_date,
                'sponsor_outside': article_sponsor,
                'category': article_category
            })
        else:
            return arts
    if not any_in:
        return arts
    sleep_r('m')
    next_page = driver.find_element_by_class_name(
        'page-numbers').find_element_by_class_name('next').get_attribute('href')
    driver.get(next_page)
    return categories_links(time_border=time_border, arts=arts, category=category)
def download_articles(all_articles, csv_dir):
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        sleep_r('s')
        link = art['link']
        driver.get(link)
        sleep_r('m')
        link_dict['link'] = link
        link_dict['title'] = driver.find_element_by_xpath(
            '//h1[contains(@class, "headline")]').text
        link_dict['author'] = driver.find_element_by_xpath(
            '//div[contains(@class, "first-container")]/div/div').text
        if 'MIN READ' in link_dict['author']:
            try:
                link_dict['author'] = driver.find_element_by_xpath(
                    '//div[contains(@class, "first-container")]/div/p').text
            except NoSuchElementException:
                try:
                    link_dict['author'] = driver.find_element_by_xpath(
                        '//p[@class="Attribution_content"]').text
                except NoSuchElementException:
                    link_dict['author'] = 'NaN'
        try:
            link_dict['date'] = driver.find_element_by_xpath(
                '//div[contains(@class, "date")]').text.split('/')[0]
            link_dict['date'] = pd.to_datetime(link_dict['date'])
        except (NoSuchElementException, ValueError):
            link_dict['date'] = 'NaN'
        article_p_list = []
        for p in driver.find_elements_by_xpath('//div[contains(@class, "body")]/p'):
            try:
                article_p_list.append(p.text)
            except StaleElementReferenceException:
                pass
        link_dict['text'] = '\n\n'.join(article_p_list)
        topic_list.append(link_dict)
    articles_df_inside = pd.DataFrame(topic_list)
    print(articles_df_inside.shape[0])
    df = pd.DataFrame(all_articles)
    pd.merge(df, articles_df_inside, on='link',
             how='right').to_csv(csv_dir + 'reuters' + '.csv')
def download_articles(all_articles, csv_dir, category_name, driver):
    for i_art, art in enumerate(all_articles):
        link_dict = {}
        print(i_art, len(all_articles), category_name, art['link'])
        try:
            driver.get(art['link'])
            sleep_r('m')
            link_dict['link'] = art['link']
            try:
                link_dict['title'] = driver.find_element_by_xpath(
                    '//header/h1[@itemprop = "headline"]').text
            except NoSuchElementException:
                link_dict['title'] = ''
            try:
                link_dict['author'] = driver.find_element_by_xpath(
                    '//p/a[@itemprop="author"]').text
            except NoSuchElementException:
                link_dict['author'] = ''
            try:
                link_dict['description'] = driver.find_element_by_xpath(
                    '//header[@class="storyHeader article"]'
                    '/p[@itemprop="description alternativeHeadline"]').text
            except NoSuchElementException:
                link_dict['description'] = ''
            try:
                link_dict['date'] = driver.find_element_by_xpath(
                    '//header[@class="storyHeader article"]/div/p/time'
                ).get_attribute('datetime')  # 5a
            except NoSuchElementException:
                continue
            article_p_list = []
            for p in [x.text for x in driver.find_elements_by_xpath('//article/div/p')]:
                article_p_list.append(p)
            link_dict['text'] = '\n\n'.join(article_p_list)
            art.update(link_dict)
            all_articles[i_art] = art
        except TimeoutException:
            print('timeout', art['link'])
            all_articles.append(art)
            continue
    pd.DataFrame(all_articles).to_csv(csv_dir + 'zdnet_' + category_name + '.csv')
def article_links(time_border, arts):
    any_in = False
    articles = driver.find_elements_by_xpath(
        '//section[@class="main"]//div[contains(@class, "post-wrapper")]')
    for article in articles:
        article_link = article.find_element_by_xpath(
            './/h1[contains(@class, "headline")]/a').get_attribute('href')
        if ('gizmodo' not in article_link) or ('io9.gizmodo' in article_link):
            continue
        article_title = article.find_element_by_xpath(
            './/h1[(contains(@class, "headline")) or (contains(@class, "title"))]').text
        try:
            article_date = article.find_element_by_xpath(
                './/div[contains(@class, "meta__container")]/time').get_attribute('datetime')
            article_date = pd.to_datetime(article_date)
        except (NoSuchElementException, ValueError):
            article_date = 'NaN'
        try:
            article_author = article.find_element_by_xpath(
                './/div[contains(@class, "author")]').text
        except NoSuchElementException:
            article_author = 'NaN'
        if article_date == 'NaN' or article_date > time_border:
            print(article_date)
            any_in = True
            arts.append({
                'link': article_link,
                'title_outside': article_title,
                'date_outside': article_date,
                'author_outside': article_author,
            })
        # else:
        #     return arts
    if not any_in:
        return arts
    sleep_r('m')
    next_page = driver.find_element_by_xpath(
        '//div[@class="load-more__button"]/a').get_attribute('href')
    driver.get(next_page)
    return article_links(time_border=time_border, arts=arts)
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-2]
    articles_all = driver.find_element_by_class_name(
        'skin-wrapper').find_elements_by_xpath(
        '//a[@class="article"] | //div/article')
    try:
        date = articles_all[-1].find_element_by_class_name(
            'the-time').get_attribute('title')
    except NoSuchElementException:
        date = np.nan
    if pd.isna(date):  # NB: a `date == np.nan` equality test can never be true
        article_date = pd.NaT  # NaT compares False against time_border below
    else:
        article_date = pd.to_datetime(str(date)[:11])
    if article_date < time_border:
        stop_categories = True
    if stop_categories:
        for article in articles_all:
            article_link = article.find_element_by_tag_name('a').get_attribute('href')
            article_title = article.text  # 2a
            article_date = article.find_element_by_class_name(
                'the-time').get_attribute('title')
            print(date, article_date)
            arts.append({
                'link': article_link,
                'category': article_category,
                'title_outside': article_title,  # 2a
                'date_outside': article_date  # 2d
            })
        print(len(arts), article_category, category)
        return arts
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep_r('l')
        # sleep_r('m')
        driver.find_element_by_xpath('//div[@class="load-more"]/button').click()
        # driver.execute_script('arguments[0].click();', driver.find_element_by_xpath('//div[@class="load-more"]/button'))
    except Exception as e:
        print(e)
    return categories_links(time_border=time_border, arts=arts, category=category)
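# Why the original `date == np.nan` test in the function above could never
# fire: NaN compares unequal to everything, itself included, so the reliable
# check is pd.isna().
assert (np.nan == np.nan) is False  # IEEE-754: NaN != NaN
assert pd.isna(np.nan) and pd.isna(pd.NaT)  # covers both floats and datetimes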
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby('category'):  # only one category at a time
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            if 'newsletters-signup' not in row['link']:
                link = row['link']
                print(i, link)
            else:
                continue
            sleep_r('m')
            driver.get(link)
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_class_name(
                'article-main-title').text  # 4a
            link_dict['description'] = driver.find_element_by_class_name(
                'article-dek').text  # 2b/4a
            link_dict['author'] = driver.find_element_by_class_name(
                'author-name').text  # 2c
            link_dict['date'] = driver.find_element_by_tag_name('label').text
            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            if (pd.to_datetime(link_dict['date'][:11]) < time_border
                    or pd.to_datetime(row['date_outside']) < time_border):
                continue
            article_p_list = []
            for p in driver.find_elements_by_xpath(
                    '//div[contains(@class, "articleBody")]//p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a
            topic_list.append(link_dict)
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape[0])
        # merging outside with inside
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'ieee_' + cat + '.csv')
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby('category'):  # only one category at a time
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            print(i, link)
            sleep_r('m')
            driver.get(link)
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_xpath(
                '//h1[@itemprop="headline"]').text  # 4a
            link_dict['description'] = driver.find_element_by_xpath(
                '//h2[@itemprop="description"]').text  # 2b/4a
            link_dict['author'] = driver.find_element_by_xpath(
                '//span[@itemprop="name"]').text  # 2c
            link_dict['date'] = driver.find_element_by_xpath(
                '//section[contains(@class, "post-meta")]//time').get_attribute('datetime')
            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            if (pd.to_datetime(link_dict['date']) < time_border
                    or pd.to_datetime(row['date_outside']) < time_border):
                continue
            article_p_list = []
            for p in driver.find_elements_by_xpath('//div[@itemprop="articleBody"]/p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a
            topic_list.append(link_dict)
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape[0])
        # merging outside with inside
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'arstechnica_' + cat + '.csv')
def categories_links(time_border, art_links):
    temp_art_links = []
    articles = driver.find_elements_by_xpath('//div[@class="fc-item__container"]')
    next_page = driver.find_element_by_xpath(
        '//a[contains(@aria-label, " next page")]').get_attribute('href')
    any_in = False
    for article in articles:
        article_link = article.find_element_by_xpath('.//a').get_attribute('href')
        article_title = article.find_element_by_xpath(
            './/*[contains(@class,"fc-item__title")]').text
        article_date = article.find_element_by_xpath(
            './/div[contains(@class, "fc-item__meta")]/time').get_attribute('datetime')
        try:
            article_author = article.find_element_by_xpath(
                './/div[@class="fc-item__byline"]').text
        except NoSuchElementException:
            article_author = ''
        try:
            article_comment_count = article.find_element_by_xpath(
                './/a[@data-link-name="Comment count"]').text
        except NoSuchElementException:
            article_comment_count = ''
        if pd.to_datetime(article_date) > time_border:
            any_in = True
            temp_art_links.append({
                'link': article_link,
                'title_outside': article_title,
                'date_outside': article_date,
                'author_outside': article_author,
                'comment_count_outside': article_comment_count,
            })
    print(art_links)
    if not any_in:
        pickle.dump(art_links, open('guardian_links.pickle', 'wb'))  # in case something goes wrong with articles
        return art_links
    art_links += temp_art_links
    sleep_r('m')
    try:
        driver.get(next_page)
    except Exception as ex:
        print(ex)
        return art_links
    return categories_links(time_border=time_border, art_links=art_links)
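# The pickle.dump() above exists so that a crashed download phase can resume
# without re-crawling the listing pages. A minimal resume sketch (this reload
# path is an assumption; only the dump() call appears in the original):
with open('guardian_links.pickle', 'rb') as f:
    art_links = pickle.load(f)
download_articles(art_links,
                  time_border=pd.to_datetime('now') - pd.Timedelta('190 days'),
                  csv_dir=csv_dir_common())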
def download_articles(all_articles, csv_dir):
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        sleep_r('s')
        link = art['link']
        driver.get(link)
        sleep_r('l')
        try:
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_xpath('//h1').text
            try:
                link_dict['author'] = driver.find_element_by_xpath(
                    '//div[contains(@class, "author")]').text
            except NoSuchElementException:
                link_dict['author'] = 'NaN'
            link_dict['date'] = driver.find_element_by_xpath(
                '//div[contains(@class,"meta__container")]/time').get_attribute('datetime')
            article_p_list = []
            for p in [x.text for x in driver.find_elements_by_xpath(
                    '//div[contains(@class, "entry-content")]//p')]:
                article_p_list.append(p)
            link_dict['text'] = '\n\n'.join(article_p_list)
            topic_list.append(link_dict)
            # print(topic_list)
        except Exception:
            print('problem', link)
    articles_df_inside = pd.DataFrame(topic_list)
    df = pd.DataFrame(all_articles)
    pd.merge(df, articles_df_inside, on='link',
             how='right').to_csv(csv_dir + 'gizmodo.csv')
def download_articles(all_articles, time_border, csv_dir):
    for cat, df_grp in all_articles.groupby('category'):
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            print(i, link)
            sleep_r('m')
            driver.get(link)
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_id(
                'inner-content').find_element_by_tag_name('h1').text
            try:
                link_dict['author'] = driver.find_element_by_class_name(
                    'article-header').find_element_by_css_selector('.author').text
            except NoSuchElementException:
                link_dict['author'] = "NaN"
            try:
                link_dict['date'] = driver.find_element_by_class_name('entry-time').text
            except NoSuchElementException:
                link_dict['date'] = "NaN"
            # if pd.to_datetime(link_dict['date']) < time_border:
            #     continue
            # the body may contain sponsor information at the beginning
            article_p_list = []
            for p in driver.find_element_by_id('main').find_element_by_class_name(
                    'entry-content').find_elements_by_tag_name('p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)
            topic_list.append(link_dict)
        articles_df_inside = pd.DataFrame(topic_list)
        print(articles_df_inside.shape[0])
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'gigaom_' + cat + '.csv')
def download_articles(articles_df, time_border, csv_dir):
    for cat, df_grp in articles_df.groupby('category'):  # only one category at a time
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            print(i, link)
            sleep_r('m')
            driver.get(link)
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_xpath('//header/h1').text
            link_dict['author'] = driver.find_element_by_xpath(
                '//span[@itemprop="name"]').text
            link_dict['date'] = driver.find_element_by_xpath(
                '//time[@itemprop="datePublished"]').get_attribute('datetime')
            # here time_border necessary, no date on the outside
            if pd.to_datetime(link_dict['date']) < time_border:
                continue
            # 4a
            article_p_list = []
            for p in driver.find_elements_by_xpath('//div[@id="content-main"]//p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)
            topic_list.append(link_dict)
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to dataframe
        print(articles_df_inside.shape[0])
        # merging outside with inside
        if articles_df_inside.shape[0] != 0:  # it may happen that articles_df_inside is empty
            pd.merge(df_grp, articles_df_inside, on='link', how='right').to_csv(
                csv_dir + 'techforge_blockchain_' + cat + '.csv')
def articles_links(time_border, topics, arts, driver):
    for topic in topics:
        first_date = pd.to_datetime('now')
        i = 1
        while first_date > time_border:
            driver.get(topic + '/' + str(i))
            sleep_r('m')
            first_date = pd.to_datetime(
                driver.find_element_by_xpath(
                    '//article[@class = "item"]/div/div/p/span').get_attribute('data-date'))
            if pd.isna(first_date):  # missing data-date: keep paging
                first_date = pd.to_datetime('now')
            i = i + 1
            # category = driver.find_element_by_xpath('//header/h1[@itemprop = "headline"]').text
            category = topic.split('/')[-2]
            sleep_r('m')
            articles_all = driver.find_elements_by_xpath(
                '//section[@id="topic-river-latest"]/div/div/div/article')
            if not len(articles_all):
                break
            elif first_date < time_border:
                break
            else:
                for article in articles_all:
                    article_link = article.find_element_by_xpath(
                        './/h3/a').get_attribute('href')
                    article_title = article.find_element_by_xpath('.//h3/a').text  # 2a
                    article_abstract = article.find_element_by_xpath(
                        './/p[@class = "summary"]').text  # 2b
                    try:
                        article_author = article.find_element_by_xpath(
                            './/p[@class="meta"]/a').text  # 2c
                    except NoSuchElementException:
                        article_author = ''
                    try:
                        date = article.find_element_by_xpath('.//p[@class="meta"]/span')
                        article_date = pd.to_datetime(date.get_attribute('data-date'))  # 2d
                    except NoSuchElementException:
                        article_date = ''
                    arts.append({
                        'link': article_link,
                        'category': category,
                        'title_outside': article_title,  # 2a
                        'abstract_outside': article_abstract,  # 2b
                        'author_outside': article_author,  # 2c
                        'date_outside': article_date  # 2d
                    })
            sleep_r('l')
    return arts
def download_articles(all_articles, time_border, csv_dir):
    all_articles.drop_duplicates(subset=['link'], inplace=True)
    all_links = []
    for i in os.listdir(csv_dir):
        if 'venturebeat' in i:
            all_links.extend(pd.read_csv(csv_dir + i)['link'].unique().tolist())
    for cat, df_grp in all_articles.groupby('category'):  # only one category at a time
        print(cat, df_grp.shape)
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            if row['link'] is not None and row['link'] not in all_links:
                link = row['link']
                print(row['link'])
            else:
                continue
            if pd.to_datetime(row['date_outside']) < time_border:
                continue
            print(i, link)
            sleep_r('m')
            while True:
                try:
                    driver.get(link)
                    break
                except TimeoutException:
                    sleep_r('l')
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_class_name(
                'article-header').find_element_by_class_name('article-title').text  # 4a
            link_dict['author'] = driver.find_element_by_xpath(
                '//a[contains(@class, "author")]').text
            link_dict['date'] = driver.find_element_by_class_name(
                'the-time').get_attribute('title')[:11]
            print(link_dict['date'], pd.to_datetime(link_dict['date']),
                  row['date_outside'], time_border)
            # all_articles may contain too many articles
            # time_border is checked only with regards to page
            if (pd.to_datetime(link_dict['date']) < time_border
                    or pd.to_datetime(row['date_outside']) < time_border):
                continue
            article_p_list = []
            for p in driver.find_element_by_class_name(
                    'article-content').find_elements_by_tag_name('p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)  # 4a
            topic_list.append(link_dict)
            print(len(topic_list))
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to DataFrame
        print(articles_df_inside.shape)
        # merging outside with inside
        print('save', cat)
        pd.merge(df_grp, articles_df_inside, on='link',
                 how='right').to_csv(csv_dir + 'venturebeat_' + cat + '.csv')
def categories_links(time_border, arts, category):
    stop_categories = False
    article_category = category.split('/')[-1]
    try:
        driver.execute_script('splashpage.closeit()')
    except Exception:
        pass
    try:
        driver.find_element_by_class_name('cc-compliance').click()
        sleep_r('l')
    except NoSuchElementException:
        print('no cc compliance')
    articles_all = driver.find_element_by_class_name(
        'topic-wrap').find_elements_by_tag_name('article')
    for article in articles_all:
        try:
            article_link = article.find_element_by_tag_name('a').get_attribute('href')
        except NoSuchElementException:
            print(article.text, 'no link')
            continue
        article_title = article.find_element_by_tag_name('h3').text  # 2a
        try:
            article_author = article.find_element_by_class_name('author-name').text  # 2c
        except NoSuchElementException:
            article_author = np.nan
        try:
            date = article.find_element_by_tag_name('label').text
        except NoSuchElementException:
            try:
                date = article.find_element_by_tag_name('time').text
            except NoSuchElementException:
                date = np.nan
        if len(str(date)) > 10:
            article_date = pd.to_datetime(date[:11])
        elif 4 < len(str(date)) < 10:
            # short dates like '14 Mar' lack a year; assume the current one
            yr = str(pd.to_datetime('now').year) + ' '
            date = yr + str(date)
            article_date = pd.to_datetime(date, format="%Y %d %b")
        else:
            article_date = pd.to_datetime('2100-01-01')
        arts.append({
            'link': article_link,
            'category': article_category,
            'title_outside': article_title,  # 2a
            'author_outside': article_author,  # 2c
            'date_outside': article_date  # 2d
        })
        if article_date < time_border:
            stop_categories = True
    if not stop_categories:
        sleep_r('m')
        WebDriverWait(driver, 100).until(
            lambda driver: driver.find_element_by_id('blog-load-more'))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep_r('s')
        load_more_button = driver.find_element_by_id('blog-load-more')
        try:
            load_more_button.click()
        except Exception as e:
            print(e)
            return arts
        return categories_links(time_border=time_border, arts=arts, category=category)
    return arts
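# Illustrative behaviour of the three date branches above:
pd.to_datetime('14 Mar 2019, 09:00'[:11])         # long form -> Timestamp('2019-03-14')
pd.to_datetime('2019 14 Mar', format='%Y %d %b')  # short form, year prepended
pd.to_datetime('2100-01-01')                      # far-future sentinel keeps undated items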
articles_df = []  # keep all articles in this list
## [:2] - default first two topics
## remember to pass a list in `for i in topics[begin:end]`, otherwise it will not work
## you may freely change the parameters in the two lines below
time_border = pd.to_datetime('now') - pd.Timedelta('190 days')  # 3
topics = list(set(topics_links()))
for i in topics:
    print(i)
    driver.get(i)
    while True:
        sleep_r('m')
        inf_more = driver.find_element_by_xpath('//a[@class="inf-more-but"]')
        if 'display: none' in inf_more.get_attribute('style'):
            break
        else:
            driver.execute_script('arguments[0].click();', inf_more)
            sleep_r('m')
    articles_df = article_links(art_df=articles_df, category=i)
articles_df = pd.DataFrame(articles_df)  # finally convert to a DataFrame
download_articles(articles_df, time_border, csv_dir_common())
csvs = csv_dir_common()
topics = topics_links()
# remember to pass a list in `for topic in ...` (even if downloading a single
# topic), otherwise it will not work
# you may freely change the parameters in the two lines below
time_limit = pd.to_datetime('now') - pd.Timedelta('190 days')  # 3
print(topics)
for topic in topics:
    articles = []  # keep all articles in this list
    print(topic)
    driver.get(topic)
    print('topic', topic, 'opened')
    sleep_r('l')
    articles = categories_links(time_border=time_limit, arts=articles,
                                category=topic)
    articles_df = pd.DataFrame(articles)
    download_articles(articles_df, time_border=time_limit, csv_dir=csvs)
def tf_site(site):
    driver = full_driver()
    sleep_r('m')
    driver.get(site)
    # find links to categories (BIZ & IT, TECH etc.)
    while True:
        topics_a = driver.find_elements_by_xpath(
            '//ul[@class="right"]/li[contains(@class, "has-dropdown")]'
            '/ul[@class="dropdown"]/li/a')
        if len(topics_a) > 0:
            break
        else:
            print('no topics')
            sleep_r('m')
            driver.get(site)
            sleep_r('m')
    topics = []
    for i in topics_a:
        if '/categories/' in i.get_attribute('href'):
            # if 'cloudcomputing' in site and 'case-studies' in i.get_attribute(
            #         'href') or 'data-analytics' in i.get_attribute('href'):
            #     topics.append(i.get_attribute('href'))
            # if 'cloudcomputing' not in site:
            if 'developer' not in site:
                topics.append(i.get_attribute('href'))
            elif 'Gaming' not in i.get_attribute('href'):
                topics.append(i.get_attribute('href'))
    # topic numbers, if you want to choose only some
    for i, topic in enumerate(topics):
        print(site, i, topic)

    def article_links(time_border, art_df, category='//'):
        stop = False
        article_category = category.split('/')[-2]
        articles = driver.find_elements_by_xpath('//article')
        for article in articles:
            article_link = article.find_element_by_xpath('.//a').get_attribute('href')
            # 2a
            article_title = article.find_element_by_xpath('.//h2').text
            # 2b
            article_abstract = article.find_element_by_xpath(
                './/div[@class="summary"]').text
            # 2c
            article_author = article.find_element_by_xpath(
                './div[@class="meta_list"]/h4/a').text
            # 2d
            date = article.find_element_by_xpath('./div[@class="meta_list"]/h4').text
            date = re.search(r'[0-9]+ (.+) [0-9]+,', date).group()[:-1]
            article_date = pd.to_datetime(date)
            # 2e
            article_number_of_comments = article.find_element_by_xpath(
                './div[@class="meta_list"]/h4').text.split(',')[-1]
            art_df.append({
                'link': article_link,
                'category': article_category,
                'title_outside': article_title,  # 2a
                'abstract_outside': article_abstract,  # 2b
                'author_outside': article_author,  # 2c
                'date_outside': article_date,  # 2d
                'comments_count_outside': article_number_of_comments  # 2e
            })
            if article_date < time_border:
                stop = True
        if not stop:
            sleep_r('m')
            next_page = driver.find_elements_by_xpath(
                '//ul[@class="pagination"]/li[contains(@class, "arrow")]/a'
            )[-1].get_attribute('href')
            curr_page = driver.current_url
            if curr_page == next_page:
                stop = True
                return art_df, stop
            driver.get(next_page)
            return article_links(time_border=time_border, art_df=art_df,
                                 category=category)
        return art_df, stop

    articles_df = []  # keep all articles in this list
    ## 30 days - default time limit
    ## [:2] - default first two topics
    ## remember to pass a list in `for i in topics[begin:end]`, otherwise it will not work
    ## you may freely change the parameters in the two lines below
    timeborder = pd.to_datetime('now') - pd.Timedelta('190 days')  # 3
    print(topics)
    for i in topics:
        print(i)
        driver.get(i)
        stop = False
        while not stop:
            sleep_r('m')
            articles_df, stop = article_links(time_border=timeborder,
                                              art_df=articles_df, category=i)
    articles_df = pd.DataFrame(articles_df)  # finally convert to a DataFrame
    for cat, df_grp in articles_df.groupby('category'):  # only one category at a time
        print(cat, df_grp.shape)
        topic_list = []
        for i, row in df_grp.iterrows():
            link_dict = {}
            link = row['link']
            # print(i, link)
            # articles_df may contain too many articles
            # timeborder is checked only with regards to page
            # no reasonable change in speed, current solution avoids breaking a for loop
            if pd.to_datetime(row['date_outside']) < timeborder:
                continue
            sleep_r('s')
            while True:
                try:
                    driver.get(link)
                    break
                except TimeoutException:
                    sleep_r('l')
            sleep_r('s')
            link_dict['link'] = link
            link_dict['title'] = driver.find_element_by_xpath('//h2').text
            if len(driver.find_elements_by_xpath('//div[@class="meta"]')) > 0:
                link_dict['author'] = ornone(driver.find_element_by_xpath(
                    '//div[@class="meta"]/h4/a[@rel="author"]').text)
                link_dict['date'] = ornone(driver.find_element_by_xpath(
                    '//div[@class="meta"]/h4').text.split('\n')[-2])
                # as above, but with the date inside (double-check)
                # doesn't seem necessary, but doesn't make the code slower
                # print(link_dict['date'], link_dict['date'].split(',')[0], pd.to_datetime(link_dict['date'].split(',')[0]))
                if pd.to_datetime(link_dict['date'].split(',')[0]) < timeborder:
                    continue
                link_dict['categories'] = ', '.join(
                    [x.text for x in driver.find_elements_by_xpath(
                        '//div[@class="meta"]/a[@id="categories"]')])
            # 4a
            article_p_list = []
            for p in driver.find_elements_by_xpath('//div[@class="content"]//p'):
                article_p_list.append(p.text)
            link_dict['text'] = '\n\n'.join(article_p_list)
            topic_list.append(link_dict)
        articles_df_inside = pd.DataFrame(topic_list)  # converting list of dicts to dataframe
        print(articles_df_inside.shape[0])
        # merging outside with inside
        if articles_df_inside.shape[0] != 0:  # it may happen that articles_df_inside is empty
            pd.merge(df_grp, articles_df_inside, on='link', how='right').to_csv(
                csv_dir_common() + 'techforge_' +
                site.split('.')[-2].replace('http://', '') + '_' + cat + '.csv')
def download_articles(all_articles, time_border, df_all, i, csv_dir):
    stop_categories = False
    topic_list = []
    for i_art, art in enumerate(all_articles):
        print(i_art, len(all_articles), art['link'])
        link_dict = {}
        sleep_r('m')
        driver.get(art['link'])
        sleep_r('s')
        link_dict['link'] = art['link']
        link_dict['title'] = driver.find_element_by_class_name(
            'post__title').find_element_by_tag_name('a').text
        try:
            link_dict['author'] = driver.find_element_by_class_name('post__by').text
        except NoSuchElementException:
            link_dict['author'] = "NaN"
        try:
            link_dict['date'] = driver.find_element_by_class_name(
                'eyebrow__item').find_element_by_tag_name('time').get_attribute('datetime')
            article_date = pd.to_datetime(link_dict['date'])
        except (NoSuchElementException, ValueError):
            article_date = pd.to_datetime('2050-01-01')  # far-future sentinel keeps undated articles
        article_p_list = []
        retry_count = 0
        while retry_count < 5:
            try:
                for p in driver.find_element_by_class_name(
                        'post__article').find_elements_by_tag_name('p'):
                    article_p_list.append(p.text)
                link_dict['text'] = '\n\n'.join(article_p_list)
                break
            except NoSuchElementException:
                retry_count += 1
                driver.refresh()
                sleep_r('m')
        topic_list.append(link_dict)
        if article_date < time_border:
            stop_categories = True
    articles_df_inside = pd.DataFrame(topic_list)
    df = pd.DataFrame(all_articles)
    df_temp = pd.merge(df, articles_df_inside, on='link', how='right')
    df_all.append(df_temp)
    if not stop_categories:
        i = i + 1
        next_page = 'https://www.fastcompany.com/category/technology/' + str(i)
        driver.get(next_page)
        print(next_page)
        arts = []
        article_links(arts)
        return download_articles(all_articles=arts, time_border=time_border,
                                 df_all=df_all, i=i, csv_dir=csv_dir)
    pd.concat(df_all).to_csv(csv_dir + 'fastcompany' + '.csv')
    return df_all
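# Hypothetical entry point for the Fast Company scraper above (an assumption;
# no kickoff code for it appears in this file), mirroring the recursive
# page-walk pattern used by the other sites:
driver.get('https://www.fastcompany.com/category/technology/1')
sleep_r('l')
arts = []
article_links(arts)
download_articles(all_articles=arts,
                  time_border=pd.to_datetime('now') - pd.Timedelta('190 days'),
                  df_all=[], i=1, csv_dir=csv_dir_common())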