# Breitbart scraper (Real News set)
import sys

import scrape  # shared helper module: getHTML, encodeParagraph, writeArticleFile


def getUrls():
    urls = []
    # 18 listing pages, roughly 30 articles per page (capped at 500 below)
    for i in range(1, 19):
        tree = scrape.getHTML('http://www.breitbart.com/big-government/page/' + str(i) + '/')
        urls.extend(tree.xpath('//div[@class="article-list"]/article/a/@href'))
    return urls[0:500]


def scraper(url):
    tree = scrape.getHTML(url)
    # Collect body paragraphs and block quotes from the article content.
    paragraphs = tree.xpath(
        '//article/div[@class="entry-content"]/p'
        '|//article/div[@class="entry-content"]/blockquote')
    paragraphsUTF = []
    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls()
    scrape.writeArticleFile('./RealNews/BreitbartArticles', urls, sys.modules[__name__])
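# Every scraper in this collection imports a shared `scrape` helper module that is not
# reproduced in this section. The sketch below is only a guess at what it provides,
# inferred from the call sites (getHTML, encodeParagraph, writeArticleFile) and assuming
# requests + lxml; the real implementation may differ. Each scraper file is then run
# directly as a script: its __main__ block collects URLs and hands them, together with
# the module itself, to writeArticleFile.
import requests
from lxml import html


def getHTML(url):
    # Fetch a page and parse it into an lxml tree that supports .xpath().
    response = requests.get(url)
    return html.fromstring(response.content)


def encodeParagraph(text):
    # Normalize a paragraph's text to a UTF-8 byte string (Python 2 str).
    return text.encode('utf-8')


def writeArticleFile(path, urls, module):
    # Run the calling module's scraper() over every URL and write the articles to
    # `path`, one paragraph per line with a blank line between articles.
    with open(path, 'w') as out:
        for url in urls:
            paragraphs = module.scraper(url)
            out.write('\n'.join(paragraphs) + '\n\n')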
# National Review scraper (Real News set)
import sys

import scrape


def getUrls(numUrls):
    urls = []
    base_path = 'http://www.nationalreview.com'  # assumed site root for the relative hrefs below
    page_num = 1
    while len(urls) < numUrls:
        tree = scrape.getHTML('http://www.nationalreview.com/archives?page=' + str(page_num))
        extensions = tree.xpath(
            '//div[@class=" timeline blog cf p-r"]/ul//a/@href')
        if extensions == []:
            break
        urls.extend([base_path + extension for extension in extensions])
        page_num += 1
    return urls[0:numUrls]


def scraper(url):
    tree = scrape.getHTML(url)
    paragraphs = tree.xpath('//div[@itemprop="articleBody"]/p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        stripped_par = scrape.encodeParagraph(paragraph.text_content()).strip()
        if len(stripped_par) > 0:
            paragraphsUTF.append(stripped_par)
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls(500)
    scrape.writeArticleFile('./RealNews/NationalReviewArticles', urls, sys.modules[__name__])
# The Economist scraper (Real News set)
import sys

import scrape


def getUrls(numUrls):
    urls = []
    base_path = 'http://www.economist.com/'
    page_num = 1
    while len(urls) < numUrls:
        tree = scrape.getHTML('http://www.economist.com/sections/united-states?page=' + str(page_num))
        extensions = tree.xpath('//div[@class="teaser-list"]/article/a/@href')
        if extensions == []:
            break
        urls.extend([base_path + extension for extension in extensions])
        page_num += 1
    return urls[0:numUrls]


def scraper(url):
    tree = scrape.getHTML(url)
    paragraphs = tree.xpath('//div[@class="blog-post__text"]/p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls(500)
    scrape.writeArticleFile('./RealNews/TheEconomistArticles', urls, sys.modules[__name__])
# One America News (OANN) scraper (Real News set)
import sys

import scrape


def getUrls(numUrls):
    urls = []
    page_num = 2
    while len(urls) < numUrls:
        tree = scrape.getHTML('http://www.oann.com/category/politics/page/' + str(page_num))
        extensions = tree.xpath(
            '//div[@id="main-content"]/article/header/h3/a/@href')
        urls.extend(extensions)
        page_num += 1
        if len(urls) % 99 == 0:
            print len(urls)  # progress output
    return urls[0:numUrls]


def scraper(url):
    tree = scrape.getHTML(url)
    paragraphs = tree.xpath('//div[@class="entry-content clearfix"]/p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        encoded_par = scrape.encodeParagraph(paragraph.text_content()).strip()
        # Skip paragraphs that open with '(' (e.g. wire-service attributions).
        if not encoded_par.startswith('('):
            paragraphsUTF.append(encoded_par)
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls(500)
    scrape.writeArticleFile('./RealNews/OANNArticles', urls, sys.modules[__name__])
# Washington Times scraper (Real News set) -- the opening of getUrls() is not shown
        if page_num == 1:
            # The first listing page also carries a featured-articles block.
            extensions.extend(
                tree.xpath(
                    '//div[@class="block article-list featured-articles"]/article/h2/a/@href'))
        print extensions  # debug output
        if extensions == []:
            break
        urls.extend([base_path + extension for extension in extensions])
        page_num += 1
    return urls[0:numUrls]


def scraper(url):
    tree = scrape.getHTML(url)
    paragraphs = tree.xpath(
        '//div[@class="storyareawrapper"]/div[@class="bigtext"]/p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls(500)
    scrape.writeArticleFile('./RealNews/WashingtonTimesArticles', urls, sys.modules[__name__])
# The Atlantic scraper (Real News set)
import sys

import scrape


def getUrls():
    urls = []
    for i in range(2, 35):
        tree = scrape.getHTML('https://www.theatlantic.com/politics/?page=' + str(i))
        urls.extend(
            tree.xpath(
                '//div[@class="river-body"]/ul[@class="river"]/li[@class="article blog-article "]/a/@href'))
    # Keep only article links, dropping section and tag pages.
    urls = [x for x in urls if 'politics/archive/' in x]
    return urls[0:500]


def scraper(url):
    tree = scrape.getHTML('https://www.theatlantic.com/' + url)
    paragraphs = tree.xpath(
        '//div[@class="article-body"]/section/p'
        '|//div[@class="article-body"]/section/blockquote')
    paragraphsUTF = []
    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls()
    print len(urls), len(set(urls))  # sanity check for duplicate links
    scrape.writeArticleFile('./RealNews/TheAtlanticArticles', urls, sys.modules[__name__])
# The Guardian scraper (Real News set) -- the start of getUrls(), including the listing-page URL, is not shown
                               str(counter))
        newUrls = tree.xpath(
            '//div[@class="u-cf index-page"]/section//a/@href')
        for url in newUrls:
            # Keep only 2017 US-news article links.
            if '/us-news/2017/' in url:
                urls.append(url)
        counter += 1
    urls = list(set(urls))  # de-duplicate
    return urls[0:numUrls]


def scraper(url):
    tree = scrape.getHTML(url)
    paragraphs = tree.xpath(
        '//div[@class="content__article-body from-content-api js-article__body"]/p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls(500)
    print 'LENGTH', len(urls)
    scrape.writeArticleFile('./RealNews/TheGuardianArticles', urls, sys.modules[__name__])
# Counter Current News scraper (Fake News set)
import sys

import scrape


def getUrls():
    urls = []
    # 15 listing pages, 15 articles per page
    for i in range(1, 16):
        tree = scrape.getHTML('http://countercurrentnews.com/category/corruption/page/' + str(i) + '/')
        urls.extend(tree.xpath('//article/div[@class="post-thumb"]/a/@href'))
    return urls


def scraper(url):
    tree = scrape.getHTML(url)
    # Body paragraphs (skipping any that contain a <script>) plus paragraphs inside block quotes.
    paragraphs = tree.xpath(
        '//article/div[@class="entry-content"]/p[not(descendant::script)]'
        '|//article/div[@class="entry-content"]/blockquote/p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        paragraph = scrape.encodeParagraph(paragraph.text_content())
        # Stop at trailing attribution lines such as "(Article by ...".
        if paragraph.startswith(("(Article by", "(Article By", "(Article From", "(from", "Article by")):
            break
        paragraphsUTF.append(paragraph)
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls()
    scrape.writeArticleFile('./FakeNews/CounterCurrentNewsArticles', urls, sys.modules[__name__])
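# A note on the article XPath used above: the `|` union pulls in both plain body
# paragraphs and paragraphs nested inside block quotes with a single query, and the
# [not(descendant::script)] predicate drops any paragraph that contains an embedded
# <script>. A self-contained illustration (the markup is invented for the example):
from lxml import html

snippet = html.fromstring(
    '<article><div class="entry-content">'
    '<p>one</p>'
    '<p><script>var ad = 1;</script></p>'
    '<blockquote><p>two</p></blockquote>'
    '<p>three</p>'
    '</div></article>')
nodes = snippet.xpath(
    '//article/div[@class="entry-content"]/p[not(descendant::script)]'
    '|//article/div[@class="entry-content"]/blockquote/p')
texts = [n.text_content() for n in nodes]
# The three real paragraphs are selected; the <script>-only paragraph is not.
print sorted(texts)            # ['one', 'three', 'two']
print 'var ad = 1;' in texts   # False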
# News Examiner scraper
import sys

import scrape


def getUrls():
    tree = scrape.getHTML(
        'http://www.newsexaminer.com/news/election_politics/')
    urls = tree.xpath(
        '//div[@id="tncms-region-index-primary"]//div[@class="card-headline"]//a/@href')
    return urls


def scraper(url):
    tree = scrape.getHTML('http://www.newsexaminer.com' + url)
    paragraphs = tree.xpath(
        '//div[@class="asset-content subscriber-premium"]//p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        # Repair text that was decoded as Latin-1 but is really UTF-8, then re-encode as UTF-8 bytes.
        paragraph = paragraph.text_content().encode('latin1').decode(
            'utf-8').encode('utf-8')
        # Extra content at the end of some articles that we don't want: a '---' separator,
        # or a paragraph made up entirely of three-byte punctuation (dashes, dots, quotes).
        if paragraph == '---' or 3 * paragraph.count('\xe2') == len(paragraph):
            break
        paragraphsUTF.append(paragraph)
    return paragraphsUTF


if __name__ == "__main__":
    urls = getUrls()
    scrape.writeArticleFile('NewsExaminerArticles', urls, sys.modules[__name__])
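# The encode('latin1').decode('utf-8') chain in scraper() above is the standard repair
# for mojibake: text that is really UTF-8 but was decoded as Latin-1 somewhere upstream.
# Round-tripping through Latin-1 recovers the original bytes so they can be decoded
# correctly. A small stand-alone demonstration (the sample string is invented):
garbled = u'\xe2\x80\x9cquote\xe2\x80\x9d'             # UTF-8 bytes misread as Latin-1
repaired = garbled.encode('latin1').decode('utf-8')    # back to bytes, decode properly
print repr(repaired)                                   # u'\u201cquote\u201d' (curly quotes)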