def getUrls():
    """Collect article URLs from the News Examiner politics index page."""
    page = scrape.getHTML(
        'http://www.newsexaminer.com/news/election_politics/')
    return page.xpath(
        '//div[@id="tncms-region-index-primary"]//div[@class="card-headline"]//a/@href'
    )
def getUrls():
    """Gather article links from the Counter Current News corruption category.

    Walks pages 1-15 (the site lists 15 articles per page).
    """
    collected = []
    page_number = 1
    while page_number < 16:
        page = scrape.getHTML(
            'http://countercurrentnews.com/category/corruption/page/'
            + str(page_number) + '/')
        collected += page.xpath('//article/div[@class="post-thumb"]/a/@href')
        page_number += 1
    return collected
def getUrls():
    """Collect up to 500 article links from Breitbart's big-government pages.

    Pages 1-18 are fetched (30 articles per page); the result is capped
    at the first 500 links.
    """
    found = []
    for page_number in range(1, 19):
        page = scrape.getHTML(
            'http://www.breitbart.com/big-government/page/'
            + str(page_number) + '/')
        found += page.xpath('//div[@class="article-list"]/article/a/@href')
    return found[:500]
def scraper(url):
    """Return the encoded paragraph texts of one article page.

    NOTE: the class selector deliberately contains a trailing space
    ("story-text ") to match the site's markup exactly.
    """
    page = scrape.getHTML(url)
    nodes = page.xpath('//div[@class="story-text "]/p')
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Scrape a Reuters article (site-relative *url*), dropping the last <p>.

    The final paragraph is excluded because it is not article body text.
    """
    page = scrape.getHTML('http://www.reuters.com' + url)
    nodes = page.xpath('//span[@id="article-text"]//p')[:-1]
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Scrape article paragraphs, skipping parenthesised attribution lines."""
    page = scrape.getHTML(url)
    result = []
    for node in page.xpath('//div[@class="entry-content clearfix"]/p'):
        text = scrape.encodeParagraph(node.text_content()).strip()
        # Lines like "(Reuters)" are credits, not body text -- skip them.
        if text.startswith('('):
            continue
        result.append(text)
    return result
def getUrls():
    """Collect up to 500 article URLs from Politico's white-house feed.

    Pages 1-56 are fetched (9 articles per page).
    """
    links = []
    page_number = 1
    while page_number <= 56:
        page = scrape.getHTML(
            'http://www.politico.com/white-house/' + str(page_number))
        links.extend(page.xpath(
            '//div[@class="content-group tag-latest"]/ul/li/article/figure/div/a/@href'
        ))
        page_number += 1
    return links[:500]
def scraper(url):
    """Scrape body paragraphs and blockquotes from an Atlantic article."""
    page = scrape.getHTML('https://www.theatlantic.com/' + url)
    nodes = page.xpath(
        '//div[@class="article-body"]/section/p'
        '|//div[@class="article-body"]/section/blockquote'
    )
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Scrape non-empty paragraphs from a schema.org articleBody div."""
    page = scrape.getHTML(url)
    texts = (scrape.encodeParagraph(node.text_content()).strip()
             for node in page.xpath('//div[@itemprop="articleBody"]/p'))
    # Drop paragraphs that are empty after stripping whitespace.
    return [text for text in texts if text]
def scraper(url):
    """Scrape paragraphs from a content-api style article body."""
    page = scrape.getHTML(url)
    selector = ('//div[@class="content__article-body '
                'from-content-api js-article__body"]/p')
    return [scrape.encodeParagraph(node.text_content())
            for node in page.xpath(selector)]
def scraper(url):
    """Scrape paragraphs and blockquotes from an entry-content article."""
    page = scrape.getHTML(url)
    nodes = page.xpath(
        '//article/div[@class="entry-content"]/p'
        '|//article/div[@class="entry-content"]/blockquote'
    )
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def getUrls(numUrls):
    """Collect up to *numUrls* article links from OANN's politics category.

    Pages are fetched starting at page 2 until enough links are gathered.
    Fix: stop when a page yields no links -- previously the loop kept
    requesting ever-higher page numbers forever once the archive ended
    (the sibling getUrls scrapers already guard against this).
    """
    urls = []
    page_num = 2
    while len(urls) < numUrls:
        tree = scrape.getHTML(
            'http://www.oann.com/category/politics/page/' + str(page_num))
        extensions = tree.xpath(
            '//div[@id="main-content"]/article/header/h3/a/@href')
        if not extensions:
            # Archive exhausted -- bail out instead of looping forever.
            break
        urls.extend(extensions)
        page_num += 1
        # Progress logging roughly every 99 links collected.
        if len(urls) % 99 == 0:
            print(len(urls))
    return urls[0:numUrls]
def getUrls():
    """Collect article URLs from the Reuters politics archive.

    Pages 1-49 are fetched (10 articles per page).
    """
    links = []
    for page_number in range(1, 50):
        page = scrape.getHTML(
            'http://www.reuters.com/news/archive/politicsNews?view=page&page='
            + str(page_number) + '&pageSize=10')
        links.extend(page.xpath(
            '//div[@class="news-headline-list medium"]/article/div[@class="story-content"]/a/@href'
        ))
    return links
def getUrls(numUrls):
    """Collect up to *numUrls* US-section article URLs from The Economist.

    Stops early if a page returns no teaser links.
    """
    base = 'http://www.economist.com/'
    collected = []
    page = 1
    while len(collected) < numUrls:
        tree = scrape.getHTML(
            'http://www.economist.com/sections/united-states?page='
            + str(page))
        found = tree.xpath('//div[@class="teaser-list"]/article/a/@href')
        if not found:
            break
        for extension in found:
            collected.append(base + extension)
        page += 1
    return collected[:numUrls]
def getUrls():
    """Collect up to 500 politics-archive article URLs from The Atlantic.

    Pages 2-34 are fetched; only links under politics/archive/ are kept.
    NOTE: the <li> class selector intentionally ends with a space to
    match the site's markup.
    """
    links = []
    for page_number in range(2, 35):
        tree = scrape.getHTML(
            'https://www.theatlantic.com/politics/?page=' + str(page_number))
        links.extend(tree.xpath(
            '//div[@class="river-body"]/ul[@class="river"]/li[@class="article blog-article "]/a/@href'
        ))
    filtered = [link for link in links if 'politics/archive/' in link]
    return filtered[:500]
def scraper(url):
    """Scrape entry-content paragraphs, stopping at attribution footers.

    Script-bearing <p> nodes are excluded by the XPath; scraping halts
    entirely at the first paragraph that opens with an "(Article by ..."
    style credit line.
    """
    attribution_prefixes = ("(Article by", "(Article By", "(Article From",
                            "(from", "Article by")
    page = scrape.getHTML(url)
    nodes = page.xpath(
        '//article/div[@class="entry-content"]/p[not(descendant::script)]'
        '|//article/div[@class="entry-content"]/blockquote/p'
    )
    collected = []
    for node in nodes:
        text = scrape.encodeParagraph(node.text_content())
        if text.startswith(attribution_prefixes):
            break
        collected.append(text)
    return collected
def getUrls(numUrls):
    """Collect up to *numUrls* archive URLs from National Review.

    Pagination starts at page 0; stops early when a page has no links.
    NOTE: the timeline class selector intentionally begins with a space.
    """
    root = 'http://www.nationalreview.com/'
    collected = []
    page = 0
    while len(collected) < numUrls:
        tree = scrape.getHTML(
            'http://www.nationalreview.com/archives?page=' + str(page))
        found = tree.xpath('//div[@class=" timeline blog cf p-r"]/ul//a/@href')
        if not found:
            break
        collected.extend(root + extension for extension in found)
        page += 1
    return collected[:numUrls]
def getUrls(numUrls):
    """Collect up to *numUrls* 2017 US-politics article URLs from The Guardian.

    Fixes over the previous version:
    * de-duplicates in discovery order while collecting -- the old
      ``list(set(urls))`` after the loop produced a nondeterministic
      ordering and could shrink the result below *numUrls* after the
      loop had already stopped;
    * stops when a page returns no links, which previously caused an
      infinite request loop once the archive was exhausted.
    """
    urls = []
    seen = set()
    counter = 0
    while len(urls) < numUrls:
        tree = scrape.getHTML(
            'https://www.theguardian.com/us-news/us-politics?page='
            + str(counter))
        newUrls = tree.xpath(
            '//div[@class="u-cf index-page"]/section//a/@href')
        if not newUrls:
            # Archive exhausted -- avoid looping forever.
            break
        for url in newUrls:
            # Keep only 2017 US-news links, each at most once.
            if '/us-news/2017/' in url and url not in seen:
                seen.add(url)
                urls.append(url)
        counter += 1
    return urls[0:numUrls]
def scraper(url):
    """Scrape paragraphs from a News Examiner article page.

    *url* is a site-relative path appended to the domain. Returns a list
    of UTF-8 encoded paragraph byte strings (Python 2 ``str``).
    """
    tree = scrape.getHTML('http://www.newsexaminer.com' + url)
    paragraphs = tree.xpath(
        '//div[@class="asset-content subscriber-premium"]//p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        # Repair mojibake: the page's UTF-8 bytes appear to have been
        # mis-decoded as latin-1 upstream, so re-encode as latin-1 to
        # recover the raw bytes, decode those as UTF-8, then encode back
        # to UTF-8 bytes. NOTE(review): relies on Python 2 str/unicode
        # semantics -- confirm before porting to Python 3.
        paragraph = paragraph.text_content().encode('latin1').decode(
            'utf-8').encode('utf-8')
        #extra content at the end of some articles that we don't want
        # Stop at a '---' separator, or when the paragraph consists
        # entirely of 3-byte UTF-8 sequences starting with '\xe2'
        # (3 * count == len) -- presumably decorative punctuation;
        # TODO confirm against live pages.
        if paragraph == '---' or 3 * paragraph.count('\xe2') == len(paragraph):
            break
        paragraphsUTF.append(paragraph)
    return paragraphsUTF
def getUrls(numUrls):
    """Collect up to *numUrls* politics article URLs from the Washington Times.

    Page 1 additionally contributes the "featured articles" block. The
    loop stops when a page yields no article links.
    Fix: removed a leftover debug ``print`` that dumped the raw extension
    list on every page fetch.
    """
    urls = []
    base_path = 'http://www.washingtontimes.com/'
    page_num = 1
    while len(urls) < numUrls:
        tree = scrape.getHTML(
            'http://www.washingtontimes.com/news/politics/?page='
            + str(page_num))
        extensions = tree.xpath(
            '//section[@class="block article-list related-articles"]/article/h2/a/@href'
        )
        if page_num == 1:
            # The first page also carries a featured-articles block.
            extensions.extend(tree.xpath(
                '//div[@class="block article-list featured-articles"]/article/h2/a/@href'
            ))
        if extensions == []:
            break
        urls.extend([base_path + extension for extension in extensions])
        page_num += 1
    return urls[0:numUrls]