def scraper(url):
    """Return the encoded text of every story paragraph on the page at ``url``.

    The page is fetched with ``scrape.getHTML`` and each ``<p>`` that is a
    direct child of a ``<div class="story-text ">`` is collected, in the
    order the XPath query returns them.

    Args:
        url: Address passed unchanged to ``scrape.getHTML``.

    Returns:
        A list holding one ``scrape.encodeParagraph`` result per matched
        paragraph (empty when nothing matches).
    """
    page = scrape.getHTML(url)
    # The class value, including its trailing space, is matched exactly.
    nodes = page.xpath('//div[@class="story-text "]/p')
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Return the encoded article paragraphs of a reuters.com page.

    Args:
        url: Path portion of the article address; it is appended verbatim to
            ``http://www.reuters.com`` before fetching.

    Returns:
        A list holding one ``scrape.encodeParagraph`` result for every
        ``<p>`` found anywhere below ``<span id="article-text">``, except the
        last match, which is always dropped.
    """
    full_url = 'http://www.reuters.com' + url
    page = scrape.getHTML(full_url)
    nodes = page.xpath('//span[@id="article-text"]//p')
    # The final matched paragraph is deliberately excluded from the output.
    return [scrape.encodeParagraph(node.text_content()) for node in nodes[:-1]]
def scraper(url):
    """Return the non-empty, stripped article paragraphs of the page at ``url``.

    Every ``<p>`` that is a direct child of a ``<div itemprop="articleBody">``
    is passed through ``scrape.encodeParagraph`` and then ``.strip()``;
    paragraphs whose stripped text has length zero are left out.

    Args:
        url: Address passed unchanged to ``scrape.getHTML``.

    Returns:
        A list of the stripped, encoded paragraph texts that are not empty.
    """
    page = scrape.getHTML(url)
    nodes = page.xpath('//div[@itemprop="articleBody"]/p')
    # Lazily encode + strip each paragraph, then keep only those with content.
    cleaned = (
        scrape.encodeParagraph(node.text_content()).strip() for node in nodes
    )
    return [text for text in cleaned if len(text) > 0]
def scraper(url):
    """Return the stripped entry paragraphs of the page at ``url``.

    Every ``<p>`` that is a direct child of a
    ``<div class="entry-content clearfix">`` is passed through
    ``scrape.encodeParagraph`` and then ``.strip()``. Paragraphs whose
    stripped text begins with ``(`` are skipped; all others are kept,
    including empty ones.

    Args:
        url: Address passed unchanged to ``scrape.getHTML``.

    Returns:
        A list of the retained paragraph texts, in query order.
    """
    page = scrape.getHTML(url)
    collected = []
    for node in page.xpath('//div[@class="entry-content clearfix"]/p'):
        text = scrape.encodeParagraph(node.text_content()).strip()
        if text.startswith('('):
            # Parenthesised paragraphs are not part of the returned text.
            continue
        collected.append(text)
    return collected
def scraper(url):
    """Return the encoded body paragraphs and quotes of a theatlantic.com page.

    Args:
        url: Path portion of the article address; it is appended verbatim to
            ``https://www.theatlantic.com/`` before fetching.

    Returns:
        A list holding one ``scrape.encodeParagraph`` result for every
        ``<p>`` or ``<blockquote>`` that is a direct child of a ``<section>``
        directly inside ``<div class="article-body">``.
    """
    full_url = 'https://www.theatlantic.com/' + url
    page = scrape.getHTML(full_url)
    # A single union query selects both paragraphs and block quotes.
    selector = '//div[@class="article-body"]/section/p|//div[@class="article-body"]/section/blockquote'
    return [
        scrape.encodeParagraph(node.text_content())
        for node in page.xpath(selector)
    ]
def scraper(url):
    """Return the encoded article-body paragraphs of the page at ``url``.

    Only ``<p>`` elements that are direct children of a ``<div>`` whose class
    attribute is exactly
    ``content__article-body from-content-api js-article__body`` are used.

    Args:
        url: Address passed unchanged to ``scrape.getHTML``.

    Returns:
        A list holding one ``scrape.encodeParagraph`` result per matched
        paragraph (empty when nothing matches).
    """
    page = scrape.getHTML(url)
    selector = '//div[@class="content__article-body from-content-api js-article__body"]/p'
    return [
        scrape.encodeParagraph(node.text_content())
        for node in page.xpath(selector)
    ]
def scraper(url):
    """Return the encoded entry paragraphs and quotes of the page at ``url``.

    Matches every ``<p>`` and ``<blockquote>`` that is a direct child of a
    ``<div class="entry-content">`` directly inside an ``<article>``.

    Args:
        url: Address passed unchanged to ``scrape.getHTML``.

    Returns:
        A list holding one ``scrape.encodeParagraph`` result per matched
        element, in the order the XPath query returns them.
    """
    page = scrape.getHTML(url)
    selector = '//article/div[@class="entry-content"]/p|//article/div[@class="entry-content"]/blockquote'
    matches = page.xpath(selector)
    return [scrape.encodeParagraph(match.text_content()) for match in matches]
def scraper(url):
    """Return the encoded entry paragraphs that precede the attribution line.

    Matches, inside ``<article>/<div class="entry-content">``, every direct
    child ``<p>`` that has no ``<script>`` descendant and every ``<p>`` that
    is a direct child of a ``<blockquote>``. Collection stops at the first
    paragraph whose encoded text starts with one of the attribution markers;
    that paragraph and everything after it are discarded.

    Args:
        url: Address passed unchanged to ``scrape.getHTML``.

    Returns:
        A list of ``scrape.encodeParagraph`` results for the paragraphs seen
        before the first attribution marker (all of them if none is found).
    """
    # Prefixes that mark the start of the source-attribution footer.
    attribution_prefixes = ("(Article by", "(Article By", "(Article From", "(from", "Article by")
    page = scrape.getHTML(url)
    selector = (
        '//article/div[@class="entry-content"]/p[not(descendant::script)]'
        '|//article/div[@class="entry-content"]/blockquote/p'
    )
    collected = []
    for node in page.xpath(selector):
        text = scrape.encodeParagraph(node.text_content())
        if text.startswith(attribution_prefixes):
            # Nothing from the attribution onward belongs to the article.
            return collected
        collected.append(text)
    return collected