Code Example #1
import scrape  # project-local helper module providing getHTML() and encodeParagraph()


def scraper(url):
    # Fetch the page and parse it into an HTML element tree.
    tree = scrape.getHTML(url)
    # The @class comparison is exact, so the trailing space in "story-text " is part of the match.
    paragraphs = tree.xpath('//div[@class="story-text "]/p')
    paragraphsUTF = []

    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))

    return paragraphsUTF
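Every example on this page leans on a project-local scrape helper module whose source is not shown here. Its real implementation is unknown; the following is only a minimal sketch of what getHTML and encodeParagraph might look like, assuming requests and lxml.html, so the scrapers above can be read in context.

# scrape.py -- hypothetical sketch of the helper module the examples import.
# The actual cs216project module may differ; this only mirrors the two calls
# the scraper functions make.
import requests
import lxml.html


def getHTML(url):
    # Fetch the page and return an lxml element, which supports the
    # .xpath() and .text_content() calls used by the scrapers.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return lxml.html.fromstring(response.text)


def encodeParagraph(text):
    # Guess at the intent behind "paragraphsUTF": coerce the paragraph to
    # clean UTF-8 text, dropping any characters that cannot be encoded.
    return text.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')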
Code Example #2
import scrape


def scraper(url):
    # Article links are site-relative, so prepend the Reuters root.
    tree = scrape.getHTML('http://www.reuters.com' + url)
    # Take every paragraph in the article-text span, dropping the last one.
    paragraphs = tree.xpath('//span[@id="article-text"]//p')[0:-1]
    paragraphsUTF = []

    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))

    return paragraphsUTF
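The concatenation 'http://www.reuters.com' + url assumes every incoming url is a site-relative path. If some scraped links turn out to be absolute or root-relative (an assumption, not something the example shows), urllib.parse.urljoin builds the request URL more robustly; the article path below is invented for illustration.

from urllib.parse import urljoin

BASE = 'http://www.reuters.com'

# urljoin resolves relative paths, root-relative paths, and already-absolute
# links alike, so all three of these print the same article URL.
print(urljoin(BASE, 'article/us-example-story'))
print(urljoin(BASE, '/article/us-example-story'))
print(urljoin(BASE, 'http://www.reuters.com/article/us-example-story'))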
Code Example #3
import scrape


def scraper(url):
    tree = scrape.getHTML(url)
    # Paragraphs directly inside the element marked itemprop="articleBody".
    paragraphs = tree.xpath('//div[@itemprop="articleBody"]/p')
    paragraphsUTF = []

    for paragraph in paragraphs:
        # Strip whitespace and keep only non-empty paragraphs.
        stripped_par = scrape.encodeParagraph(paragraph.text_content()).strip()
        if len(stripped_par) > 0:
            paragraphsUTF.append(stripped_par)

    return paragraphsUTF
Code Example #4
File: OANNScrape.py  Project: jasoncode/cs216project
import scrape


def scraper(url):
    tree = scrape.getHTML(url)
    paragraphs = tree.xpath('//div[@class="entry-content clearfix"]/p')
    paragraphsUTF = []

    for paragraph in paragraphs:
        encoded_par = scrape.encodeParagraph(paragraph.text_content()).strip()
        # Skip any paragraph that opens with "(", e.g. parenthesised credit lines.
        if not encoded_par.startswith('('):
            paragraphsUTF.append(encoded_par)

    return paragraphsUTF
Code Example #5
import scrape


def scraper(url):
    # Links are relative, so prepend The Atlantic's site root.
    tree = scrape.getHTML('https://www.theatlantic.com/' + url)
    # Collect both paragraphs and blockquotes from the article body, in document order.
    paragraphs = tree.xpath(
        '//div[@class="article-body"]/section/p|//div[@class="article-body"]/section/blockquote'
    )
    paragraphsUTF = []

    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))

    return paragraphsUTF
Code Example #6
import scrape


def scraper(url):
    tree = scrape.getHTML(url)
    # Paragraphs directly inside the content__article-body container.
    paragraphs = tree.xpath(
        '//div[@class="content__article-body from-content-api js-article__body"]/p'
    )
    paragraphsUTF = []

    for paragraph in paragraphs:
        paragraphsUTF.append(scrape.encodeParagraph(paragraph.text_content()))

    return paragraphsUTF
Code Example #7
import scrape


def scraper(url):
    tree = scrape.getHTML(url)
    # Collect both paragraphs and blockquotes from the entry-content div.
    paragraphs = tree.xpath(
        '//article/div[@class="entry-content"]/p|//article/div[@class="entry-content"]/blockquote'
    )
    paragraphsUTF = []

    for paragraph in paragraphs:
        paragraph = scrape.encodeParagraph(paragraph.text_content())
        paragraphsUTF.append(paragraph)

    return paragraphsUTF
Code Example #8
import scrape


def scraper(url):
    tree = scrape.getHTML(url)
    # Take entry-content paragraphs that contain no <script>, plus paragraphs
    # nested inside blockquotes.
    paragraphs = tree.xpath(
        '//article/div[@class="entry-content"]/p[not(descendant::script)]'
        '|//article/div[@class="entry-content"]/blockquote/p'
    )
    paragraphsUTF = []

    for paragraph in paragraphs:
        paragraph = scrape.encodeParagraph(paragraph.text_content())
        # Stop at the first attribution line such as "(Article by ..."; nothing after it is kept.
        if paragraph.startswith(("(Article by", "(Article By", "(Article From", "(from", "Article by")):
            break

        paragraphsUTF.append(paragraph)

    return paragraphsUTF
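In every example the return value is just a list of cleaned paragraph strings, so a driver script only needs to pick a scraper and iterate over its output. A hypothetical invocation of the scraper above (the article URL is invented for illustration) might look like this:

# Hypothetical driver; the URL is an example, not a real link from the project.
article_url = 'http://example.com/2016/04/some-article/'

for number, paragraph in enumerate(scraper(article_url), start=1):
    print(number, paragraph)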