def getUrls():
    """Collect article URLs from the News Examiner politics index page."""
    page = scrape.getHTML(
        'http://www.newsexaminer.com/news/election_politics/')
    return page.xpath(
        '//div[@id="tncms-region-index-primary"]//div[@class="card-headline"]//a/@href'
    )
def getUrls():
    """Gather article links from the Counter Current News corruption category.

    Walks pages 1-15 (the site lists 15 articles per page).
    """
    collected = []
    page_number = 1
    while page_number < 16:
        page = scrape.getHTML(
            'http://countercurrentnews.com/category/corruption/page/'
            + str(page_number) + '/')
        collected += page.xpath('//article/div[@class="post-thumb"]/a/@href')
        page_number += 1
    return collected
def getUrls():
    """Collect up to 500 article links from Breitbart's big-government pages.

    Pages 1-18 are fetched (30 articles per page); the result is capped
    at the first 500 links.
    """
    found = []
    for page_number in range(1, 19):
        page = scrape.getHTML(
            'http://www.breitbart.com/big-government/page/'
            + str(page_number) + '/')
        found += page.xpath('//div[@class="article-list"]/article/a/@href')
    return found[:500]
def scraper(url):
    """Return the encoded paragraph texts of one article page.

    NOTE: the class selector deliberately contains a trailing space
    ("story-text ") to match the site's markup exactly.
    """
    page = scrape.getHTML(url)
    nodes = page.xpath('//div[@class="story-text "]/p')
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Scrape a Reuters article (site-relative *url*), dropping the last <p>.

    The final paragraph is excluded because it is not article body text.
    """
    page = scrape.getHTML('http://www.reuters.com' + url)
    nodes = page.xpath('//span[@id="article-text"]//p')[:-1]
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Scrape article paragraphs, skipping parenthesised attribution lines."""
    page = scrape.getHTML(url)
    result = []
    for node in page.xpath('//div[@class="entry-content clearfix"]/p'):
        text = scrape.encodeParagraph(node.text_content()).strip()
        # Lines like "(Reuters)" are credits, not body text -- skip them.
        if text.startswith('('):
            continue
        result.append(text)
    return result
def getUrls():
    """Collect up to 500 article URLs from Politico's white-house feed.

    Pages 1-56 are fetched (9 articles per page).
    """
    links = []
    page_number = 1
    while page_number <= 56:
        page = scrape.getHTML(
            'http://www.politico.com/white-house/' + str(page_number))
        links.extend(page.xpath(
            '//div[@class="content-group tag-latest"]/ul/li/article/figure/div/a/@href'
        ))
        page_number += 1
    return links[:500]
def scraper(url):
    """Scrape body paragraphs and blockquotes from an Atlantic article."""
    page = scrape.getHTML('https://www.theatlantic.com/' + url)
    nodes = page.xpath(
        '//div[@class="article-body"]/section/p'
        '|//div[@class="article-body"]/section/blockquote'
    )
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def scraper(url):
    """Scrape non-empty paragraphs from a schema.org articleBody div."""
    page = scrape.getHTML(url)
    texts = (scrape.encodeParagraph(node.text_content()).strip()
             for node in page.xpath('//div[@itemprop="articleBody"]/p'))
    # Drop paragraphs that are empty after stripping whitespace.
    return [text for text in texts if text]
def scraper(url):
    """Scrape paragraphs from a content-api style article body."""
    page = scrape.getHTML(url)
    selector = ('//div[@class="content__article-body '
                'from-content-api js-article__body"]/p')
    return [scrape.encodeParagraph(node.text_content())
            for node in page.xpath(selector)]
def scraper(url):
    """Scrape paragraphs and blockquotes from an entry-content article."""
    page = scrape.getHTML(url)
    nodes = page.xpath(
        '//article/div[@class="entry-content"]/p'
        '|//article/div[@class="entry-content"]/blockquote'
    )
    return [scrape.encodeParagraph(node.text_content()) for node in nodes]
def getUrls(numUrls):
    """Collect up to *numUrls* article links from OANN's politics category.

    Pages are fetched starting at page 2 until enough links are gathered.
    Fix: stop when a page yields no links -- previously the loop kept
    requesting ever-higher page numbers forever once the archive ended
    (the sibling getUrls scrapers already guard against this).
    """
    urls = []
    page_num = 2
    while len(urls) < numUrls:
        tree = scrape.getHTML(
            'http://www.oann.com/category/politics/page/' + str(page_num))
        extensions = tree.xpath(
            '//div[@id="main-content"]/article/header/h3/a/@href')
        if not extensions:
            # Archive exhausted -- bail out instead of looping forever.
            break
        urls.extend(extensions)
        page_num += 1
        # Progress logging roughly every 99 links collected.
        if len(urls) % 99 == 0:
            print(len(urls))
    return urls[0:numUrls]
def getUrls():
    """Collect article URLs from the Reuters politics archive.

    Pages 1-49 are fetched (10 articles per page).
    """
    links = []
    for page_number in range(1, 50):
        page = scrape.getHTML(
            'http://www.reuters.com/news/archive/politicsNews?view=page&page='
            + str(page_number) + '&pageSize=10')
        links.extend(page.xpath(
            '//div[@class="news-headline-list medium"]/article/div[@class="story-content"]/a/@href'
        ))
    return links
def getUrls(numUrls):
    """Collect up to *numUrls* US-section article URLs from The Economist.

    Stops early if a page returns no teaser links.
    """
    base = 'http://www.economist.com/'
    collected = []
    page = 1
    while len(collected) < numUrls:
        tree = scrape.getHTML(
            'http://www.economist.com/sections/united-states?page='
            + str(page))
        found = tree.xpath('//div[@class="teaser-list"]/article/a/@href')
        if not found:
            break
        for extension in found:
            collected.append(base + extension)
        page += 1
    return collected[:numUrls]
def getUrls():
    """Collect up to 500 politics-archive article URLs from The Atlantic.

    Pages 2-34 are fetched; only links under politics/archive/ are kept.
    NOTE: the <li> class selector intentionally ends with a space to
    match the site's markup.
    """
    links = []
    for page_number in range(2, 35):
        tree = scrape.getHTML(
            'https://www.theatlantic.com/politics/?page=' + str(page_number))
        links.extend(tree.xpath(
            '//div[@class="river-body"]/ul[@class="river"]/li[@class="article blog-article "]/a/@href'
        ))
    filtered = [link for link in links if 'politics/archive/' in link]
    return filtered[:500]
def scraper(url):
    """Scrape entry-content paragraphs, stopping at attribution footers.

    Script-bearing <p> nodes are excluded by the XPath; scraping halts
    entirely at the first paragraph that opens with an "(Article by ..."
    style credit line.
    """
    attribution_prefixes = ("(Article by", "(Article By", "(Article From",
                            "(from", "Article by")
    page = scrape.getHTML(url)
    nodes = page.xpath(
        '//article/div[@class="entry-content"]/p[not(descendant::script)]'
        '|//article/div[@class="entry-content"]/blockquote/p'
    )
    collected = []
    for node in nodes:
        text = scrape.encodeParagraph(node.text_content())
        if text.startswith(attribution_prefixes):
            break
        collected.append(text)
    return collected
def getUrls(numUrls):
    """Collect up to *numUrls* archive URLs from National Review.

    Pagination starts at page 0; stops early when a page has no links.
    NOTE: the timeline class selector intentionally begins with a space.
    """
    root = 'http://www.nationalreview.com/'
    collected = []
    page = 0
    while len(collected) < numUrls:
        tree = scrape.getHTML(
            'http://www.nationalreview.com/archives?page=' + str(page))
        found = tree.xpath('//div[@class=" timeline blog cf p-r"]/ul//a/@href')
        if not found:
            break
        collected.extend(root + extension for extension in found)
        page += 1
    return collected[:numUrls]
def getUrls(numUrls):
    """Collect up to *numUrls* 2017 US-politics article URLs from The Guardian.

    Fixes over the previous version:
    * de-duplicates in discovery order while collecting -- the old
      ``list(set(urls))`` after the loop produced a nondeterministic
      ordering and could shrink the result below *numUrls* after the
      loop had already stopped;
    * stops when a page returns no links, which previously caused an
      infinite request loop once the archive was exhausted.
    """
    urls = []
    seen = set()
    counter = 0
    while len(urls) < numUrls:
        tree = scrape.getHTML(
            'https://www.theguardian.com/us-news/us-politics?page='
            + str(counter))
        newUrls = tree.xpath(
            '//div[@class="u-cf index-page"]/section//a/@href')
        if not newUrls:
            # Archive exhausted -- avoid looping forever.
            break
        for url in newUrls:
            # Keep only 2017 US-news links, each at most once.
            if '/us-news/2017/' in url and url not in seen:
                seen.add(url)
                urls.append(url)
        counter += 1
    return urls[0:numUrls]
def scraper(url):
    """Scrape paragraphs from a News Examiner article page.

    *url* is a site-relative path appended to the domain. Returns a list
    of UTF-8 encoded paragraph byte strings (Python 2 ``str``).
    """
    tree = scrape.getHTML('http://www.newsexaminer.com' + url)
    paragraphs = tree.xpath(
        '//div[@class="asset-content subscriber-premium"]//p')
    paragraphsUTF = []
    for paragraph in paragraphs:
        # Repair mojibake: the page's UTF-8 bytes appear to have been
        # mis-decoded as latin-1 upstream, so re-encode as latin-1 to
        # recover the raw bytes, decode those as UTF-8, then encode back
        # to UTF-8 bytes. NOTE(review): relies on Python 2 str/unicode
        # semantics -- confirm before porting to Python 3.
        paragraph = paragraph.text_content().encode('latin1').decode(
            'utf-8').encode('utf-8')
        #extra content at the end of some articles that we don't want
        # Stop at a '---' separator, or when the paragraph consists
        # entirely of 3-byte UTF-8 sequences starting with '\xe2'
        # (3 * count == len) -- presumably decorative punctuation;
        # TODO confirm against live pages.
        if paragraph == '---' or 3 * paragraph.count('\xe2') == len(paragraph):
            break
        paragraphsUTF.append(paragraph)
    return paragraphsUTF
def getUrls(numUrls):
    """Collect up to *numUrls* politics article URLs from the Washington Times.

    Page 1 additionally contributes the "featured articles" block. The
    loop stops when a page yields no article links.
    Fix: removed a leftover debug ``print`` that dumped the raw extension
    list on every page fetch.
    """
    urls = []
    base_path = 'http://www.washingtontimes.com/'
    page_num = 1
    while len(urls) < numUrls:
        tree = scrape.getHTML(
            'http://www.washingtontimes.com/news/politics/?page='
            + str(page_num))
        extensions = tree.xpath(
            '//section[@class="block article-list related-articles"]/article/h2/a/@href'
        )
        if page_num == 1:
            # The first page also carries a featured-articles block.
            extensions.extend(tree.xpath(
                '//div[@class="block article-list featured-articles"]/article/h2/a/@href'
            ))
        if extensions == []:
            break
        urls.extend([base_path + extension for extension in extensions])
        page_num += 1
    return urls[0:numUrls]