Example #1
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://www.reuters.com/')
        parser = REUTERSHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache

    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.reuters_headlines(articlelist)
            return articlelist

        if len(args) > 2:

            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_reuters_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist

            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_reuters_article(articlelist, index)
                htmlfile = utils.get_html_file(article['url'])
                # Crude slice trimming the fixed leading portion of the URL
                # for a shorter display.
                abbrevurl = article['url'][28:]
                print '\n' + article['title'] + ' -- ' + abbrevurl
                print '==================\n'
                parser = REUTERSARTICLEParser()
                parser.feed(htmlfile)
                return articlelist

    utils.handle_error('reuters_error')
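
get_reuters_article is referenced above but is not defined in these examples. A minimal sketch, assuming it mirrors the inline conversion Example #3 performs (the command line supplies a 1-based index as a string):

def get_reuters_article(articlelist, index):
    # Hypothetical helper: convert the 1-based string index supplied on
    # the command line and return the matching article dict.
    return articlelist[int(index) - 1]
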
Example #2
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://bigstory.ap.org/')
        parser = APHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache

    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.ap_headlines(articlelist)
            return articlelist

        if len(args) > 2:

            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_ap_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist

            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_ap_article(articlelist, index)
                htmlfile = utils.get_html_file(article['url'])
                content = re.search(
                    r'<meta name="description" content="(.+?)" />', htmlfile)
                print_article_header(article['title'], content.group(1))
                return articlelist

    utils.handle_error('ap_error')
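
print_article_header is not shown either. A plausible sketch, assuming it prints the same title banner Examples #1 and #3 build inline, followed by the description:

def print_article_header(title, description):
    # Hypothetical helper: title banner plus the article's meta description.
    print '\n' + title
    print '==================\n'
    print description
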
Example #3
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('https://www.washingtonpost.com')
        htmlfile = htmlfile.decode('utf-8')
        parser = WPHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache

    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.wp_headlines(articlelist)
            return articlelist

        if len(args) > 2:

            if args[1] == '--open' or args[1] == '-o':
                # args[2] is a string; convert it to a 0-based index, as the
                # --read branch below does.
                index = int(args[2]) - 1
                article = articlelist[index]
                utils.go_to_page(article['url'])
                return articlelist

            if args[1] == '--read' or args[1] == '-r':
                index = int(args[2]) - 1
                article = articlelist[index]
                htmlfile = utils.get_html_file(article['url'])
                htmlfile = htmlfile.decode('utf-8')
                print '\n' + article['title']
                print '==================\n'
                parser = WPARTICLEParser()
                parser.feed(htmlfile)
                return articlelist

    utils.handle_error('wp_error')
Example #4
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://www.theguardian.com/us')
        parser = GUARDIANHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache

    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.gu_headlines(articlelist)
            return articlelist

        if len(args) > 2:

            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_gu_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist

            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_gu_article(articlelist, index)
                htmlfile = utils.get_html_file(article['url'])
                abbrevurl = article['url'][28:]
                print '\n' + article['title'] + ' -- ' + abbrevurl
                print '==================\n'
                htmlfile = htmlfile.decode('utf-8')
                parser = GUARDIANARTICLEParser()
                parser.feed(htmlfile)
                return articlelist

    utils.handle_error('gu_error')
Example #5
def main():
    a = 'http://bigstory.ap.org/article/f7645d59944d47228f2eb195a35a19a4/'
    htmlfile = utils.get_html_file(
        a + 'get-without-planned-parenthood-one-texas-effort-stumbles')
    content = re.search(r'<meta name="description" content="(.+?)" />',
                        htmlfile)
    print content.group(1)
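
One caveat on this example: re.search returns None when the page has no matching <meta name="description"> tag, so guarding the match avoids an AttributeError on .group(1):

    if content:
        print content.group(1)
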
Example #6
def cl_news_util(arguments, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://news.ycombinator.com')
        # Each match captures a (url, title) tuple for one front-page story.
        storylinks = re.findall(r'href="(.+)" class="storylink">(.+)</a><span',
                                htmlfile)
    else:
        storylinks = cache

    if len(arguments) > 1:
        if arguments[1] == '--headlines' or arguments[1] == '-h':
            utils.hn_headlines(storylinks)
            return storylinks

        if arguments[1] == '--open' or arguments[1] == '-o':
            if len(arguments) > 2:
                index = int(arguments[2])
                openpage(storylinks, index)
                return storylinks

        if arguments[1] == '--copy' or arguments[1] == '-cp':
            if len(arguments) > 2:
                if cache:
                    # htmlfile is only bound on a fresh fetch; refetch so the
                    # copy still works when storylinks came from the cache.
                    htmlfile = utils.get_html_file('http://news.ycombinator.com')
                utils.copy_file(arguments[2], htmlfile)
                return storylinks

    utils.handle_error('hn_error')
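
openpage is likewise not defined here. A minimal sketch, assuming storylinks holds the (url, title) tuples captured by the regex above and that the command line supplies a 1-based index:

import webbrowser

def openpage(storylinks, index):
    # Hypothetical helper: open the selected story's URL in the default
    # browser (the 1-based index is an assumption).
    webbrowser.open(storylinks[index - 1][0])
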
Example #7
def get_article_list():
    htmlfile = utils.get_html_file('http://cnn.com')
    # CNN embeds its headline data as JSON in the page markup: grab the
    # articleList array, then split it into the individual {...} objects.
    articles = re.findall(r'articleList":\[(.+?)\]', htmlfile)
    articles = re.findall(r'({.+?})', articles[0])
    article_list = []
    for article in articles:
        article_list.append(json.loads(article))
    return article_list
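
Every example depends on utils.get_html_file, which is not shown. A minimal sketch, assuming it simply fetches the page body with urllib2 (the module Example #9 uses directly):

import urllib2

def get_html_file(url):
    # Assumed implementation: fetch the page and return the raw body.
    # Callers that need unicode decode it themselves (see Examples #3 and #8).
    return urllib2.urlopen(url).read()
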
Example #8
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://www.aljazeera.com/')
        htmlfile = htmlfile.decode('utf-8')
        parser = AJHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache

    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.aj_headlines(articlelist)
            return articlelist

        if len(args) > 2:

            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_aj_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist

            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_aj_article(articlelist, index)
                htmlfile = utils.get_html_file('http://www.aljazeera.com/' +
                                               article['url'])
                htmlfile = htmlfile.decode('utf-8')
                print '\n' + article['title']
                print '====================='
                parser = AJARTICLEParser()
                parser.feed(htmlfile)
                return articlelist

    utils.handle_error('aj_error')
Example #9
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('https://www.nytimes.com')
        parser = NYTIMESHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache

    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.nyt_headlines(articlelist)
            return articlelist

        if len(args) > 2:
            # NOT CURRENTLY USED
            # if args[1] == '--open' or args[1] == '-o':
            #     index = args[2]
            #     article = get_nyt_article(articlelist, index)
            #     utils.go_to_page(article['url'])
            #     return articlelist

            if args[1] == '--read' or args[1] == '-r':
                try:
                    index = int(args[2]) - 1
                    url = articlelist[index]['url']
                    article = articlelist[index]
                    # This url call is specific to NYT
                    htmlfile = urllib2.build_opener(
                        urllib2.HTTPCookieProcessor).open(url)
                    htmlfile = htmlfile.read()
                    parser = NYTIMESARTICLEParser()
                    print '=========nyt=========\n'
                    print article['title'] + '\n'
                    print '=====================\n'
                    parser.feed(htmlfile)
                    return articlelist
                except (IndexError, ValueError, urllib2.URLError):
                    # Bad index, non-numeric argument, or fetch failure:
                    # return None so the caller rebuilds the cache next call.
                    return

    utils.handle_error('nyt_error')
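
The cookie-aware opener in Example #9 can also be installed once, after which plain urllib2.urlopen calls carry cookies as well; a sketch, assuming (as the inline comment suggests) that nytimes.com refuses cookie-less requests:

import urllib2

# Install a cookie-aware opener globally; subsequent urllib2.urlopen
# calls reuse it.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
urllib2.install_opener(opener)
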
Example #10
def main():
    arguments = sys.argv
    if len(arguments) > 1:
        htmlfile = utils.get_html_file('http://news.ycombinator.com')
        storylinks = re.findall(r'href="(.+)" class="storylink">(.+)</a><span',
                                htmlfile)

        if arguments[1] == '--headlines' or arguments[1] == '-h':
            utils.hn_headlines(storylinks)
            return

        if arguments[1] == '--open' or arguments[1] == '-o':
            if len(arguments) > 2:
                index = int(arguments[2])
                openpage(storylinks, index)
                return

        if arguments[1] == '--copy' or arguments[1] == '-cp':
            if len(arguments) > 2:
                utils.copy_file(arguments[2], htmlfile)
                return

    utils.handle_error('hn_error')
Example #11
def main():
    # cl_news_util(['gu', '-h'], False)
    htmlfile = utils.get_html_file('https://www.theguardian.com/us-news/2017/mar/14/mosque-obama-visited-trump-travel-ban-muslim')
    parser = GUARDIANARTICLEParser()
    parser.feed(htmlfile)
    print parser.collectdata