Ejemplo n.º 1
0
def get_feed():
    """Check the configured RSS feed and store articles not seen before.

    Every entry of ``config.RSS_FEED`` whose link is not yet recorded in
    ``Feeds`` is fetched through ``ParserClient``, saved with ``save_to_db``
    and then recorded in ``Feeds``.  If handling an entry raises ``KeyError``
    (for instance the parser response lacks ``content`` or ``title``), the
    session is rolled back and the link is still recorded in ``Feeds``, so
    the same link is skipped on later runs.
    """
    logging.warning('get_feed: RSS check...')
    parsed_feed = feedparser.parse(config.RSS_FEED)
    parser_client = ParserClient(readability_api_key)

    feed_urls_cached = Feeds.query.all()
    logging.warning('get_feed: db urls count {}'.format(len(feed_urls_cached)))

    # A set makes each membership test O(1) instead of scanning a list.
    known_urls = set(cached_feed.url for cached_feed in feed_urls_cached)

    for rss_url in parsed_feed['entries']:
        link = rss_url['link']
        if link in known_urls:
            continue
        # Remember the link right away so a feed that lists the same article
        # twice does not insert it twice within a single run.
        known_urls.add(link)

        logging.warning('get_feed: Added from rss: {}'.format(link))
        parser_response = parser_client.get_article_content(link)

        try:
            logging.warning('get_feed: Data len {}'.format(
                len(parser_response.content['content'])))
            save_to_db(link, parser_response.content['title'],
                       parser_response.content['content'])

            db.session.add(Feeds(url=link))
            db.session.commit()

            write_action_log('rss', url=link,
                             title=parser_response.content['title'])

        except KeyError as e:
            logging.warning('get_feed: ERR {}, no content'.format(e))
            # Drop whatever the failed attempt left pending, then record the
            # link anyway so it is not fetched again.
            db.session.rollback()
            db.session.add(Feeds(url=link))
            db.session.commit()

            write_action_log('rss', url=link, title="Err parse, no title")
Ejemplo n.º 2
0
def getHTML(url):
    """Return the article HTML for *url* after passing it through cleanHTML.

    The article is fetched with a ``ParserClient`` built from the
    module-level key ``rk``; only the ``content`` field of the response is
    used.
    """
    print('\nPROCESSING URL')
    response = ParserClient(rk).get_article_content(url)
    return cleanHTML(response.content['content'])
Ejemplo n.º 3
0
def get_text(wiki_title="Jabari_Parker"):
  """Return the parsed article HTML for a wiki page, without trailing sections.

  The page at ``settings.WIKI_URL + wiki_title`` is fetched through
  ``ParserClient``.  Newlines are replaced by spaces, ``<img>`` tags are
  removed, and everything from the first "See also", "Notes", "References"
  or "Notes and references" headline onwards is cut off.
  """
  import re
  from readability import ParserClient

  parser_client = ParserClient(settings.PARSER_TOKEN)
  parser_response = parser_client.get_article_content(settings.WIKI_URL + wiki_title)
  text = parser_response.content['content'].replace("\n", " ")

  # str.replace() matches literal text only, so the previous
  # "/<img[^>]*>/g" argument (a JavaScript regex literal) never matched
  # anything.  Use a real regular expression to drop image tags.
  text = re.sub(r"<img[^>]*>", "", text)

  ## Filter out the end of Wikipedia articles
  for headline_id in ("See_also", "Notes", "References", "Notes_and_references"):
    text = text.split('<span class="mw-headline" id="%s"' % headline_id)[0]
  return text
Ejemplo n.º 4
0
 def extract_raw_content(a):
     """Return the parsed content for ``a['resolved_url']``, or False.

     False is returned when the response content contains an ``'error'``
     key or when reading the content raises; in that case the response,
     its content and the exception are printed.
     """
     # NOTE(review): the API token is hard-coded here; consider loading it
     # from configuration instead.
     client = ParserClient('0ae1d8bed72a91ed706dcf9f354a0db4b430cb47')
     response = client.get_article_content(a['resolved_url'])
     try:
         payload = response.content
         if 'error' not in payload:
             return payload
         # Jump to the handler below so error payloads are reported like
         # any other failure.
         raise Exception
     except Exception as err:
         print(response)
         print(response.content)
         print(err)
         return False
Ejemplo n.º 5
0
def get_feed():
    """Check the configured RSS feed and store articles not seen before.

    Every entry of ``config.RSS_FEED`` whose link is not yet recorded in
    ``Feeds`` is fetched through ``ParserClient``, saved with ``save_to_db``
    and then recorded in ``Feeds``.  If handling an entry raises ``KeyError``
    (for instance the parser response lacks ``content`` or ``title``), the
    session is rolled back and the link is still recorded in ``Feeds``, so
    the same link is skipped on later runs.
    """
    logging.warning('get_feed: RSS check...')
    parsed_feed = feedparser.parse(config.RSS_FEED)
    parser_client = ParserClient(readability_api_key)

    feed_urls_cached = Feeds.query.all()
    logging.warning('get_feed: db urls count {}'.format(len(feed_urls_cached)))

    # A set makes each membership test O(1) instead of scanning a list.
    known_urls = set(cached_feed.url for cached_feed in feed_urls_cached)

    for rss_url in parsed_feed['entries']:
        link = rss_url['link']
        if link in known_urls:
            continue
        # Remember the link right away so a feed that lists the same article
        # twice does not insert it twice within a single run.
        known_urls.add(link)

        logging.warning('get_feed: Added from rss: {}'.format(link))
        parser_response = parser_client.get_article_content(link)

        try:
            logging.warning('get_feed: Data len {}'.format(
                len(parser_response.content['content'])))
            save_to_db(link, parser_response.content['title'],
                       parser_response.content['content'])

            db.session.add(Feeds(url=link))
            db.session.commit()

            write_action_log('rss',
                             url=link,
                             title=parser_response.content['title'])

        except KeyError as e:
            logging.warning('get_feed: ERR {}, no content'.format(e))
            # Drop whatever the failed attempt left pending, then record the
            # link anyway so it is not fetched again.
            db.session.rollback()
            db.session.add(Feeds(url=link))
            db.session.commit()

            write_action_log('rss',
                             url=link,
                             title="Err parse, no title")
Ejemplo n.º 6
0
 def get_page_content():
     """Return the ParserClient response for ``url``.

     Both ``url`` and ``readability_api_key`` are taken from the enclosing
     scope; the function itself takes no arguments.
     """
     return ParserClient(readability_api_key).get_article_content(url)
Ejemplo n.º 7
0
# -*- coding: utf-8 -*-
import nltk
import string
from nltk.collocations import *
from nltk.stem.wordnet import WordNetLemmatizer

from bs4 import BeautifulSoup
from readability import ParserClient

# Fetch one article, reduce it to lower-case printable text without
# punctuation, and print every token that is not an English stop word.
# NOTE(review): the API token is hard-coded; consider moving it to config.
parser_client = ParserClient('0ae1d8bed72a91ed706dcf9f354a0db4b430cb47')
parser_response = parser_client.get_article_content('http://www.theatlantic.com/entertainment/archive/2014/02/russias-gold-medal-figure-skaters-celeb-relationship-status-pioneers/283804')
article = parser_response.content['content']

soup = BeautifulSoup(article, "lxml")
text = soup.get_text()

# Append the title and dek (sub-headline) so their words are counted too.
# Empty or null values are skipped: concatenating None would raise TypeError.
for k, v in parser_response.content.items():
	if k in ('title', 'dek') and v:
		text = text + v

exclude = set(string.punctuation+'”'+'’')
text = ''.join(ch for ch in text if ch not in exclude and ch in string.printable).lower()

words = nltk.word_tokenize(text)
# Load the stop-word list once; previously it was re-read from the corpus
# for every single token.  A set also makes each lookup O(1).
stop_words = set(nltk.corpus.stopwords.words('english'))
filtered_words = [w for w in words if w not in stop_words]

for w in filtered_words:
	print(w)
Ejemplo n.º 8
0
 def get_page_content():
     """Return the ParserClient response for ``url``.

     Both ``url`` and ``readability_api_key`` are taken from the enclosing
     scope; the function itself takes no arguments.
     """
     return ParserClient(readability_api_key).get_article_content(url)
Ejemplo n.º 9
0
 def _get_article(self, url):
     """Fetch *url* with a new ParserClient and return the raw response."""
     client = ParserClient(PARSER_TOKEN)
     response = client.get_article_content(url)
     return response
Ejemplo n.º 10
0
class ParserClientTest(TestCase):
    """Test case for the Parser Client

    """
    def setUp(self):
        """Create the client under test and set the article URL the tests use."""
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'
        self.parser_client = ParserClient(PARSER_TOKEN)

    def test_generate_url(self):
        """Check that ``_generate_url`` builds the expected endpoint URLs.

        Covers the root resource (empty path, token only) and the ``parser``
        resource with a ``url`` query parameter.
        """
        # Root resource: only the token is appended.
        root_url = '{0}?token={1}'.format(
            DEFAULT_PARSER_URL_TEMPLATE.format(''), PARSER_TOKEN)
        self.assertEqual(self.parser_client._generate_url(''), root_url)

        # Parser resource: the article URL is percent-encoded and comes
        # before the token.
        parser_url = (
            '{0}?url=http%3A%2F%2Fwww.beanis.biz%2Fblog.html&token={1}'.format(
                DEFAULT_PARSER_URL_TEMPLATE.format('parser'), PARSER_TOKEN))
        query = {'url': 'http://www.beanis.biz/blog.html'}
        self.assertEqual(
            self.parser_client._generate_url('parser', query_params=query),
            parser_url)

    def test_get_root(self):
        """Check that the root endpoint returns exactly a ``resources`` key."""
        root = self.parser_client.get_root()
        self.assertEqual(set(root.content.keys()), {'resources'})

    def test_get_confidence(self):
        """Check the confidence endpoint with and without a URL.

        Without arguments the service must answer 400; with the test URL it
        must answer 200 with exactly ``url`` and ``confidence`` keys and a
        confidence above 0.5.
        """
        # No article_id or url given: expect a bad-request status.
        rejected = self.parser_client.get_confidence()
        self.assertEqual(rejected.status, 400)

        accepted = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(accepted.status, 200)
        self.assertEqual(set(accepted.content.keys()), {'url', 'confidence'})
        # confidence for wikipedia should be over .5
        self.assertGreater(accepted.content['confidence'], .5)

    def test_get_article_status(self):
        """Test the client's ability to hit the parser endpoint with a HEAD
        request.

        Without arguments the request must be rejected with 400; with the
        test URL it must succeed and expose the ``x-article-status`` and
        ``x-article-id`` response headers.
        """
        # hit without an article_id or url. Should get an error.
        # This previously called get_confidence(), so the error path of
        # get_article_status() itself was never exercised.
        response = self.parser_client.get_article_status()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertIsNotNone(response.get('x-article-status'))
        self.assertIsNotNone(response.get('x-article-id'))

    def test_get_article_content(self):
        """Check the parser endpoint via GET, with and without a URL.

        A call without parameters must answer 400.  A call with the test URL
        must answer 200 and include at least the core article fields.
        """
        # test with incorrect params
        rejected = self.parser_client.get_article_content()
        self.assertEqual(rejected.status, 400)

        accepted = self.parser_client.get_article_content(url=self.test_url)
        self.assertEqual(accepted.status, 200)

        required_keys = {
            'content', 'domain', 'author', 'word_count', 'title', 'total_pages',
        }
        returned_keys = set(accepted.content.keys())
        self.assertTrue(required_keys.issubset(returned_keys))

    def test_post_article_content(self):
        """Test the client's ability to hit the parser endpoint with a POST
        request.

        Posts an inline HTML document together with a URL and checks that the
        call succeeds and that the returned article content is shorter than
        the document that was submitted.
        """
        # Fixture: a complete HTML page (titled "Readability v1 Parser API"),
        # inlined here so the test does not depend on a separate file.
        content = """
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
        <title>Readability v1 Parser API</title><style type="text/css">
                            body {
                                font-family: sans-serif;
                                font: 0.8em/1.4 Arial, sans-serif;
                                margin: 2em 6em;
                                width: 65em;
                            }
                            pre {
                                font-family: Courier, monospace;
                                font-weight: 500;
                                font-size: 0.8em;
                                background-color: #eef;
                                padding: 1em;
                            }
                            .methods {
                                background-color: #e4e4e4;
                                margin-top: .4em;
                                padding: .6em;
                            }
                            .methods h4 {
                                border-bottom: 1px solid #fff;
                                padding: .1em 0;
                                margin-bottom: .4em;
                                color: #0b3c97;
                                font-size: 1.1em;
                            }
                            .methods h6 {
                                color: #666;
                                text-transform: lowercase;
                                margin: .6em 0 .3em;
                            }
                            .resource {
                                margin-bottom: 2em;
                                margin-top: .4em;
                            }
                            .resource h3 {
                                margin-bottom: .4em;
                                font-size: 1.4em;
                                color: #ff5700;
                            }
                            h1 {
                                font-size: 2.5em;
                            }
                            h2 {
                                border-bottom: 1px solid black;
                                margin-top: 1em;
                                color: #666;
                                margin-bottom: 0.5em;
                                font-size: 2em;
                            }
                            h3 {
                                font-size: 1.75em;
                                margin: 0.6em 0;
                            }
                            h4 {
                                color: #666;
                                margin: 0;
                                padding: 0.3em 0;
                                border-bottom: 2px solid white;
                            }
                            h6 {
                                font-size: 1.1em;
                                color: #99a;
                                margin: 0.5em 0em 0.25em 0em;
                            }
                            dd {
                                margin-left: 1em;
                            }
                            tt {
                                font-size: 1.2em;
                            }
                            table {
                                margin-bottom: 0.5em;
                                width: 100%;
                                border-collapse: collapse;
                            }
                            th {
                                text-align: left;
                                font-weight: normal;
                                color: black;
                                border-bottom: 1px solid black;
                                padding: 3px 6px;
                            }
                            td {
                                padding: 3px 6px;
                                vertical-align: top;
                                background-color: f6f6ff;
                                font-size: 0.85em;
                            }
                            td p {
                                margin: 0px;
                            }
                            ul {
                                padding-left: 1.75em;
                            }
                            p + ul, p + ol, p + dl {
                                margin-top: 0em;
                            }
                            .optional {
                                font-weight: normal;
                                opacity: 0.75;
                            }
                        </style><link href="prettify/prettify.css" type="text/css" rel="stylesheet"></link><script type="text/javascript" src="prettify/prettify.js"></script></head><body onload="prettyPrint()"><h1>Readability v1 Parser API</h1>
                <section>
                    <h2 id="authentication">Authentication</h2>
                    <p>
                        Requests to the Parser API are not signed like an OAuth
                        request.  The Parser token is simply passed as a POST or GET
                        parameter depending on the request type. Be careful not to
                        reveal this token, requests directly to the Parser API should
                        not be made on the client device but rather proxied to keep the
                        API token secure.
                    </p>
                </section>

                <section>
                    <h2 id="quick-start">Quick Start</h2>
                    <p class="section-intro">
                                Here's how to pull an article's content from the Readability Parser API:
                    </p>
                    <h4>Request</h4>
                    <pre>GET /api/content/v1/parser?url=http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/&amp;token=1b830931777ac7c2ac954e9f0d67df437175e66e</pre>
                    <h4>Response</h4>
                    <pre>
        HTTP/1.0 200 OK
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }
        </pre>
                </section>

                <section>
                    <h2 id="data-formats">Data Formats</h2>
                    <p>
                        All requests are, by default, provided as JSON. You may also pass "?format=xml" in the URL to convert this into XML data to be consumed.
                    </p>
                </section>

            <h3>Resources, Representations &amp; Errors</h3><ul><li><a href="#resources">Resources</a><ul><li><a href="#idp3728">https://readability.com/api/content/v1/</a></li><li><a href="#idp4080">https://readability.com/api/content/v1/parser</a></li><li><a href="#idp39744">https://readability.com/api/content/v1/confidence</a></li></ul></li><li><a href="#representations">Representations</a><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul></li><li><a href="#faults">Errors</a><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li></ul></li></ul><h2 id="resources">Resources</h2><div class="resource"><h3 id="idp3728">/</h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp5008">GET</h4>
                            Retrieve the base API URI - information about subresources.
                        <h6>request header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>Authorization</strong></p></td><td><p><em><a href="" title=""></a></em><small> (required)</small></p></td><td></td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp4080">/parser?token<span class="optional">&amp;url</span><span class="optional">&amp;id</span><span class="optional">&amp;max_pages</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp36384">GET</h4>
                            Parse an article
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to return the content for.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to return the content for.</td></tr><tr><td><p><strong>max_pages</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#integer">integer</a></em></p></td><td>The maximum number of pages to parse and combine. Default is 25.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div><div class="method"><h4 id="idp63552">HEAD</h4>
                            <p>
                                Retrieve the Content Status of an article. This is useful if you want to save yourself from POSTing a large html document. You can do a HEAD request on the resource, and check for the status of the article in the X-Article-Status header. <strong>Additionally, if we've never seen the article before, we'll return a 404, which also means you should POST.</strong>
                            </p>
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to check.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to check.</td></tr></table><h6>response header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>X-Article-Id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The ID of the article within Readablity.</p>
                            </td></tr><tr><td><p><strong>X-Article-Status</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The status of the content in Readability. One of:</p>
                                <dl>
                                <dt>INVALID</dt>
                                <dd>We were unable to parse this URL for some reason. <em>Recommendation: Fail</em></dd>
                                <dt>UNRETRIEVED</dt>
                                <dd>We know of this article, but have not yet retrieved its content, or the cache has expired. <em>Recommendation: POST content to us</em></dd>
                                <dt>PROVIDED_BY_USER</dt>
                                <dd>We have retrieved the content for this URL from at least one user. <em>Recommendation: POST content to us</em></dd>
                                <dt>VALIDATED_BY_USERS</dt>
                                <dd>We have retrieved the content for this URL from multiple users, and have validated it. <em>Recommendation: GET the content from us.</em></dd>
                                <dt>FETCHED</dt>
                                <dd>We fetched the content for this URL manually, and it has been cached. <em>Recommendation:GET the content from us.</em></dd>
                                </dl>
                            </td></tr></table><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp39744">/confidence?url<span class="optional">&amp;callback</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp89296">GET</h4>Detect the confidence with which Readability could parse a given URL. Does not require a token.<h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td>The URL of an article to return the confidence for.</td></tr><tr><td><p><strong>callback</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The jsonp callback function name.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. 
(application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><h2 id="representations">Representations</h2><h3 id="https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "resources": {
                "parser": {
                    "description": "The Content Parser Resource",
                    "href": "/api/content/v1/parser"
                }
            }
        }
                    </pre>
                <h3 id="https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        callback({
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        });

        </pre>
                <h2 id="faults">Errors</h2><h3 id="https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</h3>
                    The server could not understand your request. Verify that request parameters (and content, if any) are valid.
                <h3 id="https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</h3>
                    <p>
                        Authentication failed or was not provided. Verify that you have sent valid ixDirectory credentials via HTTP Basic.
                    </p>
                    <p>A 'Www-Authenticate' challenge header will be sent with this type of error response.</p>
                <h3 id="https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</h3>
                    An unknown error has occurred.
                <h3 id="https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</h3>
                    The resource that you requested does not exist.
                </body></html>
        """
        # URL submitted alongside the posted document.
        url = 'http://readability.com/developers/api/parser#https://readability.com/api/content/v1#test_suite'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status, 200)
        # The returned content should be shorter than the document posted.
        self.assertTrue(len(content) > len(response.content['content']))
Ejemplo n.º 11
0
	def _get_article(self, url):
		"""Fetch *url* with a new ParserClient and return the raw response."""
		client = ParserClient(PARSER_TOKEN)
		response = client.get_article_content(url)
		return response
Ejemplo n.º 12
0
	html = re.sub(u'[\xad]', '&shy;', html)
	# Replace/clobber any remaining UTF-8 characters that aren't in ISO-8859-1
	return fix_text(html)
	#return (html)

# Process every URL given on the command line: fetch the parsed article,
# render it through template.html and open "<i>.html" for writing.
# `i` is used both to skip the first sys.argv entry and as the 'order'
# value / output file number below.
# NOTE(review): `i` is not incremented in the else-branch shown here, and
# `f` is neither written nor closed in the visible part of the loop.
i=0
for u in sys.argv:

	if (i==0):
		# First entry of sys.argv: nothing to process, just bump the counter.
		i=i+1
		pass
	else:
		print "processing URL: " + u

		# fetch the parsed article for this URL
		p = parser.get_article_content(u)

		# Reading-time estimate: number of \w+ matches in the content
		# divided by 250 (integer division under Python 2).
		# NOTE(review): the previous comment said 200 wpm while the code
		# divides by 250 -- confirm which rate is intended.
		time = len(re.findall(r'\w+', p.content['content']))/250

		# render the template with the article fields
		# NOTE(review): the template file is re-read on every iteration and
		# its handle is never closed explicitly.
		template = unicode(open("template.html","r").read())
		html = pystache.render(template,{ 'url':p.content['url'], 'title':p.content['title'], 'author':p.content['author'], 'publisher': p.content['domain'], 'content':p.content['content'], 'time':time, 'order':i })

		print "HTML output: " + html.encode('utf-8')

		# Open the output file. Fields such as the author name may be
		# missing from the response and are not checked here.
		f = open(str(i)+'.html', 'w')

		html = strip(html)
Ejemplo n.º 13
0
		print text
		return text

def getDomainName(domain):
	"""Return the display name registered for ``domain``.

	Looks ``domain`` up in ``domainToName``; when the lookup raises
	``KeyError`` the domain itself is returned unchanged.
	"""
	try:
		display_name = domainToName[domain]
	except KeyError:
		display_name = domain
	return display_name

# Fetch every URL listed in url.csv through the Readability parser and append
# one (url, title, author, cleaned article HTML, publish time) tuple per
# successfully parsed article to `results`.
with open('url.csv') as csvfile:
	# splitlines() accepts '\r\n' as well as bare '\n' line endings; blank
	# lines (such as the one left by a trailing newline) are dropped so no
	# request is made for an empty URL.
	urls = [line for line in csvfile.read().splitlines() if line.strip()]

# NOTE(review): the API token is hard-coded here; consider loading it from
# configuration or the environment so it is not committed with the source.
# The client is built from the token alone, so one instance serves all URLs.
parser_client = ParserClient('dab74f9def9312c90473befef4181cf66bab7321')

for url in urls:
	try:
		parser_response = parser_client.get_article_content(url)
		s = parser_response.content['content']
		x = parser_response.content['date_published']
		title = parser_response.content['title']
		author = parser_response.content['author']
		# Remove every 'href' occurrence and all <img> tags, then turn
		# <p> / </p> into <br> / <br />.
		article = re.sub('href', '', s)
		article = re.sub('<img.*?>', '', article)
		article3 = article.replace('<p>', '<br>').replace('</p>', '<br />')
	except Exception:
		# Report the failure and skip this URL. Falling through would append
		# fields that are undefined (first failure) or left over from the
		# previous article (later failures).
		print('fail %s' % url)
		continue
	results.append((tounicode(url), tounicode(title), tounicode(author), tounicode(article3), timeconvert(x)))
output = time.strftime('articles-%x.html').replace('/', '_')
with open(output, 'w') as outputfile:
	for result in results:
		url, title, _, _, _ = result
class ParserClientTest(TestCase):
    """Test case for the Parser Client

    """

    def setUp(self):
        """Prepare the fixtures shared by every test.

        Stores the article URL used by the endpoint tests and a
        ``ParserClient`` built from ``PARSER_TOKEN`` on the test instance.

        """
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'
        self.parser_client = ParserClient(PARSER_TOKEN)

    def test_generate_url(self):
        """Check that ``_generate_url`` builds the expected endpoint URLs.

        Covers the root resource (token only) and the ``parser`` resource
        with an extra, percent-encoded ``url`` query parameter.

        """
        # Root resource: the template with an empty resource plus the token.
        root_base = DEFAULT_PARSER_URL_TEMPLATE.format('')
        root_expected = '{0}?token={1}'.format(root_base, PARSER_TOKEN)
        root_generated = self.parser_client._generate_url('')
        self.assertEqual(root_generated, root_expected)

        # Parser resource: the query parameter precedes the token and is
        # percent-encoded.
        parser_base = DEFAULT_PARSER_URL_TEMPLATE.format('parser')
        query = {'url': 'http://www.beanis.biz/blog.html'}
        parser_expected = (
            '{0}?url=http%3A%2F%2Fwww.beanis.biz%2Fblog.html&token={1}'.format(
                parser_base, PARSER_TOKEN))

        parser_generated = self.parser_client._generate_url(
            'parser', query_params=query)
        self.assertEqual(parser_generated, parser_expected)

    def test_get_root(self):
        """Check that the root endpoint can be reached.

        The response content must expose exactly one key, ``resources``.

        """
        root_response = self.parser_client.get_root()
        returned_keys = set(root_response.content.keys())

        self.assertEqual(returned_keys, set(['resources']))

    def test_get_confidence(self):
        """Check that the confidence endpoint can be reached.

        A call without parameters must come back as a 400; a call with the
        test URL must return 200 with exactly the ``url`` and ``confidence``
        keys and a confidence above .5.

        """
        # No article_id or url supplied: the request should be rejected.
        rejected = self.parser_client.get_confidence()
        self.assertEqual(rejected.status, 400)

        scored = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(scored.status, 200)

        returned_keys = set(scored.content.keys())
        self.assertEqual(returned_keys, set(['url', 'confidence']))

        # confidence for wikipedia should be over .5
        confidence = scored.content['confidence']
        self.assertTrue(confidence > .5)

    def test_get_article_status(self):
        """Test the client's ability to hit the parser endpoint with a HEAD
        request.

        A request without an article_id or url is expected to be rejected
        with a 400; a request for the test URL is expected to return 200 and
        carry the ``x-article-status`` and ``x-article-id`` response headers.

        """
        # hit without an article_id or url. Should get an error.
        # Exercise get_article_status() itself (not get_confidence()) so the
        # error path checked here is the HEAD request this test is about.
        response = self.parser_client.get_article_status()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status, 200)
        # Both status headers must be present on a successful HEAD response.
        self.assertTrue(response.get('x-article-status') is not None)
        self.assertTrue(response.get('x-article-id') is not None)

    def test_get_article_content(self):
        """Check that the parser endpoint can be reached with a GET request.

        A call without parameters must come back as a 400; a call with the
        test URL must return 200 and include a known subset of article keys.

        """
        # Calling without any parameters is a bad request.
        rejected = self.parser_client.get_article_content()
        self.assertEqual(rejected.status, 400)

        article = self.parser_client.get_article_content(url=self.test_url)
        self.assertEqual(article.status, 200)

        # Only a subset of the returned keys is checked.
        some_expected_keys = set([
            'content', 'domain', 'author', 'word_count', 'title',
            'total_pages'])
        returned_keys = set(article.content.keys())
        self.assertTrue(some_expected_keys.issubset(returned_keys))

    def test_post_article_content(self):
        """Test the client's ability to hit the parser endpoint with a POST
        request.

        Posts an inline HTML fixture together with a source URL and checks
        that the response has status 200 and that the returned ``content``
        is shorter than the fixture that was posted.

        """
        # Large inline HTML fixture (a page titled "Readability v1 Parser
        # API"). It is kept verbatim because the final assertion compares
        # the parsed content's length against the length of this string.
        content = """
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
        <title>Readability v1 Parser API</title><style type="text/css">
                            body {
                                font-family: sans-serif;
                                font: 0.8em/1.4 Arial, sans-serif;
                                margin: 2em 6em;
                                width: 65em;
                            }
                            pre {
                                font-family: Courier, monospace;
                                font-weight: 500;
                                font-size: 0.8em;
                                background-color: #eef;
                                padding: 1em;
                            }
                            .methods {
                                background-color: #e4e4e4;
                                margin-top: .4em;
                                padding: .6em;
                            }
                            .methods h4 {
                                border-bottom: 1px solid #fff;
                                padding: .1em 0;
                                margin-bottom: .4em;
                                color: #0b3c97;
                                font-size: 1.1em;
                            }
                            .methods h6 {
                                color: #666;
                                text-transform: lowercase;
                                margin: .6em 0 .3em;
                            }
                            .resource {
                                margin-bottom: 2em;
                                margin-top: .4em;
                            }
                            .resource h3 {
                                margin-bottom: .4em;
                                font-size: 1.4em;
                                color: #ff5700;
                            }
                            h1 {
                                font-size: 2.5em;
                            }
                            h2 {
                                border-bottom: 1px solid black;
                                margin-top: 1em;
                                color: #666;
                                margin-bottom: 0.5em;
                                font-size: 2em;
                            }
                            h3 {
                                font-size: 1.75em;
                                margin: 0.6em 0;
                            }
                            h4 {
                                color: #666;
                                margin: 0;
                                padding: 0.3em 0;
                                border-bottom: 2px solid white;
                            }
                            h6 {
                                font-size: 1.1em;
                                color: #99a;
                                margin: 0.5em 0em 0.25em 0em;
                            }
                            dd {
                                margin-left: 1em;
                            }
                            tt {
                                font-size: 1.2em;
                            }
                            table {
                                margin-bottom: 0.5em;
                                width: 100%;
                                border-collapse: collapse;
                            }
                            th {
                                text-align: left;
                                font-weight: normal;
                                color: black;
                                border-bottom: 1px solid black;
                                padding: 3px 6px;
                            }
                            td {
                                padding: 3px 6px;
                                vertical-align: top;
                                background-color: f6f6ff;
                                font-size: 0.85em;
                            }
                            td p {
                                margin: 0px;
                            }
                            ul {
                                padding-left: 1.75em;
                            }
                            p + ul, p + ol, p + dl {
                                margin-top: 0em;
                            }
                            .optional {
                                font-weight: normal;
                                opacity: 0.75;
                            }
                        </style><link href="prettify/prettify.css" type="text/css" rel="stylesheet"></link><script type="text/javascript" src="prettify/prettify.js"></script></head><body onload="prettyPrint()"><h1>Readability v1 Parser API</h1>
                <section>
                    <h2 id="authentication">Authentication</h2>
                    <p>
                        Requests to the Parser API are not signed like an OAuth
                        request.  The Parser token is simply passed as a POST or GET
                        parameter depending on the request type. Be careful not to
                        reveal this token, requests directly to the Parser API should
                        not be made on the client device but rather proxied to keep the
                        API token secure.
                    </p>
                </section>

                <section>
                    <h2 id="quick-start">Quick Start</h2>
                    <p class="section-intro">
                                Here's how to pull an article's content from the Readability Parser API:
                    </p>
                    <h4>Request</h4>
                    <pre>GET /api/content/v1/parser?url=http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/&amp;token=1b830931777ac7c2ac954e9f0d67df437175e66e</pre>
                    <h4>Response</h4>
                    <pre>
        HTTP/1.0 200 OK
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }
        </pre>
                </section>

                <section>
                    <h2 id="data-formats">Data Formats</h2>
                    <p>
                        All requests are, by default, provided as JSON. You may also pass "?format=xml" in the URL to convert this into XML data to be consumed.
                    </p>
                </section>

            <h3>Resources, Representations &amp; Errors</h3><ul><li><a href="#resources">Resources</a><ul><li><a href="#idp3728">https://readability.com/api/content/v1/</a></li><li><a href="#idp4080">https://readability.com/api/content/v1/parser</a></li><li><a href="#idp39744">https://readability.com/api/content/v1/confidence</a></li></ul></li><li><a href="#representations">Representations</a><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul></li><li><a href="#faults">Errors</a><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li></ul></li></ul><h2 id="resources">Resources</h2><div class="resource"><h3 id="idp3728">/</h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp5008">GET</h4>
                            Retrieve the base API URI - information about subresources.
                        <h6>request header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>Authorization</strong></p></td><td><p><em><a href="" title=""></a></em><small> (required)</small></p></td><td></td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp4080">/parser?token<span class="optional">&amp;url</span><span class="optional">&amp;id</span><span class="optional">&amp;max_pages</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp36384">GET</h4>
                            Parse an article
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to return the content for.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to return the content for.</td></tr><tr><td><p><strong>max_pages</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#integer">integer</a></em></p></td><td>The maximum number of pages to parse and combine. Default is 25.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div><div class="method"><h4 id="idp63552">HEAD</h4>
                            <p>
                                Retrieve the Content Status of an article. This is useful if you want to save yourself from POSTing a large html document. You can do a HEAD request on the resource, and check for the status of the article in the X-Article-Status header. <strong>Additionally, if we've never seen the article before, we'll return a 404, which also means you should POST.</strong>
                            </p>
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to check.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to check.</td></tr></table><h6>response header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>X-Article-Id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The ID of the article within Readablity.</p>
                            </td></tr><tr><td><p><strong>X-Article-Status</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The status of the content in Readability. One of:</p>
                                <dl>
                                <dt>INVALID</dt>
                                <dd>We were unable to parse this URL for some reason. <em>Recommendation: Fail</em></dd>
                                <dt>UNRETRIEVED</dt>
                                <dd>We know of this article, but have not yet retrieved its content, or the cache has expired. <em>Recommendation: POST content to us</em></dd>
                                <dt>PROVIDED_BY_USER</dt>
                                <dd>We have retrieved the content for this URL from at least one user. <em>Recommendation: POST content to us</em></dd>
                                <dt>VALIDATED_BY_USERS</dt>
                                <dd>We have retrieved the content for this URL from multiple users, and have validated it. <em>Recommendation: GET the content from us.</em></dd>
                                <dt>FETCHED</dt>
                                <dd>We fetched the content for this URL manually, and it has been cached. <em>Recommendation:GET the content from us.</em></dd>
                                </dl>
                            </td></tr></table><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp39744">/confidence?url<span class="optional">&amp;callback</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp89296">GET</h4>Detect the confidence with which Readability could parse a given URL. Does not require a token.<h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td>The URL of an article to return the confidence for.</td></tr><tr><td><p><strong>callback</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The jsonp callback function name.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. 
(application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><h2 id="representations">Representations</h2><h3 id="https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "resources": {
                "parser": {
                    "description": "The Content Parser Resource",
                    "href": "/api/content/v1/parser"
                }
            }
        }
                    </pre>
                <h3 id="https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        callback({
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        });

        </pre>
                <h2 id="faults">Errors</h2><h3 id="https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</h3>
                    The server could not understand your request. Verify that request parameters (and content, if any) are valid.
                <h3 id="https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</h3>
                    <p>
                        Authentication failed or was not provided. Verify that you have sent valid ixDirectory credentials via HTTP Basic.
                    </p>
                    <p>A 'Www-Authenticate' challenge header will be sent with this type of error response.</p>
                <h3 id="https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</h3>
                    An unknown error has occurred.
                <h3 id="https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</h3>
                    The resource that you requested does not exist.
                </body></html>
        """
        # URL passed alongside the posted HTML as its source address.
        url = 'http://readability.com/developers/api/parser#https://readability.com/api/content/v1#test_suite'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status, 200)
        # should have gotten back content that is shorter than original
        self.assertTrue(len(content) > len(response.content['content']))