Example #1
def getHTML(url):
    print('\nPROCESSING URL')
    parser = ParserClient(rk)  # rk: Readability parser token defined elsewhere
    p = parser.get_article_content(url)
    html = p.content['content']
    #print cleanHTML(html)
    return cleanHTML(html)
Example #2
def generate_content(url, category):
    parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    # Strip tags, HTML entities and double quotes, then normalize whitespace.
    final_article = re.sub('<.*?>', '', strarticle)
    final_article = re.sub('&.*?;', '', final_article)
    final_article = re.sub('["]', '', final_article).strip()
    final_article = os.linesep.join(s for s in final_article.splitlines() if s)
    final_article = re.sub(' +', ' ', final_article)
    final_article_title = re.sub('["]', '', str_article_title).strip()


    db = MySQLdb.connect("localhost", "root", "", "inswipes")
    cursor = db.cursor()
    try:
        # Bind values through the driver; formatting them into the SQL
        # string invites quoting bugs and SQL injection.
        sql = ("INSERT INTO meta_content (article_content, link, main_category_id, article_title) "
               "VALUES (%s, %s, %s, %s)")
        cursor.execute(sql, (final_article, url, int(category), final_article_title))
        db.commit()
    except MySQLdb.Error:
        db.rollback()
    finally:
        db.close()


    summarization()
Example #3
def get_feed():
    logging.warning('get_feed: RSS check...')
    parsed_feed = feedparser.parse(config.RSS_FEED)
    parser_client = ParserClient(readability_api_key)

    feed_urls_cached = Feeds.query.all()

    db_url_list = [cached_feed.url for cached_feed in feed_urls_cached]
    logging.warning('get_feed: db urls count {}'.format(len(db_url_list)))

    for rss_url in parsed_feed['entries']:
        if rss_url['link'] not in db_url_list:
            logging.warning('get_feed: Added from rss: {}'.format(rss_url['link']))
            parser_response = parser_client.get_article_content(rss_url['link'])

            try:
                logging.warning('get_feed: Data len {}'.format(len(parser_response.content['content'])))
                save_to_db(rss_url['link'], parser_response.content['title'], parser_response.content['content'])
                add_feed = Feeds(url=rss_url['link'])

                db.session.add(add_feed)
                db.session.commit()

                write_action_log('rss', url=rss_url['link'], title=parser_response.content['title'])

            except KeyError as e:
                logging.warning('get_feed: ERR {}, no content'.format(e))
                db.session.rollback()
                add_feed = Feeds(url=rss_url['link'])
                db.session.add(add_feed)
                db.session.commit()

                write_action_log('rss', url=rss_url['link'], title="Err parse, no title")
Example #4
def extracting_content(url):
    parser_client = ParserClient(token='#########################')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']

    print(str_article_title)
    print(strarticle)
Example #5
def get_page_metadata(url):
    token = os.environ.get('READABILITY_PARSER_KEY', None)
    if not token:
        return {}
    try:
        parser_client = ParserClient(token=token)
        return parser_client.get_article(url).json()
    except Exception:
        logger.exception('Failed to fetch readability data for url %s', url)
        return {}
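A minimal usage sketch for the helper above (the URL is a placeholder); with the empty-dict fallback, callers can use .get() without extra guards:

# Usage sketch; the URL is a placeholder.
metadata = get_page_metadata('http://example.com/article')
title = metadata.get('title', '')
excerpt = metadata.get('excerpt', '')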
Example #6
def get_text(wiki_title="Jabari_Parker"):
  import re
  from readability import ParserClient
  parser_client = ParserClient(settings.PARSER_TOKEN)
  parser_response_text = parser_client.get_article_content(settings.WIKI_URL + wiki_title).content['content'].replace("\n", " ")

  ## Filter out images and the end of Wikipedia articles
  text = re.sub(r"<img[^>]*>", "", parser_response_text)  # the original passed a JS-style regex to str.replace(), which never matches
  text = text.split('<span class="mw-headline" id="See_also"')[0]
  text = text.split('<span class="mw-headline" id="Notes"')[0]
  text = text.split('<span class="mw-headline" id="References"')[0]
  text = text.split('<span class="mw-headline" id="Notes_and_references"')[0]
  return text
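The four split() calls above repeat one pattern; a short sketch that generalizes them into a loop (the section ids are the same ones the example cuts at):

# Sketch generalizing the repeated split() calls above.
STOP_SECTIONS = ("See_also", "Notes", "References", "Notes_and_references")

def strip_trailing_sections(html):
    for section_id in STOP_SECTIONS:
        html = html.split('<span class="mw-headline" id="{}"'.format(section_id))[0]
    return html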
Example #7
 def extract_raw_content(a):
     #data = Aylien().extract(a["resolved_url"])
     parser_client = ParserClient('0ae1d8bed72a91ed706dcf9f354a0db4b430cb47')
     parser_response = parser_client.get_article_content(a['resolved_url'])
     try:
         content = parser_response.content
         if 'error' in content:
             raise Exception
         return content
     except Exception as e:
         print(parser_response)
         print(parser_response.content)
         print(e)
         return False
Example #8
class ReadabilityToEpub:
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception(
                "Get a Readability parser token at: https://www.readability.com/developers/api"
            )
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        parser_resp = self.parser_client.get_article(url).json()

        epub_book = epub.EpubBook()
        epub_book.set_title(parser_resp['title'])
        epub_book.add_author(parser_resp['author'])

        content_html = epub.EpubHtml(title=parser_resp['title'],
                                     file_name='content.xhtml',
                                     content="<h1>{}</h1>\n{}".format(
                                         parser_resp['title'],
                                         parser_resp['content']))

        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())
        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]

        epub.write_epub("{}.epub".format(slugify(parser_resp['title'])),
                        epub_book, dict(plugins=[DownloadImagesPlugin()]))
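A usage sketch for the class above, assuming a valid parser token; convert_url writes the epub to the working directory, named after the slugified article title:

# Usage sketch; the token is a placeholder.
converter = ReadabilityToEpub(parser_token='YOUR_PARSER_TOKEN')
converter.convert_url('http://example.com/article')  # writes <slugified-title>.epub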
Example #9
class ReadabilityToEpub:
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception("Get a Readability parser token at: https://www.readability.com/developers/api")
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        parser_resp = self.parser_client.get_article(url).json()

        epub_book = epub.EpubBook()
        epub_book.set_title(parser_resp["title"])
        epub_book.add_author(parser_resp["author"])

        content_html = epub.EpubHtml(
            title=parser_resp["title"],
            file_name="content.xhtml",
            content="<h1>{}</h1>\n{}".format(parser_resp["title"], parser_resp["content"]),
        )

        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())
        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]

        epub.write_epub(
            "{}.epub".format(slugify(parser_resp["title"])), epub_book, dict(plugins=[DownloadImagesPlugin()])
        )
Example #10
 def setUp(self):
     self.database = mongomock.Connection().db
     self.parser_client = ParserClient('readability secret parser key')
     response = Response(dict())
     response.content = dict(content='<p>article</p>')
     self.parser_client.get_article_content = MagicMock(
         return_value=response)
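A sketch of how a test method might exercise the mock configured in this setUp (the URL is a placeholder; the assertion helper is standard unittest.mock):

 def test_fetch_article(self):
     response = self.parser_client.get_article_content('http://example.com')
     self.assertEqual(response.content['content'], '<p>article</p>')
     self.parser_client.get_article_content.assert_called_once_with('http://example.com')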
Example #11
 def get(self):
     client = ParserClient(token='64c0f2ae58811bc3d09104e8d22abb3e3b328971')
     feeds = RSSinfo.query()
     for feed in feeds:
         if feed.get_full_article:
             items = RSS.query(ancestor=feed.key)
             for item in items:
                 if item.content == 'no content':
                     parser_response = client.get_article(url=item.link)
                     sleep(1)  # throttle calls to the Parser API
                     article = parser_response.json()
                     item.content = article['content']
                     item.put()
Example #12
 def post(self, request, *args, **kwargs):
     form = LinkForm(request.POST)
     if form.is_valid():
         link = form.save(commit=False)
         link.group = Group.objects.get(pk=self.kwargs['group_id'])
         # extract data from readability
         parser_client = ParserClient(token=settings.READABILITY_TOKEN)
         parser_response = parser_client.get_article(link.url)
         article = parser_response.json()
         link.title = article.get('title', '')
         link.content = article.get('content', '')
         link.description = article.get('excerpt', '')
         link.save()
         tags = extract_tags(link.title + ' ' + link.content)
         link.tags.add(*tags)
     url = reverse('groups:list_links', kwargs={'group_id': self.kwargs['group_id']})
     return redirect(url)
Example #13
def generate_content(url, category):
    parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    strarticle = article['content']
    # Strip tags, HTML entities and double quotes, then normalize whitespace.
    final_article = re.sub('<.*?>', '', strarticle)
    final_article = re.sub('&.*?;', '', final_article)
    final_article = re.sub('["]', '', final_article).strip()
    final_article = os.linesep.join(s for s in final_article.splitlines() if s)
    final_article = re.sub(' +', ' ', final_article)
    print(url)
    print(final_article)
    insertion(category, url, final_article)
Example #14
def get_feed():
    logging.warning('get_feed: RSS check...')
    parsed_feed = feedparser.parse(config.RSS_FEED)
    parser_client = ParserClient(readability_api_key)

    feed_urls_cached = Feeds.query.all()

    db_url_list = [cached_feed.url for cached_feed in feed_urls_cached]
    logging.warning('get_feed: db urls count {}'.format(len(db_url_list)))

    for rss_url in parsed_feed['entries']:
        if rss_url['link'] not in db_url_list:
            logging.warning('get_feed: Added from rss: {}'.format(
                rss_url['link']))
            parser_response = parser_client.get_article_content(
                rss_url['link'])

            try:
                logging.warning('get_feed: Data len {}'.format(
                    len(parser_response.content['content'])))
                save_to_db(rss_url['link'], parser_response.content['title'],
                           parser_response.content['content'])
                add_feed = Feeds(url=rss_url['link'])

                db.session.add(add_feed)
                db.session.commit()

                write_action_log('rss',
                                 url=rss_url['link'],
                                 title=parser_response.content['title'])

            except KeyError as e:
                logging.warning('get_feed: ERR {}, no content'.format(e))
                db.session.rollback()
                add_feed = Feeds(url=rss_url['link'])
                db.session.add(add_feed)
                db.session.commit()

                write_action_log('rss',
                                 url=rss_url['link'],
                                 title="Err parse, no title")
Example #15
    def get(self):
        """
        *Get the readability parser client*

        **Return:**
            - ``parserClient`` -- the readability parser client
        """
        self.log.info('starting the ``get`` method')

        from readability import ParserClient
        os.environ['READABILITY_PARSER_TOKEN'] = self.settings["readability"][
            "parser api token"]

        parser_client = ParserClient()

        self.log.info('completed the ``get`` method')
        return parser_client
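The pattern above relies on ParserClient() falling back to the READABILITY_PARSER_TOKEN environment variable when no token argument is passed (Example #25 leans on the same behavior). A minimal sketch, with a placeholder token:

# Sketch of the environment-variable fallback; the token value is a placeholder.
import os
from readability import ParserClient

os.environ['READABILITY_PARSER_TOKEN'] = 'YOUR_PARSER_TOKEN'
client = ParserClient()  # token read from the environment
response = client.get_article('http://example.com/article')
print(response.json().get('title'))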
Example #16
def _save_bookmark(bookmark_form):
    bookmark = {
        'title': bookmark_form.title.data,
        'url': bookmark_form.url.data,
        'description': bookmark_form.description.data,
        'referrer': bookmark_form.referrer.data,
        'tags': bookmark_form.tags.data,
        'published': datetime.datetime.utcnow(),
        'public': bookmark_form.public.data,
        'user': {
            '_id': ObjectId(current_user.get_id()),
            'nickname': current_user.nickname,
            'email': current_user.email
        }
    }
    if bookmark_form.archive.data:
        response = ParserClient(os.getenv('READABILITY_PARSER_KEY')).get_article_content(bookmark_form.url.data)
        if response.status == 200:
            bookmark['content'] = response.content['content']

    mongo.db.bookmarks.update({'url': bookmark_form.url.data, 'user._id': ObjectId(current_user.get_id())},
                              {'$set': bookmark}, upsert=True)
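Collection.update() as used above was deprecated in pymongo 3 and removed in pymongo 4; the equivalent call on newer drivers is update_one with the same filter, document and upsert flag (a sketch):

# pymongo 3+ equivalent of the deprecated update() call above.
mongo.db.bookmarks.update_one(
    {'url': bookmark_form.url.data, 'user._id': ObjectId(current_user.get_id())},
    {'$set': bookmark},
    upsert=True,
)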
Example #17
                writer.write("<meta name = \"SOURCEURL\" content = \" " +
                             sourceUrl + "\" />" + '\n')

                writer.write(parser_response.content['content'].encode("utf8"))


# get event list from GDELT project data, one csv file for each day
startPath = "http://data.gdeltproject.org/events/"
outPutPath = "/home/ysz/news_graph/events"
articleArchive = "/home/ysz/news_graph/article"

getEvents(startPath, outPutPath)

print "Downloading articles......"
parser_client = ParserClient(
    'f25f302cab7c00da41e4f5f2c5b17428f60c97d5'
)  # Crawl Tool: https://www.readability.com/developers/api/parser
startTime = latest_file(articleRoot)
startTime = "20150430"
endTime = "20150507"
files = need_extract(outPutPath, ".export.CSV.zip", startTime, endTime)

# print files:
for fi in files:
    print "filename: ", fi
    filehandle = open(fi, 'rb')
    zf = zipfile.ZipFile(filehandle)
    base = os.path.basename(fi)
    pure_file_name = os.path.splitext(base)[0]
    try:
        data = io.BytesIO(zf.read(pure_file_name))  # zf.read() returns bytes
Example #18
# -*- coding: utf-8 -*-
import nltk
import string
from nltk.collocations import *
from nltk.stem.wordnet import WordNetLemmatizer

from bs4 import BeautifulSoup
from readability import ParserClient

parser_client = ParserClient('0ae1d8bed72a91ed706dcf9f354a0db4b430cb47')
parser_response = parser_client.get_article_content('http://www.theatlantic.com/entertainment/archive/2014/02/russias-gold-medal-figure-skaters-celeb-relationship-status-pioneers/283804')
article = parser_response.content['content']

soup = BeautifulSoup(article, "lxml")
text = soup.get_text()

for k, v in parser_response.content.items():
    if k in ['title', 'dek']:
        text = text + v

exclude = set(string.punctuation+'”'+'’')
text = ''.join(ch for ch in text if ch not in exclude and ch in string.printable).lower()

words = nltk.word_tokenize(text)
stopwords = set(nltk.corpus.stopwords.words('english'))  # build the stopword set once
filtered_words = [w for w in words if w not in stopwords]

for w in filtered_words:
    print(w)
Example #19
	def _get_article(self, url):
		parser_client = ParserClient(PARSER_TOKEN)
		return parser_client.get_article_content(url)
Example #20
 def get_page_content():
     parser_client = ParserClient(readability_api_key)
     parser_response = parser_client.get_article_content(url)
     return parser_response
Example #21
class ParserClientTest(TestCase):
    """Test case for the Parser Client

    """

    def setUp(self):
        self.parser_client = ParserClient(PARSER_TOKEN)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """Test the clients ability to generate urls to endpoints.

        """
        # test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{0}?token={1}'.format(expected_url, PARSER_TOKEN)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('parser')
        params = {'url': 'http://www.beanis.biz/blog.html'}
        expected_url = '{0}?url=http%3A%2F%2Fwww.beanis.biz%2Fblog.html&token={1}'.format(
            expected_url, PARSER_TOKEN)

        generated_url = self.parser_client._generate_url(
            'parser', query_params=params)
        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """Test the client's ability to hit the root endpoint.

        """
        response = self.parser_client.get_root()

        expected_keys = set(['resources', ])
        self.assertEqual(set(response.content.keys()), expected_keys)

    def test_get_confidence(self):
        """Test the client's ability to hit the confidence endpoint.

        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status, 400)

        expected_keys = set(['url', 'confidence'])

        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertEqual(set(response.content.keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.content['confidence'] > .5)

    def test_get_article_status(self):
        """Test the client's ability to hit the parser endpoint with a HEAD
        request.

        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertTrue(response.get('x-article-status') is not None)
        self.assertTrue(response.get('x-article-id') is not None)

    def test_get_article_content(self):
        """Test the client's ability to hit the parser endpoint with a GET
        request.

        """
        # test with incorrect params
        response = self.parser_client.get_article_content()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_content(url=self.test_url)
        self.assertEqual(response.status, 200)

        some_expected_keys = set(['content', 'domain', 'author', 'word_count',
            'title', 'total_pages'])
        self.assertTrue(
            some_expected_keys.issubset(set(response.content.keys())))

    def test_post_article_content(self):
        """Test the client's ability to hit the parser endpoint with a POST
        request.

        """
        # I'm sorry...
        content = """
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
        <title>Readability v1 Parser API</title><style type="text/css">
                            body {
                                font-family: sans-serif;
                                font: 0.8em/1.4 Arial, sans-serif;
                                margin: 2em 6em;
                                width: 65em;
                            }
                            pre {
                                font-family: Courier, monospace;
                                font-weight: 500;
                                font-size: 0.8em;
                                background-color: #eef;
                                padding: 1em;
                            }
                            .methods {
                                background-color: #e4e4e4;
                                margin-top: .4em;
                                padding: .6em;
                            }
                            .methods h4 {
                                border-bottom: 1px solid #fff;
                                padding: .1em 0;
                                margin-bottom: .4em;
                                color: #0b3c97;
                                font-size: 1.1em;
                            }
                            .methods h6 {
                                color: #666;
                                text-transform: lowercase;
                                margin: .6em 0 .3em;
                            }
                            .resource {
                                margin-bottom: 2em;
                                margin-top: .4em;
                            }
                            .resource h3 {
                                margin-bottom: .4em;
                                font-size: 1.4em;
                                color: #ff5700;
                            }
                            h1 {
                                font-size: 2.5em;
                            }
                            h2 {
                                border-bottom: 1px solid black;
                                margin-top: 1em;
                                color: #666;
                                margin-bottom: 0.5em;
                                font-size: 2em;
                            }
                            h3 {
                                font-size: 1.75em;
                                margin: 0.6em 0;
                            }
                            h4 {
                                color: #666;
                                margin: 0;
                                padding: 0.3em 0;
                                border-bottom: 2px solid white;
                            }
                            h6 {
                                font-size: 1.1em;
                                color: #99a;
                                margin: 0.5em 0em 0.25em 0em;
                            }
                            dd {
                                margin-left: 1em;
                            }
                            tt {
                                font-size: 1.2em;
                            }
                            table {
                                margin-bottom: 0.5em;
                                width: 100%;
                                border-collapse: collapse;
                            }
                            th {
                                text-align: left;
                                font-weight: normal;
                                color: black;
                                border-bottom: 1px solid black;
                                padding: 3px 6px;
                            }
                            td {
                                padding: 3px 6px;
                                vertical-align: top;
                                background-color: f6f6ff;
                                font-size: 0.85em;
                            }
                            td p {
                                margin: 0px;
                            }
                            ul {
                                padding-left: 1.75em;
                            }
                            p + ul, p + ol, p + dl {
                                margin-top: 0em;
                            }
                            .optional {
                                font-weight: normal;
                                opacity: 0.75;
                            }
                        </style><link href="prettify/prettify.css" type="text/css" rel="stylesheet"></link><script type="text/javascript" src="prettify/prettify.js"></script></head><body onload="prettyPrint()"><h1>Readability v1 Parser API</h1>
                <section>
                    <h2 id="authentication">Authentication</h2>
                    <p>
                        Requests to the Parser API are not signed like an OAuth
                        request.  The Parser token is simply passed as a POST or GET
                        parameter depending on the request type. Be careful not to
                        reveal this token, requests directly to the Parser API should
                        not be made on the client device but rather proxied to keep the
                        API token secure.
                    </p>
                </section>

                <section>
                    <h2 id="quick-start">Quick Start</h2>
                    <p class="section-intro">
                                Here's how to pull an article's content from the Readability Parser API:
                    </p>
                    <h4>Request</h4>
                    <pre>GET /api/content/v1/parser?url=http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/&amp;token=1b830931777ac7c2ac954e9f0d67df437175e66e</pre>
                    <h4>Response</h4>
                    <pre>
        HTTP/1.0 200 OK
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }
        </pre>
                </section>

                <section>
                    <h2 id="data-formats">Data Formats</h2>
                    <p>
                        All requests are, by default, provided as JSON. You may also pass "?format=xml" in the URL to convert this into XML data to be consumed.
                    </p>
                </section>

            <h3>Resources, Representations &amp; Errors</h3><ul><li><a href="#resources">Resources</a><ul><li><a href="#idp3728">https://readability.com/api/content/v1/</a></li><li><a href="#idp4080">https://readability.com/api/content/v1/parser</a></li><li><a href="#idp39744">https://readability.com/api/content/v1/confidence</a></li></ul></li><li><a href="#representations">Representations</a><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul></li><li><a href="#faults">Errors</a><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li></ul></li></ul><h2 id="resources">Resources</h2><div class="resource"><h3 id="idp3728">/</h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp5008">GET</h4>
                            Retrieve the base API URI - information about subresources.
                        <h6>request header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>Authorization</strong></p></td><td><p><em><a href="" title=""></a></em><small> (required)</small></p></td><td></td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp4080">/parser?token<span class="optional">&amp;url</span><span class="optional">&amp;id</span><span class="optional">&amp;max_pages</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp36384">GET</h4>
                            Parse an article
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to return the content for.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to return the content for.</td></tr><tr><td><p><strong>max_pages</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#integer">integer</a></em></p></td><td>The maximum number of pages to parse and combine. Default is 25.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div><div class="method"><h4 id="idp63552">HEAD</h4>
                            <p>
                                Retrieve the Content Status of an article. This is useful if you want to save yourself from POSTing a large html document. You can do a HEAD request on the resource, and check for the status of the article in the X-Article-Status header. <strong>Additionally, if we've never seen the article before, we'll return a 404, which also means you should POST.</strong>
                            </p>
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to check.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to check.</td></tr></table><h6>response header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>X-Article-Id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The ID of the article within Readablity.</p>
                            </td></tr><tr><td><p><strong>X-Article-Status</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The status of the content in Readability. One of:</p>
                                <dl>
                                <dt>INVALID</dt>
                                <dd>We were unable to parse this URL for some reason. <em>Recommendation: Fail</em></dd>
                                <dt>UNRETRIEVED</dt>
                                <dd>We know of this article, but have not yet retrieved its content, or the cache has expired. <em>Recommendation: POST content to us</em></dd>
                                <dt>PROVIDED_BY_USER</dt>
                                <dd>We have retrieved the content for this URL from at least one user. <em>Recommendation: POST content to us</em></dd>
                                <dt>VALIDATED_BY_USERS</dt>
                                <dd>We have retrieved the content for this URL from multiple users, and have validated it. <em>Recommendation: GET the content from us.</em></dd>
                                <dt>FETCHED</dt>
                                <dd>We fetched the content for this URL manually, and it has been cached. <em>Recommendation:GET the content from us.</em></dd>
                                </dl>
                            </td></tr></table><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp39744">/confidence?url<span class="optional">&amp;callback</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp89296">GET</h4>Detect the confidence with which Readability could parse a given URL. Does not require a token.<h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td>The URL of an article to return the confidence for.</td></tr><tr><td><p><strong>callback</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The jsonp callback function name.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><h2 id="representations">Representations</h2><h3 id="https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "resources": {
                "parser": {
                    "description": "The Content Parser Resource",
                    "href": "/api/content/v1/parser"
                }
            }
        }
                    </pre>
                <h3 id="https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        callback({
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        });

        </pre>
                <h2 id="faults">Errors</h2><h3 id="https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</h3>
                    The server could not understand your request. Verify that request parameters (and content, if any) are valid.
                <h3 id="https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</h3>
                    <p>
                        Authentication failed or was not provided. Verify that you have sent valid ixDirectory credentials via HTTP Basic.
                    </p>
                    <p>A 'Www-Authenticate' challenge header will be sent with this type of error response.</p>
                <h3 id="https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</h3>
                    An unknown error has occurred.
                <h3 id="https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</h3>
                    The resource that you requested does not exist.
                </body></html>
        """
        url = 'http://readability.com/developers/api/parser#https://readability.com/api/content/v1#test_suite'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status, 200)
        # should have gotten back content that is shorter than original
        self.assertTrue(len(content) > len(response.content['content']))
Example #22
 def setUp(self):
     self.parser_token = required_from_env('READABILITY_PARSER_TOKEN')
     self.parser_client = ParserClient(token=self.parser_token)
     self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'
Example #23
 def _get_article(self, url):
     parser_client = ParserClient(PARSER_TOKEN)
     return parser_client.get_article_content(url)
Example #24
# -*- coding: utf-8 -*-
import os
from pymongo import MongoClient
from readability import ParserClient


def do_it(database, parser_client):
    for bookmark in list(database.bookmarks.find({}, {'_id': 1, 'url': 1})):
        response = parser_client.get_article_content(bookmark['url'])
        database.bookmarks.update({'_id': bookmark['_id']}, {'$set': {'content': response.content['content']}}, multi=True)

if __name__ == '__main__':
    client = MongoClient(os.environ['MONGOLAB_URI'])
    do_it(client.get_default_database(), ParserClient(os.getenv('READABILITY_PARSER_KEY')))


Example #25
def main():
    if not os.path.isfile('credentials.config'):  # if credentials file does not exist, start the first run function
        first_run()  # Authenticate and generate the credentials file.

    # command line switches function
    args = read_command_args()
    use_evernote = args.e
    debug_mode = args.debug
    delete_files = args.t if use_evernote is True else False
    path = args.p
    info_mode = args.i

    if debug_mode:
        # print("Warning - Debug mode active. Files will be downloaded, but not added to index")
        logger = create_logger(log_to_console=True)
        logger.setLevel(logging.DEBUG)
        logger.info('Warning - Debug mode active. Files will be downloaded, but not added to index')
    elif info_mode:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses sll unclosed socket warnings.
        logger = create_logger(log_to_console=True)
    else:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses sll unclosed socket warnings.
        logger = create_logger()

    logger.info("\n###########\nStarting SR\n###########")

    try:
        with open('credentials.config', 'r') as json_file:
            credentials = json.load(json_file)  # get various OAuth tokens
    except OSError:
        logger.error('Unable to open credentials file')
        raise SystemExit

    # Create the downloads folder on the specified path, or in the dir where file is stored.
    if path is not "":
        path = path[0]
    else:
        path = os.getcwd()
    path += "/SRDownloads"

    if not os.path.exists(path):
        os.makedirs(path)

    # Authenticate with Reddit
    logger.info('Authenticating with Reddit')
    client_id = credentials['reddit']['client_id']
    client_secret = credentials['reddit']['client_secret']
    redirect_uri = credentials['reddit']['redirect_uri']
    refresh_token = credentials['reddit']['refresh_token']
    user_agent = "SavedRetriever 0.9 by /u/fuzzycut"

    try:
        r = praw.Reddit(user_agent=user_agent,
                        oauth_client_id=client_id,
                        oauth_client_secret=client_secret,
                        oauth_redirect_uri=redirect_uri)

        access_information = r.refresh_access_information(refresh_token)
        r.set_access_credentials(**access_information)
    except Exception as e:
        logger.error(e)
        raise SystemExit
    time_since_accesstoken = time.time()

    index = set()
    if os.path.isfile('index.txt'):  # check for the index file, which lists already-downloaded items.
        try:
            with open('index.txt', 'r') as ind:
                for line in ind:
                    index.add(line[:-1])  # -1 truncates the newline in the index file.
        except OSError:
            logger.error("Unable to open index file for reading")
            raise SystemExit

    if use_evernote is True:
        enclient = evernoteWrapper.Client(credentials['evernote']['dev_token'], 'Saved from Reddit')

    html_index_file = None
    if delete_files is False:  # only create index if we're going to use it.
        html_index_file = html_index.index(r.get_me().name, path)

    try:
        ind = open('index.txt', 'a')  # open index file for appending
    except OSError:
        logger.error("Unable to open index file for writing")
        raise SystemExit

    logger.info("Beginning to save files...")
    for i in r.get_me().get_saved(limit=None):
        if (time.time() - time_since_accesstoken) / 60 > 55:  # Refresh the access token before it runs out.
            logger.debug('Refreshing Reddit token')
            r.refresh_access_information(access_information['refresh_token'])
            time_since_accesstoken = time.time()

        name = i.name
        file_name = name  # to stop ide complaining.
        note = None
        evernote_tags = ('Reddit', 'SavedRetriever', '/r/' + i.subreddit.display_name)  # add config for this later

        # logger.info('Saving post - {}'.format(name))

        if name not in index:  # file has not been downloaded
            permalink = i.permalink
            author = i.author
            title = i.link_title if hasattr(i, 'link_title') else i.title
            # ========== #
            # IS COMMENT #
            # ========== #
            if hasattr(i, 'body_html'):
                logger.debug("{} is comment".format(name))
                body = i.body_html

                # html output
                body = subreddit_linker(body)
                output = html_output_string(permalink, author, body, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_html(output)
                    enclient.add_tag(*evernote_tags)  # the * is very important. It unpacks the tags tuple properly
                    note = enclient.create_note()
            # ============ #
            # IS SELF-POST #
            # ============ #
            elif hasattr(i, 'is_self') and i.is_self is True:
                logger.debug('{} is self-post'.format(name))
                text = i.selftext_html if i.selftext_html is not None else ""

                # html output
                text = subreddit_linker(text)
                output = html_output_string(permalink, author, text, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(output)
                    note = enclient.create_note()
            # ====================== #
            # IS DIRECT LINKED IMAGE #
            # ====================== #
            elif hasattr(i, 'url') and re.sub(r"([^A-z0-9])\w+", "", i.url.split('.')[-1]) in ['jpg', 'png', 'gif', 'gifv', 'pdf']:
                """
                Need to check file types and test pdf. How does this handle gfycat and webm? Can EN display that inline?
                The regex in the if is to strip out non-valid filetype chars.
                """
                logger.debug('{} is direct linked image'.format(name))
                url = i.url
                base_filename = "{}_image.{}".format(name, re.sub("([^A-z0-9])\w+", "", url.split('.')[
                    -1]))  # filename for image. regex same as above.
                filename = path + "/" + base_filename

                # image downloader section
                if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # If image exists and is valid
                    image_downloaded = True
                    logger.info("Image already exists - {}".format(base_filename))
                else:
                    image_downloaded = image_saver(url, filename)
                    logger.info('Downloaded image - {}'.format(base_filename))

                if image_downloaded:
                    # write image as <img> or link to local pdf downloaded in html file
                    if filename.split('.')[-1] == 'pdf':
                        img = '<a href="{}">Click here for link to downloaded pdf</a>'.format(base_filename)
                    else:
                        img = '<br><a href="{0}"><img src="{0}"></a>'.format(
                            base_filename)  # html for embedding in html file
                else:
                    img = "Image failed to download - It may be temporarily or permanently unavailable"

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(html_output_string_image(permalink, author, "", title))  # should add body="" in the function
                    if image_downloaded:
                        enclient.add_resource(filename)
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, img, title))
                else:
                    os.remove(filename)
            # ============== #
            # IS IMGUR ALBUM #
            # ============== #
            elif hasattr(i, 'url') and 'imgur' in i.url:  # Add option to download images to folder.
                logger.debug('{} is Imgur album'.format(name))
                url = i.url
                body = "<h2>{}</h2>".format(title)

                # imgur api section
                client = ImgurClient(credentials['imgur']['client_id'], credentials['imgur']['client_secret'])
                pattern = r'/([A-z0-9]{5,7})'  # matches any 5-7 char word that comes after a forward slash (/).
                match = re.findall(pattern, url)
                gallery_id = match[-1].replace('/', '')  # removes any forward slashes for processing
                gallery = []
                filename = None
                try:
                    gallery = client.get_album_images(gallery_id)
                except imgurpython.helpers.error.ImgurClientError:  # if 'gallery' is actually just a lone image
                    try:
                        gallery = [client.get_image(gallery_id)]
                    except imgurpython.helpers.error.ImgurClientError as error:  # if gallery does not exist. Is this the best way to do this?
                        if debug_mode is True or error.status_code != 404:
                            print("**{} - {}**".format(error.status_code, error.error_message))

                # img_path = 'Downloads/{}'.format(gallery_id)
                img_path = path + "/" + gallery_id
                if not os.path.exists(img_path):
                    os.makedirs(img_path)
                for image in gallery:  # add if gallery > 10, then just add a link (would be too large for the note)
                    image_name = image.title if image.title is not None else ""
                    image_description = image.description if image.description is not None else ""
                    image_filetype = image.type.split('/')[1]
                    image_id = image.id
                    image_link = image.link
                    # sets up downloaded filename and html for embedding image
                    base_filename = "{}_image.{}".format(image_id, image_filetype)
                    img = '<p><h3>{0}</h3><a href="{1}/{2}"><img src="{1}/{2}"></a><br/>{3}</p>'.format(image_name,
                                                                                                        gallery_id,
                                                                                                        base_filename,
                                                                                                        image_description)
                    filename = img_path + "/" + base_filename
                    if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # only download if file doesn't already exist
                        logger.info('Image already exists - {}'.format(base_filename))
                    else:
                        image_saver(image_link, filename)
                        logger.info('Image downloaded - {}'.format(base_filename))
                    body += img

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    if len(gallery) == 1 and filename is not None:
                        enclient.add_html(html_output_string_image(permalink, author, "", title))
                        enclient.add_resource(filename)
                    else:
                        enclient.add_html(html_output_string_image(permalink, author,
                        'This album is too large to embed; please see <a href="{}">here</a> for the original link.'.format(url),
                                                             title))
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, body, title))
                else:
                    shutil.rmtree(img_path)
            # ========== #
            # IS ARTICLE #
            # ========== #
            elif hasattr(i, 'title') and i.is_self is False:
                # This section needs work. It is semi-complete. Ultimately, adding in the full article is the goal.
                logger.debug('{} is article/webpage'.format(name))
                url = i.url

                # readability api section
                os.environ["READABILITY_PARSER_TOKEN"] = credentials['readability'][
                    'parser_key']  # set the environment variable as the parser key
                logger.info('Initializing Readability Client')
                parse = ParserClient()  # readability api doesn't take the token directly
                parse_response = parse.get_article(url)
                article = parse_response.json()
                if 'content' not in article:  # if unable to parse document, manually set an error message
                    article['content'] = 'Unable to parse page - See <a href="{}">here</a> for the original link'.format(url)
                article = article['content']
                article = "<a href='{}'>{}</a><br/>{}<br/>".format(url, title, article)  # source of article

                # html output section.
                output = html_output_string(permalink, author, article, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # Evernote section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    output = html_output_string(permalink, author, article, title)
                    enclient.add_html(output)

                    # Add html file to note
                    # enclient.add_resource("Downloads/{}.html".format(name))
                    note = enclient.create_note()

            # end of checking for saved items #
            failed_upload = False
            if use_evernote is True:
                if note is not None:
                    # print("Saved {:9} - GUID: {}".format(name, note.guid))
                    logger.info('Saved {:9} - GUID: {}'.format(name, note.guid))
                else:  # Upload failed
                    # print("Saved {:9} - Note failed to upload".format(name))
                    logger.info('Saved {:9} - Note failed to upload'.format(name))
                    failed_upload = True
            elif use_evernote is False:
                # print("Saved " + name)
                logger.info('Saved ' + name)
            if not debug_mode and not failed_upload:
                ind.write(name + "\n")
                ind.flush()  # this fixes python not writing the file if it terminates before .close() can be called
                if delete_files is False:
                    html_index_file.add_link(title, file_name, permalink)

    # end of for loop
    ind.close()
    logger.info("All items downloaded")
    if delete_files is False:
        html_index_file.save_and_close()
    else:  # try remove downloads if -t is set, but don't force it if directory has things in it already.
        try:
            os.rmdir(path)  # the SRDownloads folder created earlier; 'Downloads' was a stale relative path
        except OSError:
            logger.error("Unable to remove files")
Example #26
 def setUp(self):
     self.parser_client = ParserClient(PARSER_TOKEN)
     self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'
Example #27
class ParserClientTest(TestCase):
    """Test case for the Parser Client

    """
    def setUp(self):
        self.parser_client = ParserClient(PARSER_TOKEN)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """Test the clients ability to generate urls to endpoints.

        """
        # test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{0}?token={1}'.format(expected_url, PARSER_TOKEN)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('parser')
        params = {'url': 'http://www.beanis.biz/blog.html'}
        expected_url = '{0}?url=http%3A%2F%2Fwww.beanis.biz%2Fblog.html&token={1}'.format(
            expected_url, PARSER_TOKEN)

        generated_url = self.parser_client._generate_url('parser',
                                                         query_params=params)
        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """Test the client's ability to hit the root endpoint.

        """
        response = self.parser_client.get_root()

        expected_keys = set([
            'resources',
        ])
        self.assertEqual(set(response.content.keys()), expected_keys)

    def test_get_confidence(self):
        """Test the client's ability to hit the confidence endpoint.

        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status, 400)

        expected_keys = set(['url', 'confidence'])

        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertEqual(set(response.content.keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.content['confidence'] > .5)

    def test_get_article_status(self):
        """Test the client's ability to hit the parser endpoint with a HEAD
        request.

        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertTrue(response.get('x-article-status') is not None)
        self.assertTrue(response.get('x-article-id') is not None)

    def test_get_article_content(self):
        """Test the client's ability to hit the parser endpoint with a GET
        request.

        """
        # test with incorrect params
        response = self.parser_client.get_article_content()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_content(url=self.test_url)
        self.assertEqual(response.status, 200)

        some_expected_keys = set([
            'content', 'domain', 'author', 'word_count', 'title', 'total_pages'
        ])
        self.assertTrue(
            some_expected_keys.issubset(set(response.content.keys())))

    def test_post_article_content(self):
        """Test the client's ability to hit the parser endpoint with a POST
        request.

        """
        # I'm sorry...
        content = """
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
        <title>Readability v1 Parser API</title><style type="text/css">
                            body {
                                font-family: sans-serif;
                                font: 0.8em/1.4 Arial, sans-serif;
                                margin: 2em 6em;
                                width: 65em;
                            }
                            pre {
                                font-family: Courier, monospace;
                                font-weight: 500;
                                font-size: 0.8em;
                                background-color: #eef;
                                padding: 1em;
                            }
                            .methods {
                                background-color: #e4e4e4;
                                margin-top: .4em;
                                padding: .6em;
                            }
                            .methods h4 {
                                border-bottom: 1px solid #fff;
                                padding: .1em 0;
                                margin-bottom: .4em;
                                color: #0b3c97;
                                font-size: 1.1em;
                            }
                            .methods h6 {
                                color: #666;
                                text-transform: lowercase;
                                margin: .6em 0 .3em;
                            }
                            .resource {
                                margin-bottom: 2em;
                                margin-top: .4em;
                            }
                            .resource h3 {
                                margin-bottom: .4em;
                                font-size: 1.4em;
                                color: #ff5700;
                            }
                            h1 {
                                font-size: 2.5em;
                            }
                            h2 {
                                border-bottom: 1px solid black;
                                margin-top: 1em;
                                color: #666;
                                margin-bottom: 0.5em;
                                font-size: 2em;
                            }
                            h3 {
                                font-size: 1.75em;
                                margin: 0.6em 0;
                            }
                            h4 {
                                color: #666;
                                margin: 0;
                                padding: 0.3em 0;
                                border-bottom: 2px solid white;
                            }
                            h6 {
                                font-size: 1.1em;
                                color: #99a;
                                margin: 0.5em 0em 0.25em 0em;
                            }
                            dd {
                                margin-left: 1em;
                            }
                            tt {
                                font-size: 1.2em;
                            }
                            table {
                                margin-bottom: 0.5em;
                                width: 100%;
                                border-collapse: collapse;
                            }
                            th {
                                text-align: left;
                                font-weight: normal;
                                color: black;
                                border-bottom: 1px solid black;
                                padding: 3px 6px;
                            }
                            td {
                                padding: 3px 6px;
                                vertical-align: top;
                                background-color: f6f6ff;
                                font-size: 0.85em;
                            }
                            td p {
                                margin: 0px;
                            }
                            ul {
                                padding-left: 1.75em;
                            }
                            p + ul, p + ol, p + dl {
                                margin-top: 0em;
                            }
                            .optional {
                                font-weight: normal;
                                opacity: 0.75;
                            }
                        </style><link href="prettify/prettify.css" type="text/css" rel="stylesheet"></link><script type="text/javascript" src="prettify/prettify.js"></script></head><body onload="prettyPrint()"><h1>Readability v1 Parser API</h1>
                <section>
                    <h2 id="authentication">Authentication</h2>
                    <p>
                        Requests to the Parser API are not signed like an OAuth
                        request.  The Parser token is simply passed as a POST or GET
                        parameter depending on the request type. Be careful not to
                        reveal this token, requests directly to the Parser API should
                        not be made on the client device but rather proxied to keep the
                        API token secure.
                    </p>
                </section>

                <section>
                    <h2 id="quick-start">Quick Start</h2>
                    <p class="section-intro">
                                Here's how to pull an article's content from the Readability Parser API:
                    </p>
                    <h4>Request</h4>
                    <pre>GET /api/content/v1/parser?url=http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/&amp;token=1b830931777ac7c2ac954e9f0d67df437175e66e</pre>
                    <h4>Response</h4>
                    <pre>
        HTTP/1.0 200 OK
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }
        </pre>
                </section>

                <section>
                    <h2 id="data-formats">Data Formats</h2>
                    <p>
                        All requests are, by default, provided as JSON. You may also pass "?format=xml" in the URL to convert this into XML data to be consumed.
                    </p>
                </section>

            <h3>Resources, Representations &amp; Errors</h3><ul><li><a href="#resources">Resources</a><ul><li><a href="#idp3728">https://readability.com/api/content/v1/</a></li><li><a href="#idp4080">https://readability.com/api/content/v1/parser</a></li><li><a href="#idp39744">https://readability.com/api/content/v1/confidence</a></li></ul></li><li><a href="#representations">Representations</a><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul></li><li><a href="#faults">Errors</a><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li></ul></li></ul><h2 id="resources">Resources</h2><div class="resource"><h3 id="idp3728">/</h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp5008">GET</h4>
                            Retrieve the base API URI - information about subresources.
                        <h6>request header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>Authorization</strong></p></td><td><p><em><a href="" title=""></a></em><small> (required)</small></p></td><td></td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp4080">/parser?token<span class="optional">&amp;url</span><span class="optional">&amp;id</span><span class="optional">&amp;max_pages</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp36384">GET</h4>
                            Parse an article
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to return the content for.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to return the content for.</td></tr><tr><td><p><strong>max_pages</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#integer">integer</a></em></p></td><td>The maximum number of pages to parse and combine. Default is 25.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div><div class="method"><h4 id="idp63552">HEAD</h4>
                            <p>
                                Retrieve the Content Status of an article. This is useful if you want to save yourself from POSTing a large html document. You can do a HEAD request on the resource, and check for the status of the article in the X-Article-Status header. <strong>Additionally, if we've never seen the article before, we'll return a 404, which also means you should POST.</strong>
                            </p>
                        <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to check.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to check.</td></tr></table><h6>response header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>X-Article-Id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The ID of the article within Readablity.</p>
                            </td></tr><tr><td><p><strong>X-Article-Status</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>
                                <p>The status of the content in Readability. One of:</p>
                                <dl>
                                <dt>INVALID</dt>
                                <dd>We were unable to parse this URL for some reason. <em>Recommendation: Fail</em></dd>
                                <dt>UNRETRIEVED</dt>
                                <dd>We know of this article, but have not yet retrieved its content, or the cache has expired. <em>Recommendation: POST content to us</em></dd>
                                <dt>PROVIDED_BY_USER</dt>
                                <dd>We have retrieved the content for this URL from at least one user. <em>Recommendation: POST content to us</em></dd>
                                <dt>VALIDATED_BY_USERS</dt>
                                <dd>We have retrieved the content for this URL from multiple users, and have validated it. <em>Recommendation: GET the content from us.</em></dd>
                                <dt>FETCHED</dt>
                                <dd>We fetched the content for this URL manually, and it has been cached. <em>Recommendation:GET the content from us.</em></dd>
                                </dl>
                            </td></tr></table><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp39744">/confidence?url<span class="optional">&amp;callback</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp89296">GET</h4>Detect the confidence with which Readability could parse a given URL. Does not require a token.<h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td>The URL of an article to return the confidence for.</td></tr><tr><td><p><strong>callback</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The jsonp callback function name.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><h2 id="representations">Representations</h2><h3 id="https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "resources": {
                "parser": {
                    "description": "The Content Parser Resource",
                    "href": "/api/content/v1/parser"
                }
            }
        }
                    </pre>
                <h3 id="https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "content" &lt;div class=\"article-text\"&gt;\n&lt;p&gt;I'm idling outside Diamante's, [snip] ...&lt;/p&gt;&lt;/div&gt;",
            "domain": "www.gq.com",
            "author": "Rafi Kohan",
            "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all",
            "short_url": "http://rdd.me/g3jcb1sr",
            "title": "Blowing Smoke with Boxing's Big Voice",
            "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&amp;hellip;",
            "direction": "ltr",
            "word_count": 2892,
            "total_pages": 1,
            "date_published": null,
            "dek": "Announcer &lt;strong&gt;David Diamante&lt;/strong&gt;, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars",
            "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg",
            "next_page_id": null,
            "rendered_pages": 1
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        {
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        }

        </pre>
                <h3 id="https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</h3>
                    <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint">
        callback({
            "url": "http://www.gq.com/article/12",
            "confidence": .7
        });

        </pre>
                <h2 id="faults">Errors</h2><h3 id="https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</h3>
                    The server could not understand your request. Verify that request parameters (and content, if any) are valid.
                <h3 id="https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</h3>
                    <p>
                        Authentication failed or was not provided. Verify that you have sent valid ixDirectory credentials via HTTP Basic.
                    </p>
                    <p>A 'Www-Authenticate' challenge header will be sent with this type of error response.</p>
                <h3 id="https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</h3>
                    An unknown error has occurred.
                <h3 id="https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</h3>
                    The resource that you requested does not exist.
                </body></html>
        """
        url = 'http://readability.com/developers/api/parser#https://readability.com/api/content/v1#test_suite'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status, 200)
        # should have gotten back content that is shorter than original
        self.assertTrue(len(content) > len(response.content['content']))
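
The embedded API documentation above spells out the X-Article-Status workflow; here is a sketch of how a caller might act on those recommendations with the older client interface used in this example (the function name and structure are illustrative, not part of the original):

def fetch_or_post_article(parser_client, url, raw_html):
    # HEAD request: ask Readability whether it already knows this article.
    response = parser_client.get_article_status(url=url)
    if response.status == 404:
        # Never seen before; the docs above recommend POSTing the content.
        return parser_client.post_article_content(raw_html, url)
    status = response.get('x-article-status')
    if status in ('VALIDATED_BY_USERS', 'FETCHED'):
        # Readability has validated or fetched content; GET it.
        return parser_client.get_article_content(url=url)
    if status in ('UNRETRIEVED', 'PROVIDED_BY_USER'):
        # Content is missing or unvalidated; the docs recommend POSTing it.
        return parser_client.post_article_content(raw_html, url)
    # INVALID: the docs recommend treating this as a failure.
    return None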
Example #28
#!/usr/bin/env python
from readability import ParserClient
import pystache
import sys
import re
from ftfy import fix_text
import codecs

# readability api key
parser = ParserClient('877f0069c46e0603a7d5868fab7d50731817dd9f')

# thanks to http://jamesmurty.com/2011/12/30/python-code-utf8-to-latin1/
def strip(html):
	# Replace "smart" and other single-quote like things
	html = re.sub(u'[\u02bc\u2018\u2019\u201a\u201b\u2039\u203a\u300c\u300d]', "'", html)
	# Replace "smart" and other double-quote like things
	html = re.sub(u'[\u00ab\u00bb\u201c\u201d\u201e\u201f\u300e\u300f]', '"', html)
	# Replace copyright symbol
	html = re.sub(u'[\u00a9\u24b8\u24d2]', '(c)', html)
	# Replace registered trademark symbol
	html = re.sub(u'[\u00ae\u24c7]', '(r)', html)
	# Replace sound recording copyright symbol
	html = re.sub(u'[\u2117\u24c5\u24df]', '(p)', html)
	# Replace service mark symbol
	html = re.sub(u'[\u2120]', '(sm)', html)
	# Replace trademark symbol
	html = re.sub(u'[\u2122]', '(tm)', html)
	# Replace en and em dashes with their HTML entities
	html = re.sub(u'[\u2013]', '&ndash;', html)
	html = re.sub(u'[\u2014]', '&mdash;', html)
	# Replace the soft hyphen (U+00AD) with its HTML entity
	html = re.sub(u'[\xad]', '&shy;', html)
	return html
Example #29
def setUp(self):
    self.parser_client = ParserClient(PARSER_TOKEN)
    self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'
Example #30
def __init__(self, parser_token=None):
    if not parser_token:
        raise Exception(
            "Get a Readability parser token at: https://www.readability.com/developers/api"
        )
    self.parser_client = ParserClient(token=parser_token)
Example #31
class ParserClientTest(unittest.TestCase):
    """
    Test case for the Parser Client
    """
    def setUp(self):
        self.parser_token = required_from_env('READABILITY_PARSER_TOKEN')
        self.parser_client = ParserClient(token=self.parser_token)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """
        Test the client's ability to generate URLs to endpoints.
        """
        # Test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{}?token={}'.format(expected_url, self.parser_token)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        # Test parser resource
        expected_url = '{base_url}?token={token}&url=http%3A%2F%2Fwww.google.biz%2Fblog.html'.format(
            base_url=DEFAULT_PARSER_URL_TEMPLATE.format('parser'),
            token=self.parser_token)
        params = {'url': 'http://www.google.biz/blog.html'}
        generated_url = self.parser_client._generate_url(
            'parser', query_params=params)

        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """
        Test the client's ability to hit the root endpoint.
        """
        response = self.parser_client.get_root()

        expected_keys = set(['resources', ])
        self.assertEqual(set(response.json().keys()), expected_keys)

    def test_get_confidence(self):
        """
        Test the client's ability to hit the confidence endpoint.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        expected_keys = set(['url', 'confidence'])

        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(set(response.json().keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.json()['confidence'] >= .5)

    def test_get_article_status(self):
        """
        Test the client's ability to hit the parser endpoint with a HEAD request.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.headers.get('x-article-status') is not None)
        self.assertTrue(response.headers.get('x-article-id') is not None)

    def test_get_article(self):
        """
        Test the client's ability to hit the parser endpoint with a GET request.
        """
        # test with incorrect params
        response = self.parser_client.get_article()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article(url=self.test_url)
        self.assertEqual(response.status_code, 200)

        some_expected_keys = set(['content', 'domain', 'author', 'word_count',
            'title', 'total_pages'])
        self.assertTrue(
            some_expected_keys.issubset(set(response.json().keys())))

    def test_post_article_content(self):
        """
        Test the client's ability to hit the parser endpoint with a POST
        request.
        """
        content = load_test_content('content/test_post_content.html')
        url = 'http://thisisaurlthatdoesntmatterbutmustbepassedanyway.com/article.html'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status_code, 200)
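
The test class above leans on two helpers that the snippet does not show, required_from_env and load_test_content; minimal versions consistent with how they are called might look like this (both implementations are assumptions, not the library's actual code):

import os

def required_from_env(key):
    # Read a required setting from the environment; fail loudly if it is absent.
    value = os.environ.get(key)
    if not value:
        raise KeyError('Missing required environment variable: {}'.format(key))
    return value

def load_test_content(relative_path):
    # Load an HTML fixture that lives alongside the test module.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(base_dir, relative_path)) as f:
        return f.read()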
Example #32
from flask import Flask, request, jsonify, make_response
from werkzeug.contrib.cache import MemcachedCache

#from readability.readability import Document
#import requests
from readability import ParserClient

application = Flask(__name__)

cache = MemcachedCache(['memcache:11211'])
parser_client = ParserClient()


@application.route("/")
def hello():
    id = request.args.get('id', '')
    url = request.args.get('url', '')

    result = cache.get(id)
    if result is None:
        try:
            parser_response = parser_client.get_article(url)
            result = parser_response.json()
        except Exception as e:
            print e
            result = {
                "title": "Error",
                "content": "<h3>Unable to fetch article's content!</h3>",
            }
        result['summary'] = '<h3>You are using the out-dated Hacker News app, please update to latest version!</h3>'
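        # Assumed completion (the original example is cut off here): cache the
        # parsed result and return it as JSON, using only the imports already
        # present in this snippet. The one-hour TTL is an assumption.
        cache.set(id, result, timeout=60 * 60)
    return make_response(jsonify(result))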
Example #33
#
# for tag in soup.findAll('p'):
#     print tag

import os
from readability import ParserClient


os.environ['READABILITY_PARSER_TOKEN'] = 'c4e591e3f00ed1512c8194ab6616cf826d155294'


# READABILITY_PARSER_TOKEN='c4e591e3f00ed1512c8194ab6616cf826d155294'
token = "c4e591e3f00ed1512c8194ab6616cf826d155294"

from readability import ParserClient
client = ParserClient(token=token)

parser_client = ParserClient(token)
parser_response = client.get_article('http://paulgraham.com/altair.html')
article = parser_response.json()

print(article['title'])
print(article['content'])


parser_response = client.get_article("http://www.politico.com/story/2016/03/rubio-wins-dc-caucuses-220681")
article = parser_response.json()
print(article['title'])
print(article['content'])

Example #34
def get_page_content():
    parser_client = ParserClient(readability_api_key)
    parser_response = parser_client.get_article_content(url)
    return parser_response
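
This fragment closes over readability_api_key and url from an enclosing scope that the example does not show; a self-contained equivalent might look like this (the parameter names are placeholders introduced here):

from readability import ParserClient

def get_page_content(url, readability_api_key):
    # Same call as the fragment above, with the closed-over names made explicit.
    parser_client = ParserClient(readability_api_key)
    return parser_client.get_article_content(url)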
Example #35
def __init__(self, parser_token=None):
    if not parser_token:
        raise Exception("Get a Readability parser token at: https://www.readability.com/developers/api")
    self.parser_client = ParserClient(token=parser_token)
Example #36
	except:
		print text
		return text

def getDomainName(domain):
	try:
		return domainToName[domain]
	except KeyError:
		return domain

results = []  # collected (url, title, author, article, date) tuples
with open('url.csv') as csvfile:
	urls = csvfile.read().split('\r\n')
	for url in urls:
		article = " "
		try:
			parser_client = ParserClient('dab74f9def9312c90473befef4181cf66bab7321')
			parser_response = parser_client.get_article_content(url)
			s = parser_response.content['content']
			x = parser_response.content['date_published']
			title = parser_response.content['title']
			author = parser_response.content['author']
			article = re.sub('href', '', s)
			article1 = re.sub('<img.*?>', '', article)
			article2 = re.sub('<p>','<br>', article1)
			article3 = re.sub('</p>','<br />', article2)
		except:
			print 'fail', url
		results += [(tounicode(url), tounicode(title), tounicode(author), tounicode(article3), timeconvert(x))]
output = time.strftime('articles-%x.html').replace('/', '_')
with open(output, 'w') as outputfile:
	for result in results:
Example #37
import mmh3
import csv
import os
from readability import ParserClient
PC = ParserClient(os.getenv('READABILITY_API_TOKEN'))

STOPWORDS = [
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'being', 'because',
    'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'to', 'too', 'us', 'wants', 'was', 'we', 'were', 'what',
    'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'would', 'yet', 'you', 'your'
]
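
The STOPWORDS list is presumably used to drop common words before building feature vectors; a minimal illustration of that filtering (the content_words helper is hypothetical, and whitespace tokenization is an assumption):

def content_words(text):
    # Keep lowercase alphabetic tokens that are not stopwords.
    return [w for w in text.lower().split() if w.isalpha() and w not in STOPWORDS]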


def add_post_dicts_to_csv(list_of_post_dictionaries):
    # Writes all important data to a CSV file so feature vectors can be calculated later without hitting the Readability API.
    with open('post_dictionaries.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter="\n")
        writer.writerow(list_of_post_dictionaries)


def call_readability(url):
    # Assumed body (the original example is cut off here): fetch the parsed
    # article for a URL with the module-level client defined above.
    return PC.get_article_content(url)