Python extract Examples, newslynx.lib.article.extract Python Examples

Example #1

0

Show file

File: test_extraction.py Project: newslynx/newslynx-core

 def test_reveal(self):
     """Extract a Reveal article and verify each field of the result."""
     source_url = 'https://www.revealnews.org/article/a-brief-history-of-the-modern-strawberry/'
     extracted = article.extract(source_url)

     # Byline and basic page metadata.
     assert 'ARIANE WU' in extracted['authors']
     assert extracted['page_type'] == 'article'
     assert extracted['title'] == 'A Brief History of the Modern Strawberry'

     expected_description = 'This short stop-motion animation explains how clever advertising tactics and certain pesticides helped make the strawberry cheaply and widely available in the U.S.'
     assert extracted['description'] == expected_description

     # Site-level metadata.
     assert extracted['domain'] == 'revealnews.org'
     assert extracted['site_name'] == 'Reveal'

     # The creation timestamp is compared as a timezone-aware UTC datetime.
     expected_created = datetime.datetime(2014, 11, 11, 0, 57, tzinfo=pytz.utc)
     assert extracted['created'] == expected_created

     # Asset urls.
     expected_favicon = 'https://www.revealnews.org/wp-content/themes/reveal2015/static/images/cir/favicon.ico'
     assert extracted['favicon'] == expected_favicon
     expected_img_url = 'https://www.revealnews.org/wp-content/uploads/2015/02/Strawberry-CA0.png'
     assert extracted['img_url'] == expected_img_url

     # A snippet of the body text, and the url must come back unchanged.
     body_snippet = 'it seems that strawberries are served with just about everything'
     assert body_snippet in extracted['body']
     assert extracted['url'] == source_url

Example #2

0

Show file

File: google_alerts.py Project: newslynx/newslynx-sc-rss

    def format(self, obj):
        """
        Normalize a raw event dict in place and return it.

        Sets the status, prepares the url, merges extracted article data
        when the url is an article, derives ``source_id`` and applies the
        ``set_event_*`` options. Returns ``None`` when
        ``self._is_bad_domain`` flags the prepared url, or when the
        ``must_link`` option is set and ``obj`` has no links.
        """
        # set the status.
        obj['status'] = self.options.get('event_status', 'pending')

        # prepare url (these are formatted as redirects).
        prepared_url = url.prepare(
            obj['url'], expand=False, canonicalize=False)
        obj['url'] = prepared_url

        # ignore bad domains / org's own domains.
        if self._is_bad_domain(prepared_url):
            return None

        # extract and merge article data, then drop the keys we do not keep.
        if url.is_article(prepared_url):
            extracted = article.extract(prepared_url, type=None)
            if extracted:
                obj.update(extracted)
                for unwanted in ('type', 'site_name', 'favicon'):
                    obj.pop(unwanted, None)

        # set source id: prefer 'id', then 'url', then a generated uuid.
        # Only the part after the last ':' is kept.
        fallback_id = obj.get('url', gen_uuid())
        raw_id = obj.pop('id', fallback_id)
        obj['source_id'] = raw_id.split(':')[-1] if ":" in raw_id else raw_id

        # TODO: Make formatting more elegant.
        # The title is applied before the description, so the description
        # template is formatted against the already-updated obj.
        template_targets = (
            ('set_event_title', 'title'),
            ('set_event_description', 'description'),
        )
        for option_key, target_field in template_targets:
            template = self.options.get(option_key, None)
            if template:
                obj[target_field] = template.format(**self._fmt(obj))

        tag_ids = self.options.get('set_event_tag_ids', None)
        if tag_ids and len(tag_ids):
            obj['tag_ids'] = tag_ids

        # hack because the app cant handle this field being a list.
        content_items = self.options.get('set_event_content_items', None)
        if content_items:
            item_ids = obj.setdefault('content_item_ids', [])
            for item in content_items:
                if isinstance(item, dict):
                    item_id = item.get('id', None)
                    if item_id:
                        item_ids.append(item_id)
                elif isinstance(item, int):
                    item_ids.append(item)

        # filter links.
        must_link = self.options.get('must_link', False)
        if must_link and len(obj.get('links', [])) == 0:
            return None
        return obj

Example #3

0

Show file

File: test_extraction.py Project: abelsonlive/newslynx-core

 def test_nytimes(self):
     """Extract a NYTimes article and verify each field of the result."""
     source_url = 'http://www.nytimes.com/2015/06/05/fashion/mens-style/farewell-my-lovely-cigarettes.html?smid=tw-share&_r=0'
     extracted = article.extract(source_url)

     assert 'CHOIRE SICHA' in extracted['authors']

     # Fields that must match exactly, checked in this order.
     exact_fields = (
         ('page_type', 'article'),
         ('title', 'Farewell, My Lovely Cigarettes'),
         ('description', 'A lifelong smoker takes his final puff and looks back on a 30-year habit.'),
         ('domain', 'nytimes.com'),
         ('site_name', 'Nytimes'),
         ('created', datetime.datetime(2015, 6, 3, 0, 0, tzinfo=pytz.utc)),
         ('favicon', 'http://static01.nyt.com/favicon.ico'),
         ('img_url', 'http://static01.nyt.com/images/2015/06/05/fashion/05RITESOFPASSAGE1/05RITESOFPASSAGE1-facebookJumbo.jpg'),
     )
     for field, expected in exact_fields:
         assert extracted[field] == expected

     assert 'Someone could easily get cut' in extracted['body']

     # The expected url is the source url without its query string.
     assert extracted['url'] == 'http://www.nytimes.com/2015/06/05/fashion/mens-style/farewell-my-lovely-cigarettes.html'

Example #4

0

Show file

File: test_extraction.py Project: abelsonlive/newslynx-core

 def test_reveal(self):
     """Extract a Reveal article and verify each field of the result."""
     source_url = 'https://www.revealnews.org/article/a-brief-history-of-the-modern-strawberry/'
     extracted = article.extract(source_url)

     assert 'ARIANE WU' in extracted['authors']

     # Fields that must match exactly, checked in this order.
     exact_fields = (
         ('page_type', 'article'),
         ('title', 'A Brief History of the Modern Strawberry'),
         ('description', 'This short stop-motion animation explains how clever advertising tactics and certain pesticides helped make the strawberry cheaply and widely available in the U.S.'),
         ('domain', 'revealnews.org'),
         ('site_name', 'Reveal'),
         ('created', datetime.datetime(2014, 11, 11, 0, 57, tzinfo=pytz.utc)),
         ('favicon', 'https://www.revealnews.org/wp-content/themes/reveal2015/static/images/cir/favicon.ico'),
         ('img_url', 'https://www.revealnews.org/wp-content/uploads/2015/02/Strawberry-CA0.png'),
     )
     for field, expected in exact_fields:
         assert extracted[field] == expected

     assert 'it seems that strawberries are served with just about everything' in extracted['body']

     # The url must come back unchanged.
     assert extracted['url'] == source_url

Example #5

0

Show file

File: test_extraction.py Project: abelsonlive/newslynx-core

    def test_propublica(self):
        """Extract a ProPublica article and verify each field of the result."""
        source_url = 'http://www.propublica.org/article/congress-to-consider-scaling-down-group-homes-for-troubled-children'
        extracted = article.extract(source_url)

        # Exactly one author is expected.
        assert ['JOAQUIN SAPIEN'] == extracted['authors']

        # Fields that must match exactly, checked in this order.
        exact_fields = (
            ('page_type', 'article'),
            ('title', 'Congress to Consider Scaling Down Group Homes for Troubled Children'),
            ('description', 'At a hearing in Washington, a renewed call for addressing the violence and neglect that plagues group homes for foster youth.'),
            ('domain', 'propublica.org'),
            ('site_name', 'ProPublica'),
            ('created', datetime.datetime(2015, 5, 20, 17, 47, 13, tzinfo=pytz.utc)),
        )
        for field, expected in exact_fields:
            assert extracted[field] == expected

        # The favicon is only checked by substring.
        assert 'www.propublica.org/favicon.ico' in extracted['favicon']
        assert extracted['img_url'] == 'http://www.propublica.org/images/ngen/gypsy_og_image/20150520-group-home-hearing-1200x630.jpg'

        body_snippet = 'finding that children had repeatedly been sent to facilities that were rife with abuse and that had become known recruiting grounds for pimp'
        assert body_snippet in extracted['body']
        assert extracted['url'] == source_url

Example #6

0

Show file

 def work(self, url, type='article'):
     """
     Extract data for `url` via `article.extract`, forwarding `type`,
     and return whatever it returns.

     NOTE(review): no caching or url standardization happens in this
     method itself; presumably the enclosing class handles that - confirm.
     """
     extracted = article.extract(url, type=type)
     return extracted

Example #7

0

Show file

File: extract_cache.py Project: abelsonlive/newslynx-core

 def work(self, url, type='article'):
     """
     Extract data for `url` via `article.extract`, forwarding `type`,
     and return whatever it returns.

     NOTE(review): no caching or url standardization happens in this
     method itself; presumably the enclosing class handles that - confirm.
     """
     extracted = article.extract(url, type=type)
     return extracted

Example #8

0

Show file

File: work_cache.py Project: jjelosua/newslynx-core

 def _extract(self, url, type):
     if type == 'article':
         return article.extract(url)
     else:
         raise NotImplemented(
             "NewsLynx only has support for Article Extraction.")

Example #9

0

Show file

File: test_extraction.py Project: abelsonlive/newslynx-core

 def test_multiple_authors(self):
     """Extract a ProPublica article and check that two authors are found."""
     source_url = 'http://www.propublica.org/article/new-snowden-documents-reveal-secret-memos-expanding-spying'
     extracted = article.extract(source_url)
     # A missing 'authors' key counts as zero authors.
     authors = extracted.get('authors', [])
     assert len(authors) == 2

Example #10

0

Show file

File: work_cache.py Project: jjelosua/newslynx-core

 def _extract(self, url, type):
     if type == 'article':
         return article.extract(url)
     else:
         raise NotImplemented(
             "NewsLynx only has support for Article Extraction.")

Example #11

0

Show file

File: test_extraction.py Project: newslynx/newslynx-core

 def test_multiple_authors(self):
     """Extract a ProPublica article and check that two authors are found."""
     source_url = 'http://www.propublica.org/article/new-snowden-documents-reveal-secret-memos-expanding-spying'
     extracted = article.extract(source_url)
     # A missing 'authors' key counts as zero authors.
     authors = extracted.get('authors', [])
     assert len(authors) == 2