Example #1
    def parse_entry(self, entry):
        """
        Parse an entry in an RSS feed.
        """
        entry_url = self.get_url(entry)

        # merge description with body
        body = self.get_body(entry)
        description = self.get_description(entry)
        if not body:
            body = description
            description = None

        return {
            'id': entry.id,
            'url': entry_url,
            'domain': url.get_domain(entry_url),
            'body': html.prepare(body, entry_url),
            'title': self.get_title(entry),
            'description': html.prepare(description, entry_url),
            'tags': self.get_tags(entry),
            'authors': self.get_authors(entry),
            'created': self.get_created(entry),
            'img_url': self.get_img_url(entry, body),
            'links': self.get_links(body, entry_url)
        }
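
A minimal usage sketch for the parser above, assuming the entries come from feedparser and that the enclosing class (called FeedSource here, a made-up name) provides the get_* helpers:

# Hypothetical usage; FeedSource and the feed URL are illustrative
# assumptions, not part of the original code.
import feedparser

feed = feedparser.parse('http://example.com/rss.xml')
source = FeedSource()  # assumed class that defines parse_entry()
items = [source.parse_entry(entry) for entry in feed.entries]
for item in items:
    print(item['domain'], item['title'])
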
Example #2
 def __init__(self, **kw):
     self.source_id = str(kw.get('source_id'))
     self.recipe_id = kw.get('recipe_id')
     self.org_id = kw.get('org_id')
     self.status = kw.get('status', 'pending')
     self.provenance = kw.get('provenance', 'recipe')
     self.url = kw.get('url')
     self.domain = kw.get('domain', url.get_domain(kw.get('url', None)))
     self.img_url = kw.get('img_url')
     self.thumbnail = kw.get('thumbnail')
     self.created = kw.get('created', dates.now())
     self.title = kw.get('title')
     self.description = kw.get('description')
     self.body = kw.get('body')
     self.authors = kw.get('authors', [])
     self.meta = kw.get('meta', {})
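
A short construction sketch, assuming this __init__ belongs to a content-item style model (the class name ContentItem is an assumption):

# Hypothetical usage; ContentItem is an assumed name for the class
# that defines the __init__ above.
item = ContentItem(
    source_id=123,
    recipe_id=7,
    org_id=1,
    url='http://www.nytimes.com/some-article',
    title='Some Article')

print(item.status)   # 'pending' (default)
print(item.domain)   # 'nytimes.com', via url.get_domain()
print(item.authors)  # [] (default)
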
Example #3
def _prepare(obj, requires=[], recipe=None, type='event', org_id=None, extract=True):
    """
    Prepare a content item or an event.
    """

    # check required fields
    _check_requires(obj, requires, type=type)

    # validate status
    if type == 'event':
        if 'status' in obj:
            if obj.get('status', None) not in EVENT_STATUSES:
                raise RequestError(
                    'Invalid event status: {status}'.format(**obj))
            if obj['status'] == 'deleted':
                raise RequestError(
                    'You cannot create an Event with status of "deleted."')

    # validate type
    if type == 'content_item':
        if obj.get('type', None) not in CONTENT_ITEM_TYPES:
            raise RequestError(
                'Invalid content item type: {type}'.format(**obj))

    # get rid of ``id`` and ``org_id`` if they somehow got in here.
    obj.pop('id', None)
    obj.pop('org_id', None)

    # normalize the url
    if type == 'event':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=False)

    elif type == 'content_item':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=True)

    # sanitize creation date
    obj['created'] = _prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = _prepare_str(obj, 'title', obj['url'])
    obj['description'] = _prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = _prepare_str(obj, 'body', obj['url'])

    # set org id
    obj['org_id'] = org_id

    # check img url
    if not url.validate(obj.get('img_url', None)):
        obj['img_url'] = None

    # determine provenance.
    obj = _provenance(obj, recipe, type)

    # if type is content_item and we're extracting, do it.
    if type == 'content_item' and extract and obj.get('url', None):
        cr = extract_cache.get(obj.get('url'), type=obj.get('type', None))

        if not cr.value:
            extract_cache.invalidate(
                obj.get('url'), type=obj.get('type', None))

        # merge extracted data with object.
        else:
            # merge extracted fields into the object.
            for k, v in cr.value.items():
                if not obj.get(k, None):
                    obj[k] = v
                # prefer extracted data for these fields
                if k in ['description', 'body']:
                    obj[k] = v
                elif k == 'authors':
                    if k not in obj:
                        obj[k] = v
                    else:
                        for vv in v:
                            if vv not in obj[k]:
                                obj[k].append(vv)

            # swap bad images.
            tn = _prepare_thumbnail(obj, 'img_url')
            if not tn:
                img = cr.value.get('img_url', None)
                if img:
                    obj['img_url'] = img
                    obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')
            else:
                obj['thumbnail'] = tn
    else:
        obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

    # set domain
    obj['domain'] = url.get_domain(obj['url'])

    # return prepped object
    return obj
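
A hedged sketch of calling _prepare for a content item. The field and argument names follow the function above; the concrete values (and skipping extraction) are illustrative choices:

# Illustrative call only; the values are made up.
raw = {
    'type': 'article',
    'url': 'http://www.nytimes.com/some-article',
    'title': 'Some Article',
    'body': '<p>Body text.</p>',
}
prepped = _prepare(
    raw,
    requires=['url', 'type'],
    type='content_item',
    org_id=1,
    extract=False)  # skip the extraction cache for this sketch

print(prepped['org_id'])  # 1
print(prepped['domain'])  # 'nytimes.com'
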
Example #4
def extract(source_url):
    """
    Article extraction. Method is as follows:
    1. Get HTML from the URL.
    2. Canonicalize the URL.
    3. If not canonical, prepare the URL.
    4. Extract meta tags.
    5. If Embedly is active, use it for content extraction.
    6. If Embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)
    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

        # remove site name from authors
        if data.get('site_name'):
            data['authors'] = [
                a.replace(data['site_name'].upper(), "").strip()
                for a in data['authors']
            ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
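
And a minimal sketch of calling extract; the URL is a placeholder, and the keys printed are the ones assembled in the data dict above:

# Illustrative usage; the URL is a placeholder.
data = extract('http://www.nytimes.com/some-article')
if data is None:
    print('fetch failed; extract() returned None')
else:
    print(data['url'])       # canonical URL
    print(data['domain'])    # e.g. 'nytimes.com'
    print(data['title'])
    print(data['authors'])
    print(data['body'] is not None)
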
Example #5
 def test_domain_rm_www(self):
     u = 'http://www.nytimes.com'
     assert(url.get_domain(u) == 'nytimes.com')
Example #6
 def test_domain_rm_www(self):
     u = 'http://www.nytimes.com'
     assert (url.get_domain(u) == 'nytimes.com')
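
The two tests above pin down one behavior of url.get_domain(): it drops the scheme and a leading "www.". A minimal sketch of a helper with that behavior, written as an assumption rather than the library's actual implementation:

# Sketch only; not the real url.get_domain() implementation.
from urllib.parse import urlparse  # urlparse.urlparse on Python 2

def get_domain_sketch(u):
    netloc = urlparse(u).netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    return netloc

assert get_domain_sketch('http://www.nytimes.com') == 'nytimes.com'
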
Example #7
def extract(source_url):
    """
    Article extraction. Method is as follows:
    1. Get HTML from the URL.
    2. Canonicalize the URL.
    3. If not canonical, prepare the URL.
    4. Extract meta tags.
    5. If Embedly is active, use it for content extraction.
    6. If Embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(source_url,
                                    source=source_url,
                                    canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # extract body from embedly + readability
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)

    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

        # remove site name from authors
        if data.get('site_name'):
            data['authors'] = [
                a.replace(data['site_name'].upper(), "").strip()
                for a in data['authors']
            ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
Example #8
def _prepare(obj,
             requires=[],
             recipe=None,
             type='event',
             org_id=None,
             extract=True):
    """
    Prepare a content item or an event.
    """

    # check required fields
    _check_requires(obj, requires, type=type)

    # validate status
    if type == 'event':
        if 'status' in obj:
            if obj.get('status', None) not in EVENT_STATUSES:
                raise RequestError(
                    'Invalid event status: {status}'.format(**obj))
            if obj['status'] == 'deleted':
                raise RequestError(
                    'You cannot create an Event with status of "deleted."')

    # validate type
    if type == 'content_item':
        if obj.get('type', None) not in CONTENT_ITEM_TYPES:
            raise RequestError(
                'Invalid content item type: {type}'.format(**obj))

    # get rid of ``id`` and ``org_id`` if they somehow got in here.
    obj.pop('id', None)
    obj.pop('org_id', None)

    # normalize the url
    if type == 'event':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=False)

    elif type == 'content_item':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=True)

    # sanitize creation date
    obj['created'] = _prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = _prepare_str(obj, 'title', obj['url'])
    obj['description'] = _prepare_str(obj, 'description', obj['url'])
    obj['body'] = _prepare_str(obj, 'body', obj['url'])

    # set org id
    obj['org_id'] = org_id

    # check img url
    if not url.validate(obj.get('img_url', None)):
        obj['img_url'] = None

    # determine provenance.
    obj = _provenance(obj, recipe, type)

    # if type is content_item and we're extracting, do it.
    if type == 'content_item' and extract and obj.get('url', None):
        cr = extract_cache.get(obj.get('url'), type=obj.get('type', None))

        if not cr.value:
            extract_cache.invalidate(obj.get('url'),
                                     type=obj.get('type', None))

        # merge extracted data with object.
        else:
            # merge extracted fields into the object.
            for k, v in cr.value.items():
                if not obj.get(k, None):
                    obj[k] = v
                # prefer extracted data for these fields
                if k in ['description', 'body']:
                    obj[k] = v
                elif k == 'authors':
                    if k not in obj:
                        obj[k] = v
                    else:
                        for vv in v:
                            if vv not in obj[k]:
                                obj[k].append(vv)

            # swap bad images.
            tn = _prepare_thumbnail(obj, 'img_url')
            if not tn:
                img = cr.value.get('img_url', None)
                if img:
                    obj['img_url'] = img
                    obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')
            else:
                obj['thumbnail'] = tn
    else:
        obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

    # set domain
    obj['domain'] = url.get_domain(obj['url'])

    # return prepped object
    return obj