def parse_entry(self, entry):
    """
    Parse an entry in an RSS feed.
    """
    entry_url = self.get_url(entry)

    # merge description with body
    body = self.get_body(entry)
    description = self.get_description(entry)
    if not body:
        body = description
        description = None

    return {
        'id': entry.id,
        'url': entry_url,
        'domain': url.get_domain(entry_url),
        'body': html.prepare(body, entry_url),
        'title': self.get_title(entry),
        'description': html.prepare(description, entry_url),
        'tags': self.get_tags(entry),
        'authors': self.get_authors(entry),
        'created': self.get_created(entry),
        'img_url': self.get_img_url(entry, body),
        'links': self.get_links(body, entry_url)
    }
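# Usage sketch (not from the original source): assumes ``parse_entry`` is a
# method on a feed-ingesting class, here called ``FeedParser`` purely for
# illustration, and that entries come from the ``feedparser`` library, whose
# entry objects expose the ``id`` attribute accessed above.
#
#   import feedparser
#
#   feed = feedparser.parse('http://example.com/rss.xml')
#   parser = FeedParser()
#   items = [parser.parse_entry(entry) for entry in feed.entries]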
def __init__(self, **kw):
    self.source_id = str(kw.get('source_id'))
    self.recipe_id = kw.get('recipe_id')
    self.org_id = kw.get('org_id')
    self.status = kw.get('status', 'pending')
    self.provenance = kw.get('provenance', 'recipe')
    self.url = kw.get('url')
    # fall back to deriving the domain from the url. note that the fallback
    # is evaluated even when 'domain' is passed, since it is a function argument.
    self.domain = kw.get('domain', url.get_domain(kw.get('url')))
    self.img_url = kw.get('img_url')
    self.thumbnail = kw.get('thumbnail')
    self.created = kw.get('created', dates.now())
    self.title = kw.get('title')
    self.description = kw.get('description')
    self.body = kw.get('body')
    self.authors = kw.get('authors', [])
    self.meta = kw.get('meta', {})
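# Usage sketch (illustrative): assumes this constructor belongs to an
# event/content model class, named ``SomeEvent`` here only for illustration.
#
#   event = SomeEvent(
#       source_id=123,
#       org_id=1,
#       url='http://www.nytimes.com/2014/01/01/some-story.html',
#       title='Some Story')
#   event.status   # -> 'pending' (default)
#   event.domain   # -> 'nytimes.com', derived via url.get_domain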
def _prepare(obj, requires=[], recipe=None, type='event', org_id=None, extract=True):
    """
    Prepare a content item or an event.
    """
    # check required fields
    _check_requires(obj, requires, type=type)

    # validate status
    if type == 'event':
        if 'status' in obj:
            if obj.get('status') not in EVENT_STATUSES:
                raise RequestError(
                    'Invalid event status: {status}'.format(**obj))
            if obj['status'] == 'deleted':
                raise RequestError(
                    'You cannot create an Event with a status of "deleted."')

    # validate type
    if type == 'content_item':
        if obj.get('type') not in CONTENT_ITEM_TYPES:
            raise RequestError(
                'Invalid content item type: {type}'.format(**obj))

    # get rid of ``id`` / ``org_id`` if they somehow got in here.
    obj.pop('id', None)
    obj.pop('org_id', None)

    # normalize the url
    if type == 'event':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=False)
    elif type == 'content_item':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=True)

    # sanitize creation date
    obj['created'] = _prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = _prepare_str(obj, 'title', obj['url'])
    obj['description'] = _prepare_str(obj, 'description', obj['url'])
    obj['body'] = _prepare_str(obj, 'body', obj['url'])

    # set org id
    obj['org_id'] = org_id

    # check img url
    if not url.validate(obj.get('img_url')):
        obj['img_url'] = None

    # determine provenance.
    obj = _provenance(obj, recipe, type)

    # if the type is content_item and we're extracting, do it.
    if type == 'content_item' and extract and obj.get('url'):
        cr = extract_cache.get(obj.get('url'), type=obj.get('type'))
        if not cr.value:
            # extraction failed; drop the cached miss so it can be retried.
            extract_cache.invalidate(obj.get('url'), type=obj.get('type'))

        # merge extracted data with the object.
        else:
            for k, v in cr.value.items():
                if not obj.get(k):
                    obj[k] = v
                # prefer extracted data for these fields
                if k in ['description', 'body']:
                    obj[k] = v
                # merge extracted authors.
                elif k == 'authors':
                    if k not in obj:
                        obj[k] = v
                    else:
                        for vv in v:
                            if vv not in obj[k]:
                                obj[k].append(vv)

            # swap bad images.
            tn = _prepare_thumbnail(obj, 'img_url')
            if not tn:
                img = cr.value.get('img_url')
                if img:
                    obj['img_url'] = img
                    obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')
            else:
                obj['thumbnail'] = tn
    else:
        obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

    # set domain
    obj['domain'] = url.get_domain(obj['url'])

    # return the prepped object
    return obj
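# Usage sketch (illustrative; the field values are made up, and it is an
# assumption that ``CONTENT_ITEM_TYPES`` includes 'article'):
#
#   raw = {
#       'type': 'article',
#       'url': 'http://www.nytimes.com/2014/01/01/some-story.html',
#       'title': 'Some Story',
#   }
#   item = _prepare(raw, requires=['url', 'type'], type='content_item',
#                   org_id=1, extract=False)
#   # ``item`` now carries normalized 'url', 'domain', 'org_id', and
#   # 'thumbnail' keys; passing extract=True would also merge in extracted
#   # metadata when the extraction cache has a hit.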
def extract(source_url):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """
    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # embed videos and short-circuit
    if url.is_video(canonical_url):
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body via embedly, falling back to readability
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)

    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

    # remove site name from authors
    if data.get('site_name'):
        data['authors'] = [
            a.replace(data['site_name'].upper(), "").strip()
            for a in data['authors']
        ]

    # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
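# Usage sketch (illustrative; the URL is made up):
#
#   data = extract('http://www.nytimes.com/2014/01/01/some-story.html')
#   if data:
#       print(data['title'], data['domain'], len(data['authors']))
#
# ``extract`` returns None when the page fetch fails, so callers should
# check the result before indexing into it.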
def test_domain_rm_www(self):
    u = 'http://www.nytimes.com'
    assert url.get_domain(u) == 'nytimes.com'