def extract(source_url, **kw):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use article tag.
    8. If authors aren't detected from meta tags, detect them in article body.

    :param source_url: URL of the page to extract from.
    :param kw: optional keyword args; 'type' (default 'article') sets the
        asset type recorded on the result.
    :return: dict of extracted fields ('url', 'domain', 'title', 'body',
        'authors', 'links', ...), or None if the page could not be fetched.
    """
    # 'type' is the public kwarg name, but avoid shadowing the builtin locally.
    asset_type = kw.get('type', 'article')

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = make_soup(page_html)

    # get canonical url; fall back to a prepared (but non-canonicalized)
    # version of the source url when the page declares none.
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'authors': author.extract(soup),
        'type': asset_type,
        'body': None
    }

    # embed videos: short-circuit — no article-body extraction for videos.
    if url.is_video(canonical_url):
        data['type'] = 'video'
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body (embedly first when enabled, then readability)
    if data['type'] == 'article':
        if settings.EMBEDLY_ENABLED:
            data['body'] = body_via_embedly(canonical_url)
        if not data['body']:
            data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag as the last fallback
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html when meta tags yielded none
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(
            raw_html, tags=author.OPTIMISTIC_TAGS)

    # remove site name from authors
    if data.get('site_name'):
        data['authors'] = [
            a.replace(data['site_name'].upper(), "").strip()
            for a in data['authors']
        ]

    # get links from content, excluding links back to the source page itself
    links = [u for u in url.from_any(data['body']) if source_url not in u]

    # add links from raw_html. BUGFIX: the original condition used
    # `u != source_url or not u.startswith(source_url)`, which reduces to
    # `u != source_url` and leaves the startswith check unreachable; `and`
    # matches the intent (mirrors the substring filter above).
    for u in url.from_any(raw_html, source=source_url):
        if u not in links and u != source_url \
                and not u.startswith(source_url):
            links.append(u)

    # split out internal / external links / article links
    data['links'] = links

    return data
def test_from_any_html(self):
    # NOTE(review): a second definition with this exact name appears later
    # in the file and shadows this one at class-creation time — presumably
    # an accidental duplicate; confirm and remove one.
    html = 'fds;lfjdlskafjsldak <a href="http://www.nytimes.com"></a> asdlifjkasdlkfj '
    extracted = url.from_any(html)
    assert 'http://www.nytimes.com' in extracted
def test_from_any_html(self):
    # url.from_any should pull the href out of an <a> tag embedded in noise.
    snippet = 'fds;lfjdlskafjsldak <a href="http://www.nytimes.com"></a> asdlifjkasdlkfj '
    found = url.from_any(snippet)
    assert 'http://www.nytimes.com' in found