def prepare(url, source=None, canonicalize=True, expand=True,
            keep_params=('id', 'p', 'v')):
    """
    Unshorten a url, reconcile embeds, resolve redirects, and strip
    parameters (with optional ones to keep), then attempt to canonicalize
    the url by checking the page source's metadata.
    All urls that enter `merlynne` are first treated with this function.
    """
    # reconcile embeds:
    url = reconcile_embed(url)

    # check for redirects / non-absolute urls
    if source:
        source_domain = get_domain(source)

        # reconcile redirects
        url = redirect_back(url, source_domain)

        # if the domain is in the source, attempt to absolutify it
        if source_domain in url:

            # check for non-absolute urls
            if not is_abs(url):
                url = urljoin(source, url)

    # expand short urls
    if expand and is_shortened(url):
        url = unshorten(url, attempts=1)

    # canonicalize
    if canonicalize:
        page_html = network.get(url)
        if page_html:
            soup = BeautifulSoup(page_html)
            canonical = meta.canonical_url(soup)
            if canonical:
                return canonical

    # if it got converted to None, return
    if not url:
        return None

    # remove arguments, w/ optional parameters to keep.
    url = remove_args(url, keep_params)

    # remove index.html
    url = re_index_html.sub('', url)

    # always remove the trailing slash
    if url.endswith('/'):
        url = url[:-1]
    return url
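# A minimal, self-contained sketch of the tail of `prepare` above (query
# stripping + trailing-slash removal), using only the Python 3 stdlib.
# `strip_and_trim` is a hypothetical stand-in for the module's `remove_args`
# helper; its whitelist semantics here are an assumption.
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

def strip_and_trim(url, keep_params=('id', 'p', 'v')):
    parts = urlparse(url)
    # keep only whitelisted query arguments
    kept = [(k, v) for k, v in parse_qsl(parts.query) if k in keep_params]
    url = urlunparse(parts._replace(query=urlencode(kept)))
    # always remove the trailing slash
    return url[:-1] if url.endswith('/') else url

# strip_and_trim('http://example.com/story?id=1&utm_source=rss')
# => 'http://example.com/story?id=1'
# strip_and_trim('http://example.com/story/')
# => 'http://example.com/story'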
def _bypass_bitly_warning(url):
    """
    Sometimes bit.ly blocks unshorten attempts; this bypasses that by
    following the warning page's clickthrough link.
    """
    html_string = network.get(url)
    # the fetch itself can fail; fall back to the original url
    if not html_string:
        return url
    soup = BeautifulSoup(html_string)
    a = soup.find('a', {'id': 'clickthrough'})
    if a:
        return a.attrs.get('href')
    return url
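# A self-contained version of the same clickthrough-bypass pattern, with
# `requests` standing in for the module's `network.get` helper (an
# assumption; any HTTP client would do).
import requests
from bs4 import BeautifulSoup

def bypass_interstitial(url, timeout=10):
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # the warning page exposes the real target as an anchor with id=clickthrough
    a = soup.find('a', {'id': 'clickthrough'})
    return a.attrs.get('href') if a else url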
def extract(source_url):
    """
    Article extraction. The method is as follows:
    1. Get html from the url.
    2. Canonicalize the url.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """
    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['body'] = embed.video(canonical_url)
        return data

    # extract the article body
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)

    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract the body from the article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from the raw article html
    if not len(data['authors']) and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

    # remove the site name from authors
    if data.get('site_name'):
        data['authors'] = [
            a.replace(data['site_name'].upper(), "").strip()
            for a in data['authors']
        ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
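# `meta.canonical_url` isn't shown in this section; a plausible sketch of
# what it does is a lookup of the page's <link rel="canonical"> tag (an
# assumption about its behavior, not the module's actual implementation).
from bs4 import BeautifulSoup

def canonical_url(soup):
    tag = soup.find('link', attrs={'rel': 'canonical'})
    if tag and tag.get('href'):
        return tag['href']
    return None

html = '<head><link rel="canonical" href="http://example.com/story"></head>'
canonical_url(BeautifulSoup(html, 'html.parser'))
# => 'http://example.com/story'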
def fetch(self, params):
    # hit the endpoint and unwrap the jsonp response
    text = network.get(self.endpoint, **params)
    if not text:
        return None
    return self._parse_jsonp(text)
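# `_parse_jsonp` isn't shown either; a minimal sketch of what such a helper
# typically does is unwrap a `callback({...});` response into plain json.
# The name and the regex here are assumptions.
import json
import re

def parse_jsonp(text):
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else None

parse_jsonp('cb({"status": "ok"});')
# => {'status': 'ok'}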
def prepare(url, source=None, canonicalize=True, expand=True,
            keep_params=KEEP_PARAMS):
    """
    Unshorten a url, reconcile embeds, resolve redirects, and strip
    parameters (with optional ones to keep), then attempt to canonicalize
    the url by checking the page source's metadata.
    All urls that enter `merlynne` are first treated with this function.
    """
    if not url:
        return None

    # encode.
    url = url.encode('utf-8', errors='ignore')

    # reconcile embeds:
    url = reconcile_embed(url)

    # reconcile redirects
    url = redirect_back(url, source)

    # check for non-absolute urls.
    if source:
        source_domain = get_domain(source)

        # if the domain is in the source, attempt to absolutify it
        if source_domain in url:

            # check for non-absolute urls
            if not is_abs(url):
                url = urljoin(source, url)

            # check for a missing scheme
            if not get_scheme(url):
                url = "http://" + url

    # expand short urls
    if expand and is_shortened(url):
        url = unshorten(url, attempts=1)

    # canonicalize
    if canonicalize:
        page_html = network.get(url)
        if page_html:
            soup = make_soup(page_html)
            _url = meta.canonical_url(soup)
            if _url:
                url = _url

    # if it got converted to None, return
    if not url:
        return None

    # remove arguments, w/ optional parameters to keep.
    url = remove_args(url, keep_params)

    # remove index.html
    url = re_index_html.sub('', url)

    # always remove the trailing slash
    if url.endswith('/'):
        url = url[:-1]
    return url
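# A stdlib-only sketch of the absolutify + missing-scheme steps above,
# assuming Python 3's urllib.parse in place of the module's `is_abs` /
# `get_scheme` helpers. `absolutify` is a hypothetical name.
from urllib.parse import urljoin, urlparse

def absolutify(url, source):
    if not urlparse(url).netloc:
        # non-absolute: resolve against the source page
        url = urljoin(source, url)
    if not urlparse(url).scheme:
        # missing scheme: default to http
        url = 'http://' + url
    return url

# absolutify('/2014/06/story.html', 'http://example.com/index.html')
# => 'http://example.com/2014/06/story.html'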