def fetch_mercury():
    """Fill ``article`` and ``content`` from the Mercury result for the URL.

    ``article``, ``content`` and ``mercury`` are free variables resolved from
    the surrounding scope. The result of ``mercury.fetch(article.url)`` is
    only used when it is truthy and its ``'content'`` entry has a length of
    at least 50; in that case title, lead image, author and content HTML are
    copied over, the excerpt is used as description if the article has none
    yet, and ``populate_article_json(article, content)`` is called.

    Returns:
        True if the Mercury result was applied, False otherwise.
    """
    parsed = mercury.fetch(article.url)

    # Bail out on a missing result or on content too short to be useful.
    if not parsed:
        return False
    if len(parsed.get('content') or "") < 50:
        return False

    article.title = parsed['title']
    article.top_image = parsed['lead_image_url']
    if parsed['date_published'] and not article.published:
        # Placeholder: the publication date is not applied yet.
        pass  # TODO
    article.author = parsed['author']
    content.html = parsed['content']

    # Keep a description that is already set on the article.
    if not article.description:
        article.description = parsed['excerpt']

    populate_article_json(article, content)
    return True
def fetch_normal():
    """Fetch the page directly and fill ``article``/``content`` from its markup.

    ``url``, ``article``, ``content`` and ``article_extractor`` are free
    variables resolved from the surrounding scope. The page is requested via
    ``url_fetch``; only responses whose content type (default ``text/html``
    when the header is absent) is ``text/html`` are processed. From the
    markup this sets ``article.site_name``, ``article.author``,
    ``article.title``, ``article.top_image`` and ``article.description``,
    stores the extracted article HTML in ``content.html`` and calls
    ``populate_article_json(article, content)``.

    Returns:
        True if markup was retrieved and processed, False if there was no
        response, the MIME type was not ``text/html``, or the body was empty.
    """
    response = url_fetch(url, return_response_obj=True)
    if not response:
        print('NO SUCCESSFUL RESPONSE')
        return False

    # Compare only the media type, ignoring parameters such as "; charset=".
    content_type = response.info().getheader('content-type', 'text/html')
    mime_type = content_type.lower().split(';')[0].strip()
    if mime_type != 'text/html':
        print('BAD MIME TYPE')
        return False

    markup = response.read()
    if not markup:
        return False

    # Pull metadata out of the raw page markup.
    markup_soup = BeautifulSoup(markup, 'lxml')
    og_title = find_meta_value(markup_soup, 'og:title')
    og_image = find_meta_value(markup_soup, 'og:image')
    og_description = find_meta_value(markup_soup, 'og:description')
    title_field = find_title(markup_soup)
    article.site_name = find_meta_value(markup_soup, 'og:site_name')
    article.author = find_author(markup_soup)

    # Extract the article body. The extracted HTML is not parsed again here:
    # nothing in this function needs a soup of it.
    content.html = article_extractor.extract(markup, article.url)

    # Open Graph title wins over the page title, which wins over any title
    # already on the article; an existing top image wins over og:image.
    article.title = first_present([og_title, title_field, article.title])
    article.top_image = make_url_absolute(
        first_present([article.top_image, og_image]))

    populate_article_json(article, content)

    # Description: prefer og:description, else fall back to the article text.
    # NOTE(review): content.text is presumably filled in by
    # populate_article_json above - confirm.
    description = None
    if og_description and og_description.strip():
        description = truncate(og_description.strip(), words=40)
    elif content.text and content.text.strip():
        description = truncate(content.text, words=40)

    if description:
        # Collapse runs of whitespace into single spaces.
        article.description = re.sub(r"[\r\n\t ]+", " ", description).strip()
    else:
        article.description = None
    return True