Esempio n. 1
0
 def fetch_mercury():
     """Fill in the article from the result of ``mercury.fetch(article.url)``.

     ``article`` and ``content`` are not parameters; they are read from the
     surrounding scope and mutated in place. The fetched result is used only
     when it is truthy and its ``'content'`` entry has a length of at least
     50. In that case the title, top image, author and HTML are overwritten,
     the description is set only if the article does not already have one,
     and ``populate_article_json(article, content)`` is called.

     Returns:
         True if the fetched result was applied to the article, else False.
     """
     parsed = mercury.fetch(article.url)

     # No result, or a missing/too-short body: report failure to the caller.
     if not parsed or len(parsed.get('content') or "") < 50:
         return False

     article.title = parsed['title']
     article.top_image = parsed['lead_image_url']
     if parsed['date_published'] and not article.published:
         pass  # TODO: the published date from the result is not stored yet
     article.author = parsed['author']
     content.html = parsed['content']

     # Keep an existing description; only fall back to the fetched excerpt.
     if not article.description:
         article.description = parsed['excerpt']

     populate_article_json(article, content)
     return True
Esempio n. 2
0
 def fetch_normal():
     """Fetch ``url`` directly and fill in the article from its HTML.

     ``url``, ``article`` and ``content`` are not parameters; they are read
     from the surrounding scope, and ``article`` / ``content`` are mutated in
     place. The page is processed only when a response was obtained, its
     content type (defaulting to ``text/html`` when the header is absent) is
     ``text/html``, and the body is non-empty.

     Returns:
         True if the page was fetched and processed, else False.
     """
     response = url_fetch(url, return_response_obj=True)
     if not response:
         # Single-argument parenthesised form prints the same text under the
         # Python 2 print statement.
         print('NO SUCCESSFUL RESPONSE')
         return False

     # Drop any parameters (e.g. "; charset=...") before comparing the type.
     content_type = response.info().getheader('content-type', 'text/html')
     mime_type = content_type.lower().split(';')[0].strip()
     if mime_type != 'text/html':
         print('BAD MIME TYPE')
         return False

     markup = response.read()
     if not markup:
         return False

     # Pull the metadata candidates out of the raw page.
     markup_soup = BeautifulSoup(markup, 'lxml')
     og_title = find_meta_value(markup_soup, 'og:title')
     og_image = find_meta_value(markup_soup, 'og:image')
     og_description = find_meta_value(markup_soup, 'og:description')
     title_field = find_title(markup_soup)

     article.site_name = find_meta_value(markup_soup, 'og:site_name')
     article.author = find_author(markup_soup)

     # Extract the article body from the page.
     content.html = article_extractor.extract(markup, article.url)
     # NOTE(review): the parsed tree below is never used in this function;
     # the call is kept so that behaviour on unparsable input is unchanged.
     doc_soup = BeautifulSoup(content.html, 'lxml')

     title_candidates = [og_title, title_field, article.title]
     article.title = first_present(title_candidates)
     image_candidates = [article.top_image, og_image]
     article.top_image = make_url_absolute(first_present(image_candidates))

     populate_article_json(article, content)

     # Description: prefer og:description, otherwise fall back to the
     # content text; either way it is cut to 40 words. content.text is read
     # only after populate_article_json has run, as in the original order.
     og_text = og_description.strip() if og_description else ''
     if og_text:
         summary = truncate(og_text, words=40)
     elif content.text and content.text.strip():
         summary = truncate(content.text, words=40)
     else:
         summary = None

     if summary:
         # Collapse runs of whitespace into single spaces.
         article.description = re.sub(r"[\r\n\t ]+", " ", summary).strip()
     else:
         article.description = None

     return True