def extend_article(res, url, **kwargs):
    """Best-effort article enrichment: an 'ots' summary and/or content copy.

    Parameters:
        res -- parsed readability payload (dict with a 'content' key), or a
               falsy value, in which case the payload is fetched from
               ``read_url`` for *url*.
        url -- article URL, interpolated into ``read_url`` when fetching.
        kwargs -- 'summarize' requests an extractive summary produced by the
                  external ``ots`` binary; 'top' requests a copy of the full
                  content in 'content_ex'.

    Returns a dict ``{'summary': ..., 'content_ex': ...}``; either value stays
    None when not requested or when any step fails. The whole body is
    deliberately best-effort and never raises.
    """
    ex = {'summary': None, 'content_ex': None}
    try:
        # Neither enrichment requested: skip the network/subprocess work.
        if not kwargs.get('summarize', None) and not kwargs.get('top', None):
            return ex
        if not res:
            request = requests.get(read_url.format(url))
            # Fix: check the HTTP status BEFORE parsing the body. A non-200
            # response is frequently an HTML error page, and calling .json()
            # on it raises ValueError, needlessly aborting both enrichments.
            if request.status_code != 200:
                return ex
            res = request.json()
            if not res or ('error' in res and res['error']):
                return ex
        if kwargs.get('summarize', None):
            # ots reads files, so spill the tag-stripped plain text into a
            # temp file; the summary is the last line of ots's output.
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write(
                    (strip_tags(res['content'].decode('ascii',
                                                      'ignore'))).strip())
                tmp.flush()
                ex['summary'] = subprocess.check_output(
                    ['ots', tmp_path]).strip().splitlines().pop().strip()
        if kwargs.get('top', None):
            ex['content_ex'] = res['content']
    except Exception:
        # Deliberate best-effort: enrichment failures must never break
        # article ingestion; callers simply get Nones.
        pass
    return ex
def process_article(html, full=True, replace=False):
    """Sanitize article HTML and optionally extract a lead image.

    Parameters:
        html -- raw HTML string.
        full -- when True, also compute a plain-text excerpt with word count
                and hunt for a lead image among the first ~12 kept tags.
        replace -- when set to an image URL, remove the matching <img> from
                   the output instead of extracting one.

    Returns ``{'content': ..., 'image': ...}`` plus, when ``full``,
    ``'word_count'`` and ``'excerpt'``.
    """
    pos = 0        # index of the current tag among the kept tags
    src = None     # lead image URL, if one is extracted
    try:
        soup = BeautifulSoup(html)
    except UnicodeEncodeError:
        # Some inputs trip BeautifulSoup's encoding sniffing; retry with a
        # lossy UTF-8 encode.
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'))
    media_found = False
    for tag in soup.find_all(True):
        # Drop blacklisted tags, divs with blacklisted classes, empty
        # non-image tags, images from blacklisted hosts, links to
        # blacklisted hosts, and HTML comments. A bad tag wrapped in an
        # <a> takes the whole anchor with it.
        if any(x == tag.name for x in EXCLUDED_TAGS) \
            or (tag.name == 'div' and 'class' in tag.attrs and
                any(div in tag.attrs['class'] for div in EXCLUDED_DIV_CLASS))\
            or ((not tag.contents and not tag.name == 'img' and
                 (tag.string is None or not tag.string.strip())) or
                (tag.name == 'img' and 'src' in tag.attrs and
                 any(host in tag['src'] for host in EXCLUDED_IMAGES_HOST)))\
            or (tag.name == 'a' and 'href' in tag.attrs and
                any(host in tag.attrs['href'] for host in EXCLUDED_A))\
                or isinstance(tag, Comment):
            if tag.parent and tag.parent.name == 'a':
                tag.parent.decompose()
            else:
                tag.decompose()
            continue
        # Scrub unwanted attributes from tags we keep.
        for attr in EXCLUDED_ATTR:
            try:
                del tag[attr]
            except (KeyError, AttributeError):
                # Fix: BeautifulSoup's Tag.__delitem__ raises KeyError (not
                # AttributeError) in the versions that raise on a missing
                # attribute, so the original guard could not actually
                # protect this loop. Catch both to stay version-proof.
                pass
        if not replace and not media_found and full:
            # Stop looking for a lead image after ~12 kept non-link tags.
            if tag.name != 'img' and tag.name != 'a' and pos > 12:
                media_found = True
            elif tag.name == 'img' and 'src' in tag.attrs:
                src = tag.attrs['src']
                if src:
                    # Normalize: drop query string and fragment.
                    o = urlparse.urlparse(src)
                    src = o.scheme + "://" + o.netloc + o.path
                # The image moves into the 'image' slot, so remove it (and
                # its wrapping anchor, if any) from the content.
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
                media_found = True
        pos += 1
        if replace:
            # Remove the previously-extracted lead image from the content.
            if tag.name == 'img' and 'src' in tag.attrs and tag.attrs[
                    'src'] == replace:
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
    content = unicode(soup)
    if full:
        excerpt = (strip_tags(content)).strip()
        return {
            'content': content,
            'image': src,
            'word_count': len(excerpt.split()),
            'excerpt': excerpt
        }
    else:
        return {'content': content, 'image': src}
def process_article(html, full=True, replace=False):
    """Strip unwanted markup from article HTML and find a lead image.

    Parameters:
        html -- raw HTML string.
        full -- when True, also compute a plain-text excerpt with word count
                and hunt for a lead image near the top of the document.
        replace -- when set to an image URL, remove the matching <img> from
                   the output instead of extracting one.

    Returns ``{'content': ..., 'image': ...}`` plus, when ``full``,
    ``'word_count'`` and ``'excerpt'``.
    """
    pos = 0        # index of the current tag among the kept tags
    src = None     # lead image URL, if one gets extracted
    try:
        soup = BeautifulSoup(html)
    except UnicodeEncodeError:
        # Some inputs trip BeautifulSoup's encoding handling; retry with a
        # lossy UTF-8 encode.
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'))
    media_found = False
    for tag in soup.find_all(True):
        # Drop blacklisted tags, divs with blacklisted classes, empty
        # non-image tags, images from blacklisted hosts, links to
        # blacklisted hosts, and HTML comments. When the bad tag sits
        # inside an <a>, the whole anchor goes with it.
        if any(x == tag.name for x in EXCLUDED_TAGS) \
            or (tag.name == 'div' and 'class' in tag.attrs and
                any(div in tag.attrs['class'] for div in EXCLUDED_DIV_CLASS))\
            or ((not tag.contents and not tag.name == 'img' and
                 (tag.string is None or not tag.string.strip())) or
                (tag.name == 'img' and 'src' in tag.attrs and
                 any(host in tag['src'] for host in EXCLUDED_IMAGES_HOST)))\
            or (tag.name == 'a' and 'href' in tag.attrs and
                any(host in tag.attrs['href'] for host in EXCLUDED_A))\
                or isinstance(tag, Comment):
            if tag.parent and tag.parent.name == 'a':
                tag.parent.decompose()
            else:
                tag.decompose()
            continue
        # Scrub unwanted attributes from tags we keep.
        for attr in EXCLUDED_ATTR:
            try:
                del tag[attr]
            except AttributeError:
                # NOTE(review): a missing attribute raises KeyError in the
                # BeautifulSoup versions that raise at all, so this clause
                # may never fire — confirm against the pinned bs4 version.
                pass
        if not replace and not media_found and full:
            # Stop looking for a lead image after ~12 kept non-link tags.
            if tag.name != 'img' and tag.name != 'a' and pos > 12:
                media_found = True
            elif tag.name == 'img' and 'src' in tag.attrs:
                src = tag.attrs['src']
                if src:
                    # Normalize: drop query string and fragment.
                    o = urlparse.urlparse(src)
                    src = o.scheme + "://" + o.netloc + o.path
                # The image moves into the 'image' slot, so remove it (and
                # its wrapping anchor, if any) from the content.
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
                media_found = True
        pos += 1
        if replace:
            # Remove the previously-extracted lead image from the content.
            if tag.name == 'img' and 'src' in tag.attrs and tag.attrs['src'] == replace:
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
    content = unicode(soup)
    if full:
        excerpt = (strip_tags(content)).strip()
        return {'content': content, 'image': src, 'word_count': len(excerpt.split()), 'excerpt': excerpt}
    else:
        return {'content': content, 'image': src}
def extend_article(res, url, **kwargs):
    """Optionally attach an 'ots' summary and/or a full-content copy.

    ``res`` is a readability-style payload (dict with 'content'); when it is
    falsy the payload is fetched from ``read_url`` for *url*. The kwargs
    flags 'summarize' and 'top' select which extras to produce. Always
    returns ``{'summary': ..., 'content_ex': ...}``; failures of any step
    leave the corresponding value as None (best-effort, never raises).
    """
    extras = {'summary': None, 'content_ex': None}
    want_summary = kwargs.get('summarize', None)
    want_top = kwargs.get('top', None)
    try:
        # Nothing requested -> nothing to do.
        if not want_summary and not want_top:
            return extras
        if not res:
            response = requests.get(read_url.format(url))
            res = response.json()
            unusable = (response.status_code != 200
                        or ('error' in res and res['error'])
                        or not res)
            if unusable:
                return extras
        if want_summary:
            # ots works on files: write the tag-stripped text to a temp file
            # and take the last line of its output as the summary.
            with tempfile.NamedTemporaryFile() as scratch:
                scratch_path = scratch.name
                plain = (strip_tags(res['content'].decode('ascii',
                                                          'ignore'))).strip()
                scratch.write(plain)
                scratch.flush()
                output = subprocess.check_output(['ots', scratch_path])
                lines = output.strip().splitlines()
                extras['summary'] = lines.pop().strip()
        if want_top:
            extras['content_ex'] = res['content']
    except Exception:
        # Intentionally swallow everything: enrichment is optional and must
        # not break the caller's ingestion pipeline.
        pass
    return extras
def get_full_article(this_item, feed_id, **kwargs):
    """Normalize one feed entry and persist it as an Article row.

    Parameters:
        this_item -- feedparser-style entry dict (mutated in place: excerpt,
                     word_count, description, language, etc. are filled in).
        feed_id -- primary key of the owning feed.
        kwargs -- forwarded to extend_article(); 'summarize_excerpt' builds
                  the summary from the already-processed content instead.

    Returns None. Entries already stored for this feed (same url or same
    title) are skipped.
    """
    # Overly long URLs are shortened to fit the 200-char column.
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    if any(required not in this_item for required in ['title', 'link']):
        return
    try:
        # Dedup check: bail out when this feed already has the url or title.
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link'])
            | Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return

    def _to_utc(struct):
        # feedparser hands us time.struct_time; make an aware UTC datetime.
        # (Hoisted: the original repeated this conversion three times.)
        return datetime.utcfromtimestamp(mktime(struct)).replace(tzinfo=utc)

    # Prefer updated_parsed, fall back to published_parsed, and never accept
    # a timestamp from the future (cap at "now").
    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and \
                _to_utc(this_item['published_parsed']) < published_parsed:
            published_parsed = _to_utc(this_item['published_parsed'])
    elif _to_utc(this_item['updated_parsed']) < published_parsed:
        published_parsed = _to_utc(this_item['updated_parsed'])

    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'

    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        # No image in the description: try the media_content enclosure,
        # then fall back to fetching the article through readability.
        if 'media_content' in this_item and 'url' in this_item[
                'media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
            if full:
                res = full
                media = res['lead_image_url']

    # Fix: the original tested ``len(excerpt) == 0`` here, i.e. it ran
    # language detection on an EMPTY excerpt (and used the title whenever a
    # real excerpt existed). Detect from the excerpt when it is non-empty,
    # falling back to the title on an unknown ('un') result; otherwise
    # detect from the title directly.
    if len(this_item['excerpt']) != 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode(
            'ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode(
                'ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode(
            'ascii', 'ignore'))[1]

    if kwargs.get('summarize_excerpt'):
        # Summarize the content we already have instead of re-fetching.
        extend = {'content_ex': None, 'summary': None}
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write(
                    (strip_tags(res['content'].decode('ascii',
                                                      'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(
                    ['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            # Best-effort: a failed summary must not block ingestion.
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    # NOTE(review): when ``res`` was replaced by the readability payload
    # above, 'word_count' is presumably present there too — confirm.
    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={
            'title': this_item['title'],
            'content': this_item['description'],
            'word_count': res['word_count'],
            'url': this_item['link'],
            'media': media,
            'date_parsed': published_parsed,
            'author': this_item['author'],
            'excerpt': this_item['excerpt'],
            'language': this_item['language'],
            'summary': extend['summary'],
            'content_ex': extend['content_ex']
        })
    if created:
        get_article_info(obj)
def get_full_article(this_item, feed_id, **kwargs):
    """Normalize one feed entry and persist it as an Article row.

    Parameters:
        this_item -- feedparser-style entry dict (mutated in place: excerpt,
                     word_count, description, language, etc. are filled in).
        feed_id -- primary key of the owning feed.
        kwargs -- forwarded to extend_article(); 'summarize_excerpt' builds
                  the summary from the already-processed content instead.

    Returns None. Entries already stored for this feed (same url or same
    title) are skipped.
    """
    # Overly long URLs are shortened to fit the 200-char column.
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    if any(required not in this_item for required in ['title', 'link']):
        return
    try:
        # Dedup check: bail out when this feed already has the url or title.
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link'])
            | Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return
    # Prefer updated_parsed, fall back to published_parsed, and never accept
    # a timestamp from the future (cap at "now", UTC).
    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(
                    tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(
                tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(tzinfo=utc)
    # Fill in optional fields and clamp the title to the column size.
    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'
    # Clean the description HTML; pick up excerpt/word count/lead image.
    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        # No image in the description: try the media_content enclosure,
        # then fall back to fetching the article through readability.
        if 'media_content' in this_item and 'url' in this_item['media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
            if full:
                res = full
                media = res['lead_image_url']
    # NOTE(review): this branch looks inverted — when the excerpt is EMPTY
    # (len == 0) it runs language detection on that empty excerpt, and uses
    # the title whenever a real excerpt exists. Presumably the intent was
    # ``!= 0`` (detect from the excerpt, fall back to title on 'un') —
    # confirm before changing.
    if len(this_item['excerpt']) == 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode('ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    if kwargs.get('summarize_excerpt'):
        # Summarize the content we already have: write the tag-stripped text
        # to a temp file and keep the last line of the 'ots' output.
        extend = { 'content_ex': None, 'summary': None }
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            # Best-effort: a failed summary must not block ingestion.
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)
    # NOTE(review): if ``res`` was replaced by the readability payload above,
    # 'word_count' is assumed to exist in that payload as well — confirm.
    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={'title': this_item['title'],
                  'content': this_item['description'],
                  'word_count': res['word_count'],
                  'url': this_item['link'],
                  'media': media,
                  'date_parsed': published_parsed,
                  'author': this_item['author'],
                  'excerpt': this_item['excerpt'],
                  'language': this_item['language'],
                  'summary': extend['summary'],
                  'content_ex': extend['content_ex']})
    if created:
        get_article_info(obj)