Example #1
def extend_article(res, url, **kwargs):
    # 'summary' is filled by the external 'ots' summarizer; 'content_ex' holds
    # the full extracted content when the 'top' flag is passed.
    ex = {'summary': None, 'content_ex': None}
    try:
        if not kwargs.get('summarize', None) and not kwargs.get('top', None):
            return ex
        if not res:
            # No pre-fetched result: fetch the parsed article from the
            # module-level read_url endpoint.
            request = requests.get(read_url.format(url))
            res = request.json()
            if request.status_code != 200 or ('error' in res
                                              and res['error']) or not res:
                return ex
        if kwargs.get('summarize', None):
            # Run the tag-stripped content through the 'ots' command-line
            # summarizer; the last line of its stripped output is kept.
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write(
                    (strip_tags(res['content'].decode('ascii',
                                                      'ignore'))).strip())
                tmp.flush()
                ex['summary'] = subprocess.check_output(
                    ['ots', tmp_path]).strip().splitlines().pop().strip()
        if kwargs.get('top', None):
            ex['content_ex'] = res['content']
    except Exception:
        pass
    return ex
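
A minimal usage sketch for extend_article, assuming the surrounding module defines the read_url template and the strip_tags helper and has the 'ots' summarizer installed; the URL below is purely illustrative.

# Hypothetical call site: fetch the article, summarize it and keep the extracted content.
extra = extend_article(None, 'http://example.com/story', summarize=True, top=True)
if extra['summary']:
    print(extra['summary'])          # last line of the 'ots' output
if extra['content_ex']:
    print(len(extra['content_ex']))  # extracted article body returned by the read API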
Example #2
def process_article(html, full=True, replace=False):
    pos = 0
    src = None
    try:
        soup = BeautifulSoup(html)
    except UnicodeEncodeError:
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'))
    media_found = False
    for tag in soup.find_all(True):
        # Drop blacklisted tags, empty tags, images and links on excluded
        # hosts, and HTML comments; if the offending tag sits inside a link,
        # the whole link is removed.
        if any(x == tag.name for x in EXCLUDED_TAGS) \
            or (tag.name == 'div' and 'class' in tag.attrs and any(div in tag.attrs['class'] for div in EXCLUDED_DIV_CLASS))\
            or ((not tag.contents and not tag.name == 'img' and (tag.string is None or not tag.string.strip()))
                or (tag.name == 'img' and 'src' in tag.attrs
                    and any(host in tag['src'] for host in EXCLUDED_IMAGES_HOST)))\
            or (tag.name == 'a' and 'href' in tag.attrs and any(host in tag.attrs['href'] for host in EXCLUDED_A))\
                or isinstance(tag, Comment):
            if tag.parent and tag.parent.name == 'a':
                tag.parent.decompose()
            else:
                tag.decompose()
            continue
        for attr in EXCLUDED_ATTR:
            try:
                del tag[attr]
            except AttributeError:
                pass
        if not replace and not media_found and full:
            # Look for a lead image among roughly the first dozen tags; once
            # one is found (or the search window passes) stop looking.
            if tag.name != 'img' and tag.name != 'a' and pos > 12:
                media_found = True
            elif tag.name == 'img' and 'src' in tag.attrs:
                src = tag.attrs['src']
                if src:
                    o = urlparse.urlparse(src)
                    src = o.scheme + "://" + o.netloc + o.path
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
                media_found = True
            pos += 1
        if replace:
            # When replace holds an image URL, remove the matching <img>
            # (or its wrapping link) from the content.
            if tag.name == 'img' and 'src' in tag.attrs and tag.attrs['src'] == replace:
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
    content = unicode(soup)
    if full:
        excerpt = (strip_tags(content)).strip()
        return {
            'content': content,
            'image': src,
            'word_count': len(excerpt.split()),
            'excerpt': excerpt
        }
    else:
        return {'content': content, 'image': src}
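
A short usage sketch for process_article, assuming the EXCLUDED_* blacklists and the strip_tags helper are defined at module level; the HTML snippet is illustrative.

# Hypothetical input: the first usable <img> becomes the lead image and is
# removed from the cleaned markup.
html = '<div><p>Breaking news story.</p><img src="http://example.com/lead.jpg"/></div>'
res = process_article(html, full=True)
print(res['image'])       # e.g. http://example.com/lead.jpg
print(res['word_count'])  # word count of the tag-stripped excerpt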
Example #3
def process_article(html, full=True, replace=False):
    pos = 0
    src = None
    try:
        soup = BeautifulSoup(html)
    except UnicodeEncodeError:
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'))
    media_found = False
    for tag in soup.find_all(True):
        if any(x == tag.name for x in EXCLUDED_TAGS) \
            or (tag.name == 'div' and 'class' in tag.attrs and any(div in tag.attrs['class'] for div in EXCLUDED_DIV_CLASS))\
            or ((not tag.contents and not tag.name == 'img' and (tag.string is None or not tag.string.strip()))
                or (tag.name == 'img' and 'src' in tag.attrs
                    and any(host in tag['src'] for host in EXCLUDED_IMAGES_HOST)))\
            or (tag.name == 'a' and 'href' in tag.attrs and any(host in tag.attrs['href'] for host in EXCLUDED_A))\
                or isinstance(tag, Comment):
            if tag.parent and tag.parent.name == 'a':
                tag.parent.decompose()
            else:
                tag.decompose()
            continue
        for attr in EXCLUDED_ATTR:
            try:
                del tag[attr]
            except AttributeError:
                pass
        if not replace and not media_found and full:
            if tag.name != 'img' and tag.name != 'a' and pos > 12:
                media_found = True
            elif tag.name == 'img' and 'src' in tag.attrs:
                src = tag.attrs['src']
                if src:
                    o = urlparse.urlparse(src)
                    src = o.scheme + "://" + o.netloc + o.path
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
                media_found = True
            pos += 1
        if replace:
            if tag.name == 'img' and 'src' in tag.attrs and tag.attrs['src'] == replace:
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
    content = unicode(soup)
    if full:
        excerpt = (strip_tags(content)).strip()
        return {'content': content, 'image': src, 'word_count': len(excerpt.split()), 'excerpt': excerpt}
    else:
        return {'content': content, 'image': src}
Example #4
def extend_article(res, url, **kwargs):
    ex = {
        'summary': None,
        'content_ex': None
    }
    try:
        if not kwargs.get('summarize', None) and not kwargs.get('top', None):
            return ex
        if not res:
            request = requests.get(read_url.format(url))
            res = request.json()
            if request.status_code != 200 or ('error' in res and res['error']) or not res:
                return ex
        if kwargs.get('summarize', None):
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                ex['summary'] = subprocess.check_output(['ots', tmp_path]).strip().splitlines().pop().strip()
        if kwargs.get('top', None):
            ex['content_ex'] = res['content']
    except Exception:
        pass
    return ex
Example #5
def get_full_article(this_item, feed_id, **kwargs):
    # Check the required keys first; only then is it safe to touch
    # this_item['link'].
    if any(required not in this_item for required in ['title', 'link']):
        return
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    try:
        # Skip entries already stored for this feed (matched by URL or title).
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link'])
            | Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return
    # Default the published date to "now" and prefer the feed's
    # updated/published timestamp when it lies in the past.
    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(
                    tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(mktime(
            this_item['updated_parsed'])).replace(
                tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(tzinfo=utc)
    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'
    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        if 'media_content' in this_item and 'url' in this_item[
                'media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
            if full:
                res = full
                media = res['lead_image_url']
    # Detect the language from the excerpt when there is one; fall back to the
    # title when the excerpt is empty or detection returns unknown ('un').
    if len(this_item['excerpt']) > 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode(
            'ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode(
                'ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode(
            'ascii', 'ignore'))[1]

    # Either summarize the processed content locally with 'ots', or defer to
    # extend_article for the summary/extended-content pair.
    if kwargs.get('summarize_excerpt'):
        extend = {'content_ex': None, 'summary': None}
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write(
                    (strip_tags(res['content'].decode('ascii',
                                                      'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(
                    ['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={
            'title': this_item['title'],
            'content': this_item['description'],
            'word_count': res['word_count'],
            'url': this_item['link'],
            'media': media,
            'date_parsed': published_parsed,
            'author': this_item['author'],
            'excerpt': this_item['excerpt'],
            'language': this_item['language'],
            'summary': extend['summary'],
            'content_ex': extend['content_ex']
        })
    if created:
        get_article_info(obj)
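
get_full_article appears to be called once per feed entry; a hedged sketch of such a driver loop, assuming feedparser-style entries and an existing feed id.

# Hypothetical driver loop; parsed.entries behave like the this_item dicts above.
# feed_id is assumed to reference an existing Feed row.
import feedparser
parsed = feedparser.parse('http://example.com/rss')
for entry in parsed.entries:
    get_full_article(entry, feed_id, summarize_excerpt=True)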
Example #6
def get_full_article(this_item, feed_id, **kwargs):
    # Check the required keys first; only then is it safe to touch this_item['link'].
    if any(required not in this_item for required in ['title', 'link']):
        return
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    try:
        Article.objects.values('id').get(Q(feed_id=feed_id, url=this_item['link']) |
                                         Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return
    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(mktime(this_item['updated_parsed'])).replace(tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(mktime(this_item['updated_parsed'])).replace(tzinfo=utc)
    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'
    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        if 'media_content' in this_item and 'url' in this_item['media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
            if full:
                res = full
                media = res['lead_image_url']
    # Detect the language from the excerpt when there is one; fall back to the
    # title when the excerpt is empty or detection returns unknown ('un').
    if len(this_item['excerpt']) > 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode('ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]

    if kwargs.get('summarize_excerpt'):
        extend = {
            'content_ex': None,
            'summary': None
        }
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    obj, created = Article.objects.get_or_create(
        feed_id=feed_id, url=this_item['link'],
        defaults={'title': this_item['title'], 'content': this_item['description'],
                  'word_count': res['word_count'], 'url': this_item['link'], 'media': media,
                  'date_parsed': published_parsed, 'author': this_item['author'], 'excerpt': this_item['excerpt'],
                  'language': this_item['language'], 'summary': extend['summary'], 'content_ex': extend['content_ex']})
    if created:
        get_article_info(obj)