Example #1
def get_article(item, source, reprocess=False):
    """Take the initial set of listings and enrich the content."""
    article = dict()
    # Hash the link to build a stable identifier for deduplication.
    encoded = item.get('link').encode('utf-8')
    article['uuid'] = hashlib.sha256(encoded).hexdigest()
    processed = is_found(article['uuid'])
    if processed and not reprocess:
        return {'article': processed, 'from_store': True}
    article['title'] = item.get('title')
    href = item.get('link')
    article['href'] = strip_google(href)
    article['source'] = derive_source(article['href'])
    article['collected'] = now_time()
    article['published'] = item.get('published')
    article['summary'] = item.get('summary')

    page_content = get_page_content(article['href'])
    if not page_content:
        logger.debug("No content found: %s", article['href'])
        return {'article': None, 'from_store': True}
    # Strip boilerplate (navigation, ads, footers) with jusText.
    paragraphs = justext.justext(page_content,
                                 justext.get_stoplist("English"),
                                 no_headings=True,
                                 max_heading_distance=150,
                                 length_high=140,
                                 max_link_density=0.4,
                                 stopwords_low=0.2,
                                 stopwords_high=0.3)
    text_content = '\n'.join(p.text for p in paragraphs
                             if not p.is_boilerplate)
    tokens = get_tokens(text_content)

    article['word_count'] = len(tokens)
    # Estimated read time assumes roughly 250 words per minute.
    article['read_time'] = round(float(article['word_count']) / 250, 2)
    clean = cleaned_tokens(tokens)
    article['tokens'] = [{t[0]: t[1]}
                         for t in nltk.FreqDist(clean).most_common(100)]
    # The seven most frequent tokens double as tags.
    article['tags'] = [list(x.keys())[0] for x in article['tokens'][0:7]]
    article['sentiment'] = get_sentiment(text_content)
    article['feed_source'] = source.replace('www.google.com', 'google.com')
    articles = mongo.db[app.config['ARTICLES_COLLECTION']]
    if not reprocess:
        try:
            articles.insert_one(article)
        except Exception:  # ignore duplicate-key insert failures
            pass
    elif not processed:
        # Reprocessing an article we never stored: insert it instead of
        # updating, since there is no existing _id to match against.
        try:
            articles.insert_one(article)
        except Exception:
            pass
    else:
        articles.update_one({'_id': ObjectId(processed['_id'])},
                            {'$set': article})
    return {'article': article, 'from_store': False}
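As a standalone illustration of two steps above (the uuid hashing and the token/tag derivation), here is a minimal sketch; the link and token list are made-up sample data, not from the source:

import hashlib
import nltk

link = 'https://example.com/some-article'  # sample link
uuid = hashlib.sha256(link.encode('utf-8')).hexdigest()

clean = ['breach', 'malware', 'breach', 'actor', 'malware', 'breach']
tokens = [{t[0]: t[1]} for t in nltk.FreqDist(clean).most_common(100)]
tags = [list(x.keys())[0] for x in tokens[0:7]]
# tokens -> [{'breach': 3}, {'malware': 2}, {'actor': 1}]
# tags   -> ['breach', 'malware', 'actor']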
Example #2
def decorated_function(*args, **kwargs):
    with open('%s/resources/config.json' % APP_BASE) as fh:
        config = json.load(fh)
    delta = (now_time() -
             load_time(config['geoip']['last_update'])).seconds
    # Reload the GeoIP reader if it is stale or has never been loaded.
    if delta > REFRESH_TIME or not app.config['GEOIPDB']:
        try:
            app.config['GEOIPDB'] = geoip2.database.Reader(
                '%s/resources/geoip/current' % APP_BASE)
        except Exception as e:
            raise Exception("Database has not been initialized.") from e
    return f(*args, **kwargs)
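The snippet above is only the inner wrapper; `f` is supplied by the enclosing decorator, which the listing omits. A minimal sketch of how such a guard decorator is usually assembled (the name check_geoip is hypothetical, not from the source):

from functools import wraps

def check_geoip(f):
    """Hypothetical outer decorator wrapping a view function."""
    @wraps(f)  # preserve the wrapped function's name and docstring
    def decorated_function(*args, **kwargs):
        # ...staleness check and database reload as in the example above...
        return f(*args, **kwargs)
    return decorated_function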
Example #3
def decorated_function(*args, **kwargs):
    with open('%s/resources/config.json' % APP_BASE) as fh:
        config = json.load(fh)
    delta = (now_time() - load_time(config['asn']['last_update'])).seconds
    # Reload the ASN reader if it is stale or has never been loaded.
    if delta > REFRESH_TIME or not app.config['ASNDB']:
        try:
            app.config['ASNDB'] = pyasn.pyasn(
                '%s/resources/asn/current' % APP_BASE,
                as_names_file='%s/resources/asn/as_names.json' % APP_BASE)
            app.config['ASNDB'].loaded = config['asn']['last_rib_file']
        except Exception as e:
            raise Exception("Database has not been initialized.") from e
    return f(*args, **kwargs)
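Once loaded, the pyasn reader maps an IP address to its origin ASN; a minimal usage sketch, assuming the same database files as above (the IP is illustrative, and name resolution requires the as_names_file):

import pyasn

asndb = pyasn.pyasn('resources/asn/current',
                    as_names_file='resources/asn/as_names.json')
asn, prefix = asndb.lookup('8.8.8.8')  # e.g. (15169, '8.8.8.0/24')
name = asndb.get_as_name(asn)          # human-readable AS name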
Example #4
def process_all_rss(reprocess=False):
    """Gather all RSS feeds and articles, then process."""
    sources = list()
    monitors = mongo.db[app.config['MONITORS_COLLECTION']]
    for item in monitors.find({'active': True}):
        sources.append(item['metadata'].get('rss_link'))

    contents = [feedparser.parse(x) for x in sources]
    for source in contents:
        for item in source.get('entries', []):
            response = get_article(item, source['href'], reprocess)
            if response['from_store'] or reprocess:
                continue
            # Record the hit count and check time on the matching monitor.
            clean_link = response['article']['feed_source']
            monitors.update_one({'metadata.rss_link': clean_link},
                                {'$inc': {'hits': 1},
                                 '$set': {'checked': now_time()}})
    correct_counts()
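For reference, feedparser returns a dictionary exposing exactly the fields the loop above relies on ('href' for the fetched URL and 'entries' for the feed items); a minimal sketch with an illustrative feed URL:

import feedparser

feed = feedparser.parse('https://example.com/rss')  # illustrative URL
print(feed['href'])  # the URL the parser actually fetched
for entry in feed['entries']:
    print(entry.get('title'), entry.get('link'), entry.get('published'))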