Example 1
import json
from datetime import datetime, timedelta

import redis


def headlines(tag, limit=200):
    """Cache the five most-discussed headlines for a tag in Redis."""
    def date_parser(obj):
        # json.dumps cannot serialize datetimes; fall back to ISO strings.
        if isinstance(obj, datetime):
            return obj.isoformat()
        return obj

    # conn, ArticleModel, similar, utils and logger come from the
    # surrounding module.
    conn.indices.refresh('newsworld')
    articles = ArticleModel.objects.filter(
        tag=tag,
        main=True,
        date__gte=datetime.now() - timedelta(hours=24)
    ).order_by('-date')[:limit]

    try:
        for his in articles:
            his['similar'] = [
                similar.prepare_es_dto(a) for a in his['similar']
            ]
            his['similar'] = sorted(
                his['similar'], key=lambda s: s['date'], reverse=True)
    except Exception:
        utils.print_exception()

    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    h = [utils.from_es_dict_dto(a) for a in articles]
    # Rank headlines by how many similar articles they accumulated.
    h = sorted(h, key=lambda a: len(a['similar']), reverse=True)
    r.set('headlines_%s' % tag, json.dumps(h[:5], default=date_parser))
    logger.info('[headlines] %s updated' % tag)
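
A minimal usage sketch, assuming the function above is in scope, Redis is running locally, and 'politics' is a valid tag (the tag name is hypothetical):

import json

import redis

headlines('politics')

# Read the cached JSON back with the same key scheme.
r = redis.StrictRedis(host='localhost', port=6379, db=0)
top5 = json.loads(r.get('headlines_politics'))
for h in top5:
    print('%s (%d similar)' % (h['title'], len(h['similar'])))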
Example 2
import lxml.html
from datetime import datetime
from time import mktime


# Presumably decorated with @classmethod on the article model in the
# original module; md5_constructor, utils and get_image_url also come
# from that module.
def get_instance(cls, dictArticle, source):
    """Create a model instance from a parsed RSS entry, deduplicating on link."""
    try:
        # Entries without a description carry no usable content.
        if not dictArticle.description:
            return None

        # Strip HTML from the description and hash title/content/source.
        content = lxml.html.fromstring(dictArticle.description).text_content()
        hash_str = ':'.join([dictArticle.title, content, source])\
                      .encode('ascii', 'ignore')
        hash_key = md5_constructor(hash_str).hexdigest()

        # Prefer the feed's published date; fall back to the current time.
        if dictArticle.published_parsed:
            article_date = datetime.fromtimestamp(
                mktime(dictArticle.published_parsed)
            ).isoformat()
        else:
            article_date = datetime.now().isoformat()

        a, created = cls.objects.get_or_create(link=dictArticle.link)
        if created:
            article = {
                'title': utils.clean(dictArticle.title),
                'link': dictArticle.link,
                'hash_key': hash_key,
                'content': utils.clean(content),
                'source': source,
                'tag': cls.__name__,
                'image_url': get_image_url(dictArticle.links),
                'date': '%s' % article_date
            }
            a.title = article['title']
            a.hash_key = article['hash_key']
            a.content = article['content']
            a.source = article['source']
            a.date = article['date']
            a.image_url = article['image_url']
            a.save()
            return article
        return None

    except Exception:
        utils.print_exception()

    return None
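
A hedged driver sketch for get_instance, assuming entries come from the feedparser library and the method is exposed as a classmethod on an article model; PoliticsArticle and the feed URL are hypothetical names:

import feedparser

feed = feedparser.parse('http://example.com/politics.rss')
for entry in feed.entries:
    article = PoliticsArticle.get_instance(entry, source='example.com')
    if article:
        print('stored: %s' % article['title'])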
Example 3
def run_tasks(feeds, feedModel):
    """Fetch new articles for one feed model, index them, roll back on failure."""
    logger.info('[update] Started: %s --' % feedModel.__name__)
    logger.info('[update] Fetching RSS feeds.')
    new_articles = list(get_new_articles(feeds, feedModel))
    # get_new_articles may yield None for skipped entries; count real ones.
    l_new_articles = len([a for a in new_articles if a])
    try:
        index_articles(new_articles)
        logger.info('[update][%s] Complete. new:%s' % (
            feedModel.__name__,
            l_new_articles))
    except Exception:
        # Indexing failed: delete the rows created during this run.
        rollback_articles(new_articles, feedModel)
        logger.info('[update][error] Rolling back: %s (%s)' % (
            feedModel.__name__,
            l_new_articles))
        utils.print_exception()

    # Make the freshly indexed documents searchable right away.
    conn.indices.refresh('newsworld')
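
A plausible entry point, assuming feeds is a list of RSS URLs consumed by get_new_articles and PoliticsArticle is the hypothetical model from the previous sketch:

FEEDS = [
    'http://example.com/politics.rss',  # hypothetical feed URLs
    'http://example.org/world.rss',
]

run_tasks(FEEDS, PoliticsArticle)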
Example 4
def build_similar(articles, tag):
    """Cluster near-duplicate articles: the first article of each cluster
    becomes the main headline; later fuzzy matches join its 'similar' list."""
    history = []   # main articles (cluster heads) found so far
    seen = []      # hash keys already assigned to some cluster
    skipped = []
    index = 0
    for a in articles:
        log_progress(index, len(articles), len(history), len(skipped), tag)
        a = prepare_es_dto(a)
        found_similar = False
        index += 1

        # Hard cap keeps the O(n^2) comparison loop bounded.
        if index > 200:
            break

        if a['hash_key'] in seen:
            skipped.append(a['hash_key'])
            continue

        for h in history:
            try:
                if a['hash_key'] not in h['seen']:
                    sim_ratio = get_fuzzy_ratio(h, a)
                    if sim_ratio >= 70:
                        a['score'] = sim_ratio
                        a['seen'] = h['hash_key']
                        try:
                            a.save()
                        except Exception:
                            pass  # persisting the score is best-effort

                        # Merge the match, and anything already similar
                        # to it, into the cluster headed by h.
                        h['similar'] = [a] + h['similar'] + a['similar']
                        h['seen'].append(a['hash_key'])
                        h['seen'].append(a['seen'])
                        seen.append(a['hash_key'])
                        seen.append(a['seen'])
                        found_similar = True
                        break
            except Exception:
                print_exception()

        try:
            if not found_similar and a['hash_key'] not in seen:
                # No cluster matched: this article starts a new one.
                a['main'] = True
                history.append(a)
                seen.append(a['hash_key'])
                seen += a['seen']
        except Exception:
            print_exception()
    return history
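
get_fuzzy_ratio is not shown in these examples. One plausible implementation, assuming article DTOs carry a 'title' field and matching is done with the fuzzywuzzy library, could look like this; the threshold of 70 above would then mean roughly 70% token overlap between two titles:

from fuzzywuzzy import fuzz


def get_fuzzy_ratio(h, a):
    # Token-set ratio ignores word order and duplication, which suits
    # headline comparison; it returns an integer in [0, 100].
    return fuzz.token_set_ratio(h['title'], a['title'])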