# Imports assumed by this excerpt (the original module header is not shown);
# conn (the Elasticsearch connection), ArticleModel, similar, utils and
# get_image_url come from elsewhere in the project.
import json
import logging
from datetime import datetime, timedelta
from hashlib import md5 as md5_constructor  # stands in for Django's old hashcompat alias
from time import mktime

import lxml.html
import redis

logger = logging.getLogger(__name__)


def headlines(tag, limit=200):
    """Cache the top five main stories for `tag` (last 24h) in Redis."""

    def date_parser(obj):
        # json.dumps() default hook: serialize datetimes as ISO 8601 strings.
        import datetime
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        return obj

    conn.indices.refresh('newsworld')
    articles = ArticleModel.objects.filter(
        tag=tag,
        main=True,
        date__gte=datetime.now() - timedelta(hours=24)
    ).order_by('-date')[:limit]
    try:
        for his in articles:
            # Normalize each related story to the ES DTO shape, newest first.
            his['similar'] = [
                similar.prepare_es_dto(a) for a in his['similar']
            ]
            his['similar'] = sorted(
                his['similar'], key=lambda s: s['date'], reverse=True)
    except Exception:
        utils.print_exception()
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    h = [utils.from_es_dict_dto(a) for a in articles]
    # Stories with the most related coverage rank first; keep the top five.
    h = sorted(h, key=lambda a: len(a['similar']), reverse=True)
    r.set('headlines_%s' % tag, json.dumps(h[:5], default=date_parser))
    logger.info('[headlines] %s updated' % tag)
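# Usage sketch (assumption, not part of the original module): the web side
# would read the cached list back out of Redis. The key name and JSON shape
# mirror headlines() above; get_cached_headlines itself is hypothetical.
def get_cached_headlines(tag):
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    raw = r.get('headlines_%s' % tag)
    return json.loads(raw) if raw else []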
@classmethod
def get_instance(cls, dictArticle, source):
    """Create an article from a feed entry; return its dict, or None."""
    a = None
    try:
        if not dictArticle.description:
            return None
        content = lxml.html.fromstring(dictArticle.description).text_content()
        # Dedup key: title + body text + source, hashed.
        hash_str = ':'.join([dictArticle.title, content, source])\
            .encode('ascii', 'ignore')
        hash_key = md5_constructor(hash_str).hexdigest()
        article_date = dictArticle.published_parsed
        if not article_date:
            # Feed gave no timestamp; fall back to "now".
            article_date = datetime.now().isoformat()
        else:
            article_date = datetime.fromtimestamp(
                mktime(dictArticle.published_parsed)
            ).isoformat()
        a, created = cls.objects.get_or_create(link=dictArticle.link)
        if created:
            article = {
                'title': utils.clean(dictArticle.title),
                'link': dictArticle.link,
                'hash_key': hash_key,
                'content': utils.clean(content),
                'source': source,
                'tag': cls.__name__,
                'image_url': get_image_url(dictArticle.links),
                'date': '%s' % article_date
            }
            a.title = article['title']
            a.hash_key = article['hash_key']
            a.content = article['content']
            a.source = article['source']
            a.date = article['date']
            a.image_url = article['image_url']
            a.save()
            return article
        return None
    except Exception:
        utils.print_exception()
        return None
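# Usage sketch (assumption): the attribute names used by get_instance()
# (description, title, published_parsed, link, links) match feedparser
# entries, so a caller would look roughly like this. parse_feed and its
# arguments are illustrative only.
import feedparser

def parse_feed(url, source, feedModel):
    parsed = feedparser.parse(url)
    return [feedModel.get_instance(entry, source) for entry in parsed.entries]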
def run_tasks(feeds, feedModel):
    """Fetch feeds for one article model and index the results in ES."""
    logger.info('[update] Started: %s --' % feedModel.__name__)
    logger.info('[update] Fetching RSS feeds.')
    new_articles = list(get_new_articles(feeds, feedModel))
    l_new_articles = len([a for a in new_articles if a])
    try:
        index_articles(new_articles)
        logger.info('[update][%s] Complete. new:%s' % (
            feedModel.__name__, l_new_articles))
    except Exception:
        # Indexing failed: undo this run's inserts so DB and ES stay in sync.
        rollback_articles(new_articles, feedModel)
        logger.info('[update][error] Rolling back: %s (%s)' % (
            feedModel.__name__, l_new_articles))
        utils.print_exception()
    conn.indices.refresh('newsworld')
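# Sketch (assumption): rollback_articles() is referenced above but not shown.
# Given that new_articles is a list of dicts (or None) keyed by 'link', the
# symmetric cleanup would delete the rows created this run; this version is
# a guess at that behavior, not the project's actual implementation.
def rollback_articles(new_articles, feedModel):
    for article in new_articles:
        if article:
            feedModel.objects.filter(link=article['link']).delete()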
def build_similar(articles, tag):
    """Cluster articles into stories by fuzzy similarity: the first article
    of a cluster becomes the main one, later matches (ratio >= 70) are
    attached to its `similar` list."""
    history = []   # main articles found so far, each carrying its cluster
    seen = []      # hash keys already assigned to a cluster
    skipped = []   # exact duplicates dropped outright
    index = 0
    for a in articles:
        log_progress(index, len(articles), len(history), len(skipped), tag)
        a = prepare_es_dto(a)
        found_similar = False
        index += 1
        if index > 200:
            # Hard cap: matching is quadratic, so only scan the first 200.
            break
        if a['hash_key'] in seen:
            skipped.append(a['hash_key'])
            continue
        for h in history:
            try:
                if a['hash_key'] not in h['seen']:
                    sim_ratio = get_fuzzy_ratio(h, a)
                    if sim_ratio >= 70:
                        a['score'] = sim_ratio
                        a['seen'] = h['hash_key']
                        try:
                            a.save()
                        except Exception:
                            pass
                        # Fold a (and anything already clustered under it)
                        # into h, and mark every involved hash as seen.
                        h['similar'] = [a] + h['similar'] + a['similar']
                        h['seen'].append(a['hash_key'])
                        h['seen'].append(a['seen'])
                        seen.append(a['hash_key'])
                        seen.append(a['seen'])
                        found_similar = True
                        break
            except Exception:
                print_exception()
        try:
            if not found_similar and a['hash_key'] not in seen:
                # No existing story matched: a starts a new cluster as main.
                a['main'] = True
                history.append(a)
                seen.append(a['hash_key'])
                seen += a['seen']
        except Exception:
            print_exception()
    return history
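# Sketch (assumption): get_fuzzy_ratio() is not shown in this excerpt. The
# >= 70 threshold above implies a 0-100 score; fuzzywuzzy's token_set_ratio
# returns exactly that range, so a plausible implementation compares titles.
from fuzzywuzzy import fuzz

def get_fuzzy_ratio(h, a):
    return fuzz.token_set_ratio(h['title'], a['title'])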