def update_candidateset(self):
    """Rebuild this user's candidate set scores for every article in the warm dataset."""
    full_warm_seq_ids = self.get_full_dataset()

    from article.models import Article
    articles = Article.objects(seq_id__in=full_warm_seq_ids).only(
        'id', 'category', 'seq_id', 'feature', 'published_at', 'quality', 'hot')

    now = datetime.datetime.utcnow()
    # Fall back to the global calculation time when this user has never been updated.
    update_time = self.candidate_updated_time
    if not update_time:
        update_time = get_global_cal_time()

    # Map seq_id -> (article, (short_score, long_score)) for the recompute step.
    seq_id_article_mapping = dict(
        (a.seq_id, (a, calculate_scores(0, 0, a.published_at, a.quality, a.hot,
                                        update_time, now)))
        for a in articles)

    short_feature = self.get_feature_by_name('recent_feature')
    long_feature = self.feature_matrix
    User.recompute_candidateset(self, short_feature, long_feature,
                                seq_id_article_mapping, subset=full_warm_seq_ids)

    # Record when the candidate set was last recomputed.
    self.candidate_updated_time = datetime.datetime.utcnow()
    self.save()
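# Illustrative sketch only: `calculate_scores` is defined elsewhere in this project and
# is assumed here to take engagement counters plus publish time, quality, hot, and the
# calculation/now timestamps, returning a (short_score, long_score) pair. The body below
# is a hypothetical stand-in documenting that shape (using the module-level
# `import datetime` assumed above), not the real implementation.
def _calculate_scores_sketch(clicks, views, published_at, quality, hot,
                             update_time, now=None):
    """Hypothetical example of the (short_score, long_score) contract used above."""
    now = now or datetime.datetime.utcnow()
    age_days = max((now - published_at).total_seconds() / 86400.0, 0.0)
    base = quality + hot - age_days          # freshness decays both scores
    short_score = base + clicks              # short-term score favours recent activity
    long_score = base + views / 10.0         # long-term score smooths activity out
    return short_score, long_score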
def batch_add_to_users_candidateset(self):
    """Push this article into the default and per-user recommendation sorted sets."""
    if not self.usable:
        return

    from people.models import User
    from people.mixins import default_key_user
    users = User.objects.all().only(
        'id', 'seq_id', 'feature', 'recent_feature').order_by('id')

    # Relation scores between this article and every user, short- and long-term.
    short_ur_mapping = self._generate_user_relation_scores(users, 'recent_feature')
    long_ur_mapping = self._generate_user_relation_scores(users)
    if len(short_ur_mapping) != len(long_ur_mapping):
        # NOTE: logging
        raise Exception('short/long user relation score mappings differ in size')

    update_time = get_global_cal_time()
    default_short_score, default_long_score = calculate_scores(
        0, 0, self.published_at, self.quality, self.hot, update_time)
    seq_id = str(self.seq_id)

    pipeline = warm_conn.pipeline()

    # Default (anonymous) datasets, overall and per category.
    keys = default_key_user.get_category_keys(self.category)
    long_key = default_key_user.rec_longterm_dataset_key
    pipeline.zadd(long_key, seq_id, default_long_score)
    long_cate_recs = default_key_user.get_longterm_dataset_key(self.category)
    for long_cate_rec in long_cate_recs:
        pipeline.zadd(long_cate_rec, seq_id, default_long_score)

    short_key = default_key_user.rec_shortterm_dataset_key
    pipeline.zadd(short_key, seq_id, default_short_score)
    short_cate_recs = default_key_user.get_shortterm_dataset_key(self.category)
    for short_cate_rec in short_cate_recs:
        pipeline.zadd(short_cate_rec, seq_id, default_short_score)

    # Per-user datasets: weight the user-article relation so it dominates the base score.
    for u in users:
        short_ur_score = short_ur_mapping[u.seq_id]
        long_ur_score = long_ur_mapping[u.seq_id]
        short_score = 1000 * short_ur_score + default_short_score
        long_score = 1000 * long_ur_score + default_long_score

        long_key = u.rec_longterm_dataset_key
        pipeline.zadd(long_key, seq_id, long_score)
        long_cate_recs = u.get_longterm_dataset_key(self.category)
        for long_cate_rec in long_cate_recs:
            pipeline.zadd(long_cate_rec, seq_id, long_score)

        short_key = u.rec_shortterm_dataset_key
        pipeline.zadd(short_key, seq_id, short_score)
        short_cate_recs = u.get_shortterm_dataset_key(self.category)
        for short_cate_rec in short_cate_recs:
            pipeline.zadd(short_cate_rec, seq_id, short_score)

    pipeline.execute()
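# Illustrative only: the per-user sorted-set score above is a weighted blend of the
# user-article relation score and the article's default time/quality score, with the
# relation weighted by 1000 so it dominates ranking inside one user's set. A minimal
# standalone restatement of that blend (the helper name here is hypothetical):
def _blend_candidate_score_example(relation_score, default_score, relation_weight=1000):
    """Return the score written into a user's recommendation sorted set."""
    return relation_weight * relation_score + default_score

# e.g. _blend_candidate_score_example(0.8, 3.5) == 803.5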
def calculate_useless_articles(cls):
    """Return warm-dataset articles that should be dropped from recommendation sets."""
    from people.mixins import default_key_user
    full_warm_seq_ids = default_key_user.get_full_dataset()
    articles = cls.objects(seq_id__in=full_warm_seq_ids).only(
        'id', 'seq_id', 'category', 'published_at', 'quality', 'hot')

    # Everything older than the earliest still-valid article is useless outright.
    line_article = cls.get_earliest_valid_obj()
    outdate_articles = [article for article in articles
                        if article.seq_id < line_article.seq_id]
    usable_articles = set(articles) - set(outdate_articles)
    useless_articles = outdate_articles

    today = date.today()
    now = datetime.utcnow()
    valid_deadline = now - VALID_DURATION
    update_time = get_global_cal_time()

    # Walk the remaining articles day by day: drop whole days past VALID_DURATION and
    # only the lowest-scoring quarter of more recent days.
    for pubdate, _articles in groupby(
            sorted(usable_articles, key=lambda a: a.published_at.date()),
            lambda a: a.published_at.date()):
        if pubdate >= now.date():
            continue
        __articles = list(_articles)
        # Highest short-term score first, so the tail slice is the weakest quarter.
        __articles = sorted(
            __articles,
            key=lambda _ar: calculate_scores(0, 0, _ar.published_at, _ar.quality,
                                             _ar.hot, 0)[0],
            reverse=True)
        count = len(__articles)
        if pubdate < valid_deadline.date():
            useless_articles.extend(__articles)
        else:
            useless_articles.extend(__articles[count * 3 // 4:])

    return useless_articles
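# Illustrative only: the per-day pruning above keeps the top three quarters of a day's
# still-valid articles by short-term score and marks the rest useless. A minimal
# standalone restatement of that selection over (seq_id, short_score) pairs (names here
# are hypothetical, not part of the real module):
def _bottom_quartile_example(day_scores):
    """Return the lowest-scoring quarter of one day's articles, as the loop above does."""
    ranked = sorted(day_scores, key=lambda pair: pair[1], reverse=True)
    return ranked[len(ranked) * 3 // 4:]

# e.g. _bottom_quartile_example([('a', 4), ('b', 3), ('c', 2), ('d', 1)]) == [('d', 1)]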