def recompute_candidateset(cls, user, short_feature, long_feature,
                           id_article_mapping, subset=None):
    # NOTE: classmethod, so callers can reuse a prebuilt id -> (article, scores)
    # mapping instead of rebuilding the feature matrix per user
    if subset is not None:
        full_warm_seq_ids = subset
    else:
        full_warm_seq_ids = user.get_full_dataset()
    if not full_warm_seq_ids:
        return
    usable_id_feature_pairs = (
        (id_article_mapping[_id][0],
         id_article_mapping[_id][0].feature_matrix,
         id_article_mapping[_id][1])
        for _id in full_warm_seq_ids if _id in id_article_mapping)
    try:
        usable_articles, usable_features, usable_article_scores = \
            izip(*usable_id_feature_pairs)
    except ValueError:
        # NOTE: no usable id in id_article_mapping
        return
    if not usable_articles:
        return
    # stack the per-article sparse feature rows into one matrix and score the
    # user's short- and long-term feature vectors against all rows at once
    article_union_matrix = sp.vstack(usable_features)
    usable_short_urs = batch_calculate_similarity(short_feature, article_union_matrix)
    usable_long_urs = batch_calculate_similarity(long_feature, article_union_matrix)
    pipeline = warm_conn.pipeline()
    for article, article_scores, short_ur, long_ur in izip(
            usable_articles, usable_article_scores,
            usable_short_urs, usable_long_urs):
        short_qht_score, long_qht_score = article_scores
        # user-relation similarity dominates; the quality/hot/time score breaks ties
        short_score = 1000 * short_ur + short_qht_score
        long_score = 1000 * long_ur + long_qht_score
        User.add2pipeline(pipeline, user, article, short_score, long_score)
    pipeline.execute()
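# batch_calculate_similarity is defined elsewhere in the repo; the sketch
# below only illustrates the contract assumed above: score one sparse
# user-feature row against every row of the stacked article matrix in a
# single vectorized pass. Cosine similarity and the helper's name are
# assumptions, not confirmed by this file.
def _batch_similarity_sketch(user_feature, article_union_matrix):
    import numpy as np
    dots = article_union_matrix.dot(user_feature.T).toarray().ravel()
    user_norm = np.sqrt(user_feature.multiply(user_feature).sum())
    row_norms = np.sqrt(
        article_union_matrix.multiply(article_union_matrix).sum(axis=1)).A.ravel()
    denom = user_norm * row_norms
    denom[denom == 0] = 1.0  # guard empty rows against division by zero
    return dots / denom      # one similarity per article, aligned with vstack order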
def remove_articles_from_candidateset(self, articles):
    pipeline = warm_conn.pipeline()
    for article in articles:
        keys = self.get_category_keys(article.category)
        for key in keys:
            pipeline.zrem(key, article.seq_id)
    pipeline.execute()
def get_all_key_ids_iteritems(self):
    # pipeline one zrange per dataset key, then pair each key with its members
    dataset_keys = self.get_all_dataset_keys()
    pipeline = warm_conn.pipeline()
    for key in dataset_keys:
        pipeline.zrange(key, 0, -1)
    ids_list = pipeline.execute()
    return izip(dataset_keys, ids_list)
def recompute_common_candidateset(cls, common_candidate_ids, id_article_mapping):
    from people.mixins import default_key_user
    pipeline = warm_conn.pipeline()
    for _id in common_candidate_ids:
        if _id not in id_article_mapping:
            continue
        article, scores = id_article_mapping[_id]
        short_score, long_score = scores
        User.add2pipeline(pipeline, default_key_user, article,
                          short_score, long_score)
    pipeline.execute()
def copy_warm_data(self, uk_obj):
    # read every dataset key of the source user (with scores) in one pipelined
    # pass, then write the same member/score pairs into this user's keys
    pipeline = warm_conn.pipeline()
    dataset_keys = uk_obj.get_all_dataset_keys()
    for key in dataset_keys:
        pipeline.zrange(key, 0, -1, withscores=True)
    mapping_list = pipeline.execute()
    new_dataset_keys = self.get_all_dataset_keys()
    for key, mapping in izip(new_dataset_keys, mapping_list):
        if not mapping:
            continue
        # legacy redis-py 2.x kwargs form: zadd(name, member=score, ...)
        pipeline.zadd(key, **dict(mapping))
    pipeline.execute()
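# The zadd calls throughout this module use the legacy redis-py 2.x calling
# convention (positional member, score and member=score kwargs). A minimal
# sketch of the same copy under redis-py >= 3.0, where zadd takes an explicit
# member -> score mapping; everything else is assumed unchanged.
def _copy_warm_data_redispy3_sketch(self, uk_obj):
    pipeline = warm_conn.pipeline()
    for key in uk_obj.get_all_dataset_keys():
        pipeline.zrange(key, 0, -1, withscores=True)
    mapping_list = pipeline.execute()
    for key, mapping in izip(self.get_all_dataset_keys(), mapping_list):
        if mapping:
            pipeline.zadd(key, dict(mapping))  # redis-py 3.x: zadd(name, {member: score})
    pipeline.execute()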
def batch_add_to_users_candidateset(self):
    if not self.usable:
        return
    from people.models import User
    from people.mixins import default_key_user
    users = User.objects.all().only(
        'id', 'seq_id', 'feature', 'recent_feature').order_by('id')
    short_ur_mapping = self._generate_user_relation_scores(users, 'recent_feature')
    long_ur_mapping = self._generate_user_relation_scores(users)
    if len(short_ur_mapping) != len(long_ur_mapping):
        # NOTE: should be logged as well once monitoring is in place
        raise Exception('short/long user-relation mappings differ in size')
    update_time = get_global_cal_time()
    default_short_score, default_long_score = calculate_scores(
        0, 0, self.published_at, self.quality, self.hot, update_time)
    seq_id = str(self.seq_id)
    pipeline = warm_conn.pipeline()
    # default (anonymous) user first: plain quality/hot/time scores
    long_key = default_key_user.rec_longterm_dataset_key
    pipeline.zadd(long_key, seq_id, default_long_score)
    for long_cate_rec in default_key_user.get_longterm_dataset_key(self.category):
        pipeline.zadd(long_cate_rec, seq_id, default_long_score)
    short_key = default_key_user.rec_shortterm_dataset_key
    # was default_long_score; the short-term key takes the short-term score
    pipeline.zadd(short_key, seq_id, default_short_score)
    for short_cate_rec in default_key_user.get_shortterm_dataset_key(self.category):
        pipeline.zadd(short_cate_rec, seq_id, default_short_score)
    # per-user: user-relation similarity dominates, defaults break ties
    for u in users:
        short_ur_score = short_ur_mapping[u.seq_id]
        long_ur_score = long_ur_mapping[u.seq_id]
        short_score = 1000 * short_ur_score + default_short_score
        long_score = 1000 * long_ur_score + default_long_score
        pipeline.zadd(u.rec_longterm_dataset_key, seq_id, long_score)
        for long_cate_rec in u.get_longterm_dataset_key(self.category):
            pipeline.zadd(long_cate_rec, seq_id, long_score)
        pipeline.zadd(u.rec_shortterm_dataset_key, seq_id, short_score)
        for short_cate_rec in u.get_shortterm_dataset_key(self.category):
            pipeline.zadd(short_cate_rec, seq_id, short_score)
    pipeline.execute()
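# calculate_scores and get_global_cal_time live elsewhere in the repo; the
# sketch below only illustrates the contract assumed above: the first two
# arguments are user-relation terms (zero for the cold defaults) and the
# result is a (short, long) pair of time-decayed quality/hot scores. The
# decay constants, the formula, and the datetime arithmetic (published_at
# and update_time assumed to be datetimes) are illustrative assumptions,
# not the repo's actual scoring.
def _calculate_scores_sketch(short_ur, long_ur, published_at, quality, hot, update_time):
    import math
    age_hours = max((update_time - published_at).total_seconds() / 3600.0, 0.0)
    qht = quality + hot
    short_score = short_ur + qht * math.exp(-age_hours / 24.0)   # fast decay
    long_score = long_ur + qht * math.exp(-age_hours / 168.0)    # slow decay
    return short_score, long_score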
def clear_dataset(self, earliest_article, useless_ids):
    '''Clear outdated members from every warm dataset key.'''
    if not earliest_article and not useless_ids:
        return
    if earliest_article:
        # age-based prune takes precedence when both arguments are given
        filter_func = lambda x: int(x) < earliest_article.seq_id
    else:
        filter_func = lambda x: int(x) in useless_ids
    key_ids_pairs = self.get_all_key_ids_iteritems()
    pipeline = warm_conn.pipeline()
    for key, ids in key_ids_pairs:
        if not ids:
            continue
        ids_to_del = [int(_id) for _id in ids if filter_func(_id)]
        if ids_to_del:
            pipeline.zrem(key, *ids_to_del)
    pipeline.execute()
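# Hedged usage sketch for clear_dataset; `oldest_live_article` and the seq_id
# values are illustrative placeholders, not names from this repo.
def _prune_example(user, oldest_live_article):
    # age-based prune: drop every member whose seq_id predates the cutoff
    user.clear_dataset(oldest_live_article, set())
    # explicit prune: drop only the listed seq_ids
    user.clear_dataset(None, set([1024, 2048]))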
def remove_from_candidateset(self, user):
    keys = user.get_category_keys(self.category)
    pipeline = warm_conn.pipeline()
    for key in keys:
        pipeline.zrem(key, self.seq_id)
    pipeline.execute()