def get_product_templates_tfidf(context, product_ids, blocked_products=None):
    """ Retrieves the top *n_templates* product templates per given product.

        :param context: A session context.
        :param product_ids: A list with the ids of the intended products.
        :param blocked_products: A list with ids of products that should not be fetched.

        :returns: A map {product_id: list of [strength, template_id] pairs}.
    """
    # Build a set once so each "is this template blocked?" check is O(1)
    # instead of an O(n) list scan per template.
    blocked = set(blocked_products) if blocked_products is not None else set()

    result = {}
    templates_map = context.data_proxy.fetch_product_templates(product_ids)
    for p_id, templates_tuple in templates_map.items():
        # templates_tuple[1] holds [strength, template_id] pairs; drop blocked templates.
        result[p_id] = [t for t in templates_tuple[1] if t[1] not in blocked]

    # Product models are needed by the near-identical filter below.
    if context.user_context is not None:
        # NOTE(review): this dict is updated in place further down, so it
        # doubles as a session-level cache of fetched models.
        product_models = context.product_models
    else:
        product_models = {}

    # Gather every product we may need a model for: the base products plus
    # all of their (non-blocked) templates.
    all_products = set(product_ids)
    for templates_with_strengths in result.values():
        all_products |= {t[1] for t in templates_with_strengths}

    products_with_missing_product_models = all_products - product_models.keys()
    # Short-circuit order preserved: ctx.AFTER_SCORING is only consulted when
    # there actually are models to fetch.
    if products_with_missing_product_models and context.filter_strategy == ctx.AFTER_SCORING:
        product_models.update(
            context.data_proxy.fetch_product_models(list(products_with_missing_product_models)))

    # Optionally discard templates flagged by the near-identical filter
    # (presumably near-identical w.r.t. the base product — confirm with the
    # pinpoint_near_identical_products contract).
    if (context.near_identical_filter_field is not None) and \
            (context.near_identical_filter_threshold is not None):
        for product_id, templates_with_strengths in result.items():
            # Only templates with a known product model can be compared.
            templates = [t[1] for t in templates_with_strengths if t[1] in product_models]
            templates_to_disregard = pinpoint_near_identical_products(
                context, templates, product_models, base_product_id=product_id)
            result[product_id] = [t for t in templates_with_strengths
                                  if t[1] not in templates_to_disregard]

    return result
def recommend(self, n_recommendations):
    """ Returns the top-scored recommendations for the target user.

        :param n_recommendations: The intended number of recommendations.
        :returns: A ranked list of [score, product_id] pairs (score is a list;
            hybrid recommenders carry per-algorithm entries in it).
    """
    start_time = time()
    log.info("Retrieving {0} recommendations for user [{1}]".format(
        n_recommendations, self.session_context.user_id))

    # Obtains the candidate products.
    # Hack to add some slack and make sure we bring enough products to overcome a
    # possible subsequent pruning by history decay, deleted and already consumed products.
    candidate_products_by_algorithm = self._gather_processed_candidate_products(
        max(500, 3 * n_recommendations))

    if self.session_context.filter_strategy == ctx.BEFORE_SCORING:
        # Cannot ask for more than the pre-filter allowed through.
        number_of_recommendations_to_ask_for = min(
            3 * n_recommendations, len(candidate_products_by_algorithm[PRE_FILTER]))
    else:
        # Here again we leave some slack, so we can post-process and still retain
        # the intended number of products.
        number_of_recommendations_to_ask_for = 3 * n_recommendations

    # Scores the products.
    scored_recommendations = self.gather_recommendation_scores(
        candidate_products_by_algorithm, number_of_recommendations_to_ask_for)
    if log.is_debug_enabled():
        log.debug('full recommendations: [{0}] => [{1}]'.format(
            len(scored_recommendations), scored_recommendations))
    else:
        log.info('full recommendations: [{0}]'.format(len(scored_recommendations)))

    # Post-processes the scores (boosts, decays, etc.).
    scored_recommendations = self.post_process_scores(scored_recommendations)
    if log.is_debug_enabled():
        log.debug('post-processed recommendations: [{0}] => [{1}]'.format(
            len(scored_recommendations), scored_recommendations))
    else:
        log.info('post-processed recommendations: [{0}]'.format(len(scored_recommendations)))

    # Makes sure that all pre-filtered products have made their way into the
    # recommendations list: any candidate the scorer dropped is appended with a
    # fill-in zero score, in random order.
    if self.session_context.filter_strategy == ctx.BEFORE_SCORING:
        all_candidates = candidate_products_by_algorithm[PRE_FILTER]
        if len(scored_recommendations) < len(all_candidates):
            recommended_products = {p[1] for p in scored_recommendations}
            missing_candidates = list(all_candidates - recommended_products)
            random.shuffle(missing_candidates)
            for missing_candidate in missing_candidates:
                fill_in_score = ["PRE-FILTER", 0] if self.is_hybrid() else [0]
                scored_recommendations += [[fill_in_score, missing_candidate]]

    should_worry_about_near_identical = \
        (self.session_context.near_identical_filter_field is not None) and \
        (self.session_context.near_identical_filter_threshold is not None)

    # Ranks. When the near-identical filter is active we rank twice as many
    # products, so enough non-near-identical ones remain to fill the page.
    slack_for_near_identical = 2 if should_worry_about_near_identical else 1
    ranked_recommendations = self._nlargest(
        slack_for_near_identical * n_recommendations, scored_recommendations)
    if log.is_debug_enabled():
        log.debug('ranked recommendations: [{0}] => [{1}]'.format(
            len(ranked_recommendations), ranked_recommendations))
    else:
        log.info('ranked recommendations: [{0}]'.format(len(ranked_recommendations)))

    # Identifies near-identical products within a same page and sends them to the end of the list.
    if should_worry_about_near_identical:
        products = [r[1] for r in ranked_recommendations]
        products_to_disregard = pinpoint_near_identical_products(
            self.session_context, products, self.session_context.product_models)
        result = []
        near_identical = []
        count_recommendations = 0
        for score_and_product in ranked_recommendations:
            product = score_and_product[1]
            if product in products_to_disregard:
                score = score_and_product[0]
                new_score = ["NI"] + score  # indicates it was decayed for being 'near-identical'
                near_identical += [(new_score, product)]
            else:
                result += [score_and_product]
                count_recommendations += 1
                if count_recommendations == n_recommendations:
                    break
        # If there were not enough non-near-identical products to fill the page,
        # fall back to the demoted ones.
        hole = n_recommendations - count_recommendations
        if hole > 0:
            result += near_identical[:hole]
        # Bug fix: log the *filtered* result here — the original logged the
        # pre-filter ranked_recommendations, making this message misleading.
        if log.is_debug_enabled():
            log.debug('recommendations after near-identical filter [count({0})] => [{1}]'.format(
                len(result), result))
        else:
            log.info('recommendations after near-identical filter [count({0})]'.format(
                len(result)))
    else:
        result = ranked_recommendations  # There was no need to filter near-identical products...

    log.info("Recommender{0} took [{1:2.6f}] seconds for user [{2}]".format(
        self.get_suffix(), time() - start_time, self.session_context.user_id))

    return result