def main(argv):
    """Entry point for the product-template consolidation batch.

    :param argv: command-line arguments; argv[0] is the environment name (required),
        argv[1] (optional) is either a comma-separated list of product ids or the
        literal "--all" to consolidate all products.
    :return: a dict {"success": bool} with an additional "message" key on failure.
    """
    if not argv:
        msg = "You must specify the environment"
        log.error(msg)
        return {"success": False, "message": msg}

    # Pre-bind so the except-handler below can safely test whether we got far
    # enough to record a "failed" status (original code raised NameError when
    # init_session itself failed).
    session = None
    timestamp = None
    try:
        # command-line arguments
        env = argv[0]
        session = init_session(env)
        product_ids = None
        if len(argv) >= 2:
            product_ids = argv[1]
            if product_ids != "--all":
                # "--all" is passed through as-is; anything else is a csv of ids
                product_ids = product_ids.split(",")

        timestamp = session.get_present_date()
        start = time()

        latest_run = session.data_proxy.fetch_latest_batch_info_product_template_consolidation()
        if latest_run and latest_run.get("status") == "running":
            # Guard against overlapping batches.
            msg = "An old consolidation batch is still running. Won't start another one."
            log.info(msg)
            return {"success": False, "message": msg}

        session.data_proxy.save_timestamp_product_template_consolidation(status="running", timestamp=timestamp)

        consolidate_product_templates(session, product_ids)
        session.data_proxy.ensure_indexes_cache()

        elapsed_time = time() - start
        session.data_proxy.save_timestamp_product_template_consolidation(
            status="success", timestamp=timestamp, elapsed_time=elapsed_time
        )
        return {"success": True}

    except Exception:
        log.exception("Exception on {0}:".format(__name__))
        # Only record the failure if we got far enough to have a session/timestamp.
        if session is not None and timestamp is not None:
            session.data_proxy.save_timestamp_product_template_consolidation(status="failed", timestamp=timestamp)
        return {"success": False, "message": traceback.format_exc()}
def generate_templates(session_context):
    """Regenerate collaborative product templates from scratch.

    First recomputes the product-product strengths, then consolidates the
    cached collaborative templates (tfidf templates are left untouched).

    :param session_context: The session context.
    """
    generate_strengths(session_context)
    consolidate_product_templates(session_context, tfidf=False, collaborative=True)
def update_templates(
    session_context,
    new_activity,
    u_p_activities_summary=None,
    first_impression_date=None,
    should_lookup_activities_summary=True,
    should_lookup_first_impression=True,
):
    """
    Updates product x product strengths based on a single new activity.

    :param session_context: The session context.
    :param new_activity: a dict {"external_user_id": user_id, "external_product_id": product_id,
        "activity": activity_type, "created_at": datetime}.
    :param u_p_activities_summary: The summary of activities for that (user, product) pair, if any,
        in the form of a dict {"external_user_id": the user id, "external_product_id": the product id,
        "activity": the latest activity type, "created_at": the datetime of the latest activity,
        "pp_latest_type": the type of the latest activity to be processed for that pair during
        p-p strengths calculation, "pp_latest_date": the date of the latest activity to be processed
        for that pair during p-p strengths calculation}.
    :param first_impression_date: The date of the first impression, if any, the activity user has
        received on the activity product.
    :param should_lookup_activities_summary: If True and previous_activity is None, it queries the
        database for the previous activity.
    :param should_lookup_first_impression: If True and first_impression_date is None, it queries the
        database for the first impression.
    """
    log.info("Computing product-product strengths...")

    user = new_activity["external_user_id"]
    if config.is_anonymous(user):
        log.info("Anonymous users should not affect product-product strengths! Exiting now.")
        return

    product = new_activity["external_product_id"]
    activity_date = new_activity["created_at"]
    activity_type = new_activity["activity"]
    # Maps the activity type to its numeric rating; unknown types are rejected.
    rating = session_context.rating_by_activity.get(activity_type)
    if rating is None:
        log.error("Unsupported activity type: %s" % activity_type)
        return

    # The cutoff date bounds how far back activities are considered; the persisted cutoff
    # from the latest from-scratch batch wins when it is more recent.
    suggested_cutoff_date = session_context.get_present_date() - dt.timedelta(
        session_context.product_product_strengths_window
    )
    latest_batch_info = session_context.data_proxy.fetch_latest_batch_info_product_product_strengths()
    if latest_batch_info is not None:
        latest_batch_timestamp = latest_batch_info["timestamp"]
        persisted_cutoff_date = latest_batch_info.get("cutoff_date")
        if persisted_cutoff_date is None:
            cutoff_date = suggested_cutoff_date
        else:
            cutoff_date = max(persisted_cutoff_date, suggested_cutoff_date)
    else:
        latest_batch_timestamp = None
        cutoff_date = suggested_cutoff_date

    # Looks up the first impression of this user on this product, if impressions are on
    # and the caller did not provide it.
    if session_context.impressions_enabled and first_impression_date is None and should_lookup_first_impression:
        product_user_impressions_summary = (
            session_context.data_proxy.fetch_impressions_summary(
                product_ids=[product], user_ids=[user], group_by_product=True, anonymous=False
            )
            .get(product, {})
            .get(user, (0, None))  # default: (count 0, no first-impression date)
        )
        first_impression_date = product_user_impressions_summary[1]

    # Looks up the (user, product) activity summary, if the caller did not provide it.
    if u_p_activities_summary is None and should_lookup_activities_summary:
        u_p_activities_summary_as_singleton_list = session_context.data_proxy.fetch_activity_summaries_by_user(
            user_ids=[user], product_ids=[product], indexed_fields_only=False, anonymous=False
        ).get(user, [])
        if len(u_p_activities_summary_as_singleton_list) > 0:
            u_p_activities_summary = u_p_activities_summary_as_singleton_list[0]

    # Rating of the previously processed activity for this (user, product) pair, if any.
    previous_activity_rating = 0
    if u_p_activities_summary is not None:
        previous_activity_type = u_p_activities_summary.get("pp_latest_type")
        if previous_activity_type is not None:
            previous_activity_rating = session_context.rating_by_activity[previous_activity_type]
            previous_activity_date = u_p_activities_summary["pp_latest_date"]

    if previous_activity_rating == rating and not session_context.impressions_enabled:
        return  # repeating the latest activity --- there is nothing to do here
        # (if using impressions, must recalculate anyway to account for latest impressions)

    # numerator_diff holds the [CONSERVATIVE, AGGRESSIVE] deltas (indices 0 and 1, per the
    # list construction order used below); denominator_diff is the recommendability delta.
    numerator_diff = [0, 0]
    denominator_diff = 0

    # The previous activity only contributed if it reached at least one of the thresholds.
    remove_previous_activity_contribution = previous_activity_rating >= min(
        session_context.min_rating_conservative, session_context.min_rating_recommendable_from_product
    )
    if remove_previous_activity_contribution:
        if session_context.impressions_enabled:
            if first_impression_date is not None:
                # must remove former contribution if impression was already processed incrementally
                remove_previous_activity_contribution = previous_activity_date >= first_impression_date
                # must remove also if generation from scratch happened after the first impression
                if not remove_previous_activity_contribution and latest_batch_timestamp is not None:
                    remove_previous_activity_contribution = latest_batch_timestamp >= first_impression_date

    # Removes the former contribution of the previous commanding activity for that (user, product) pair.
    if remove_previous_activity_contribution:
        if previous_activity_rating >= session_context.min_rating_conservative:
            numerator_diff[CONSERVATIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_aggressive:
            numerator_diff[AGGRESSIVE] -= 1
        if previous_activity_rating >= session_context.min_rating_recommendable_from_product:
            denominator_diff -= 1

    # Adds the contribution of this activity.
    if rating >= session_context.min_rating_conservative:
        numerator_diff[CONSERVATIVE] += 1
    if rating >= session_context.min_rating_aggressive:
        numerator_diff[AGGRESSIVE] += 1
    if rating >= session_context.min_rating_recommendable_from_product:
        denominator_diff += 1

    # Fetches all the products consumed by this user.
    products_by_rating = session_context.data_proxy.fetch_products_by_rating_by_user(
        user_ids=[user], min_date=cutoff_date, max_date=session_context.get_present_date()
    )[0].get(user, {})

    # Includes the product of the current activity (remember: this activity might not have been saved yet)
    products_set = products_by_rating.get(rating, set())
    products_set.add(product)
    products_by_rating[rating] = products_set
    if u_p_activities_summary is not None:
        # The product's previous rating bucket must no longer contain it.
        products_set = products_by_rating.get(previous_activity_rating, set())
        if product in products_set:
            products_set.remove(product)
            products_by_rating[previous_activity_rating] = products_set

    # Partitions this user's products by the rating thresholds (ratings are bucketed 1..5;
    # range upper bound 6 is exclusive).
    products_rated_conservatively_high = set()
    for r in range(session_context.min_rating_conservative, 6):
        products_rated_conservatively_high |= products_by_rating.get(r, set())
    products_rated_aggressively_high = set()
    for r in range(session_context.min_rating_aggressive, 6):
        products_rated_aggressively_high |= products_by_rating.get(r, set())
    products_rated_sufficiently_for_recommendation = set()
    for r in range(session_context.min_rating_recommendable_from_product, 6):
        products_rated_sufficiently_for_recommendation |= products_by_rating.get(r, set())

    numerators_with_product_as_template = None
    denominators_with_product_as_template = None
    numerators_with_product_as_base = None
    denominators_with_product_as_base = None

    # Maps (base_product, template_product) -> partial strength docs to be persisted.
    strengths_map_for_insert = {}
    strengths_map_for_update = {}

    # This product as TEMPLATE

    # If this product has been consumed by this user without previous impressions, then it shall not contribute
    # for product-product strengths with this product as template.
    update_product_as_template = True
    if session_context.impressions_enabled:
        update_product_as_template = first_impression_date is not None

    # Existing pairs with product as template.
    if update_product_as_template and numerator_diff != [0, 0]:
        strength_operands_with_product_as_template = session_context.data_proxy.fetch_product_product_strength_operands(
            templates=[product]
        )
        numerators_with_product_as_template = strength_operands_with_product_as_template[0]
        denominators_with_product_as_template = strength_operands_with_product_as_template[1]
        for product_and_template, numerator_tuple in numerators_with_product_as_template.items():
            base_product = product_and_template[0]
            if base_product in products_rated_sufficiently_for_recommendation:
                new_numerator_tuple = [numerator_tuple[0] + numerator_diff[0], numerator_tuple[1] + numerator_diff[1]]
                numerators_with_product_as_template[product_and_template] = new_numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as template.
    if update_product_as_template and numerator_diff[0] == 1:  # if this user has *just* rated this product high...
        new_base_products = []
        for base_product in products_rated_sufficiently_for_recommendation:
            if base_product != product and (base_product, product) not in numerators_with_product_as_template:
                new_base_products += [base_product]
                new_numerator_tuple = [
                    1 if rating >= session_context.min_rating_conservative else 0,
                    1 if rating >= session_context.min_rating_aggressive else 0,
                ]
                numerators_with_product_as_template[(base_product, product)] = new_numerator_tuple
                update_doc = strengths_map_for_insert.get((base_product, product), {})
                update_doc["nc"] = new_numerator_tuple[CONSERVATIVE]
                update_doc["na"] = new_numerator_tuple[AGGRESSIVE]
                strengths_map_for_insert[(base_product, product)] = update_doc
        users_by_rating_by_new_base_product = session_context.data_proxy.fetch_users_by_rating_by_product(
            product_ids=new_base_products, min_date=cutoff_date, max_date=session_context.get_present_date()
        )[0]
        for new_base_product in new_base_products:
            source_users = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                source_users |= users_by_rating_by_new_base_product[new_base_product][r]
            if session_context.impressions_enabled:
                # Retrieves the intersection of the top-rated users of the base product
                # with the users with impressions for the template product
                source_users_with_impressions = session_context.data_proxy.fetch_users_with_impressions_by_product(
                    product_ids=[product], user_ids=list(source_users), anonymous=False
                ).get(product, set())
                new_denominator = len(source_users_with_impressions)
            else:
                new_denominator = len(source_users)
            denominators_with_product_as_template[(new_base_product, product)] = new_denominator
            insert_doc = strengths_map_for_insert.get((new_base_product, product), {})
            insert_doc["denominator"] = new_denominator
            strengths_map_for_insert[(new_base_product, product)] = insert_doc

    # This product as BASE PRODUCT

    # Existing pairs with product as base product.
    if session_context.bidirectional_pp_strength_updates and denominator_diff != 0:
        product_product_strength_operands = session_context.data_proxy.fetch_product_product_strength_operands(
            products=[product]
        )
        numerators_with_product_as_base = product_product_strength_operands[0]
        denominators_with_product_as_base = product_product_strength_operands[1]
        for product_and_template in denominators_with_product_as_base:
            # updates the denominator...
            denominator = denominators_with_product_as_base[product_and_template]
            new_denominator = denominator + denominator_diff
            denominators_with_product_as_base[product_and_template] = new_denominator
            update_doc = strengths_map_for_update.get(product_and_template, {})
            update_doc["denominator"] = new_denominator
            strengths_map_for_update[product_and_template] = update_doc
            # ...and the numerator, in case the template product has been consumed by this user
            if (
                product_and_template[1] in products_rated_conservatively_high
                and product_and_template in numerators_with_product_as_base
            ):
                numerator_tuple = numerators_with_product_as_base[product_and_template]
                numerator_tuple[CONSERVATIVE] += denominator_diff
                if product_and_template[1] in products_rated_aggressively_high:
                    numerator_tuple[AGGRESSIVE] += denominator_diff
                numerators_with_product_as_base[product_and_template] = numerator_tuple
                update_doc = strengths_map_for_update.get(product_and_template, {})
                update_doc["nc"] = numerator_tuple[CONSERVATIVE]
                update_doc["na"] = numerator_tuple[AGGRESSIVE]
                strengths_map_for_update[product_and_template] = update_doc

    # New pairs with product as base product.
    if session_context.bidirectional_pp_strength_updates and denominator_diff == 1:
        # if this product has *just* been rated at least conservatively high...
        new_templates = []
        for template in products_rated_conservatively_high:
            if template != product and (product, template) not in denominators_with_product_as_base:  # new pair
                new_templates += [template]
        if len(new_templates) > 0:
            users_of_product_as_base = session_context.data_proxy.fetch_users_by_rating_by_product(
                product_ids=[product], min_date=cutoff_date, max_date=session_context.get_present_date()
            )[0].get(product, {})
            # Includes the user of the current activity (remember again: this activity might not have been saved yet)
            users_set = users_of_product_as_base.get(rating, set())
            users_set.add(user)
            users_of_product_as_base[rating] = users_set
            recommending_users_of_product_as_base = set()
            for r in range(session_context.min_rating_recommendable_from_product, 6):
                recommending_users_of_product_as_base |= users_of_product_as_base.get(r, set())
            if session_context.impressions_enabled:
                user_impressions_by_template = session_context.data_proxy.fetch_impressions_summary(
                    product_ids=new_templates,
                    user_ids=list(recommending_users_of_product_as_base),
                    group_by_product=True,
                    anonymous=False,
                )
            for new_template in new_templates:
                if session_context.impressions_enabled:
                    new_denominator = len(user_impressions_by_template.get(new_template, []))
                else:
                    new_denominator = len(recommending_users_of_product_as_base)
                denominators_with_product_as_base[(product, new_template)] = new_denominator
                insert_doc = strengths_map_for_insert.get((product, new_template), {})
                insert_doc["denominator"] = new_denominator
                strengths_map_for_insert[(product, new_template)] = insert_doc
            for new_template in new_templates:
                if new_template in products_rated_conservatively_high:
                    numerator_tuple = numerators_with_product_as_base.get((product, new_template), [0, 0])
                    numerator_tuple[CONSERVATIVE] += 1
                    if new_template in products_rated_aggressively_high:
                        numerator_tuple[AGGRESSIVE] += 1
                    numerators_with_product_as_base[(product, new_template)] = numerator_tuple
                    insert_doc = strengths_map_for_insert.get((product, new_template), {})
                    insert_doc["nc"] = numerator_tuple[CONSERVATIVE]
                    insert_doc["na"] = numerator_tuple[AGGRESSIVE]
                    strengths_map_for_insert[(product, new_template)] = insert_doc

    # Computes all affected strengths for UPDATE
    if len(strengths_map_for_update) > 0:
        _prepare_strengths_map(
            session_context,
            product,
            strengths_map_for_update,
            numerators_with_product_as_base,
            denominators_with_product_as_base,
            numerators_with_product_as_template,
            denominators_with_product_as_template,
        )
        log.info("Saving product-product strengths (UPDATE)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_update, upsert=True)
        log.info("[{0}] product-product strengths updated".format(len(strengths_map_for_update)))
    else:
        log.info("No old strengths to update.")

    # Computes all affected strengths for INSERT
    if len(strengths_map_for_insert) > 0:
        _prepare_strengths_map(
            session_context,
            product,
            strengths_map_for_insert,
            numerators_with_product_as_base,
            denominators_with_product_as_base,
            numerators_with_product_as_template,
            denominators_with_product_as_template,
        )
        log.info("Saving product-product strengths (INSERT)...")
        session_context.data_proxy.save_pp_strengths(strengths_map_for_insert, upsert=False)
        log.info("[{0}] product-product strengths inserted".format(len(strengths_map_for_insert)))
    else:
        log.info("No new strengths to insert.")

    # Consolidates cached product templates
    log.info("Determining products whose templates must be consolidated...")
    products_to_consolidate = {product_and_template[0] for product_and_template in strengths_map_for_insert}
    updated_products = {product_and_template[0] for product_and_template in strengths_map_for_update}
    old_templates_map = session_context.data_proxy.fetch_product_templates(list(updated_products))
    for product_and_template, strength_doc in strengths_map_for_update.items():
        base_product = product_and_template[0]
        template_product = product_and_template[1]
        cutoff_strength = 0
        old_template_ids = set()
        old_templates = old_templates_map.get(base_product)
        if old_templates:
            # NOTE(review): old_templates[0] appears to hold the collaborative templates as
            # (strength, product_id) pairs sorted strongest-first -- confirm against fetch_product_templates.
            old_templates_collaborative = old_templates[0]
            if old_templates_collaborative:
                cutoff_strength = old_templates_collaborative[-1][0]  # the strength of the weakest template
                if isinstance(cutoff_strength, str):
                    cutoff_strength = 0
                old_template_ids = {t[1] for t in old_templates_collaborative}
        # Consolidate only when the new strength can change the cached template list:
        # it beats the weakest cached template, touches an already-cached template,
        # or the cache is not yet full.
        if (
            strength_doc["strength"] > cutoff_strength
            or template_product in old_template_ids
            or len(old_template_ids) < 3 * session_context.product_templates_count
        ):
            products_to_consolidate.add(base_product)

    if session_context.should_consolidate_product_templates_on_the_fly:
        if len(products_to_consolidate) > 0:
            log.info("Consolidating templates of %d products..." % len(products_to_consolidate))
            consolidate_product_templates(
                session_context, products_list=list(products_to_consolidate), collaborative=True, tfidf=False
            )
        else:
            log.info("No products with templates to consolidate.")

    # Records this activity as the latest one processed for p-p strengths.
    session_context.data_proxy.save_latest_activity_for_product_product_strengths(
        user, product, activity_type, activity_date
    )
    log.info("PP strengths and templates updated successfully.")
def update_templates(session_context, product_id, language, tfidf_by_top_term_by_attribute):
    """
    Updates product-product strengths based on their content.

    The attributes which are taken into consideration are those defined in the customer config file
    PRODUCT_MODEL entry. Product attributes whose 'similarity_filter' is set to true must be equal
    so that two products must have non-zero mutual similarity. Product attributes whose
    'similarity_weight' is strictly positive are linearly combined according to the assigned weights.

    This function does not recreate all strengths from scratch; rather, it updates the strengths of
    all product-product pairs containing the product whose *product_id* is given.

    NOTE(review): this shadows the activity-based update_templates defined earlier in this file;
    presumably the two functions originate from different modules -- confirm before importing both.

    :param session_context: The session context.
    :param product_id: The intended product.
    :param language: The language of the product being processed.
    :param tfidf_by_top_term_by_attribute: A map {attribute: {term: tfidf}}, containing the TFIDF's
        of the top TFIDF terms in each of the TEXT-type attribute of the product being processed.
    """
    strengths = {}
    text_fields = session_context.product_text_fields
    cutoff_date = session_context.get_present_date() - dt.timedelta(
        session_context.product_product_strengths_tfidf_window)
    product_models = {}

    # Processes each TEXT attribute.
    for attribute in text_fields:
        weight = session_context.similarity_weights_by_type[pm.TEXT].get(attribute, 0)
        if weight == 0:
            # Attribute does not participate in the linear combination.
            continue
        log.info("Fetching products with common terms in attribute [%s]..." % attribute)
        terms = [term for term in tfidf_by_top_term_by_attribute.get(attribute, [])]
        new_product_models = session_context.data_proxy.fetch_product_models_for_top_tfidf_terms(
            attribute, language, terms, min_date=cutoff_date, max_date=session_context.get_present_date())
        product_models.update(new_product_models)
        if len(new_product_models) > 1:
            # we require at least one product model other than that of the current product
            product_ids_list = [p_id for p_id in new_product_models]
            log.info("Fetching TFIDF maps for attribute [%s] in [%d] products..."
                     % (attribute, len(product_ids_list)))
            tfidf_by_term_by_product = session_context.data_proxy.fetch_tfidf_map(attribute, product_ids_list)
            log.info("Computing strengths...")
            _process_text_attribute_contributions(strengths, tfidf_by_term_by_product, weight, product_id)

    # Processes the non-TEXT attributes.
    _process_non_text_attributes_contributions(session_context, product_models, strengths)

    # Persists the updated strengths; values below the acceptance threshold are zeroed out.
    log.info("Saving strengths tfidf...")
    strengths_list = [{"product": product_pair[0],
                       "template_product": product_pair[1],
                       "strength": value if value >= MIN_ACCEPTABLE_PP_STRENGTH_TFIDF else 0}
                      for product_pair, value in strengths.items()]
    session_context.data_proxy.save_product_product_strengths_tfidf(strengths_list)

    # Consolidates cached product templates
    log.info("Determining products whose templates tfidf must be consolidated...")
    products_to_consolidate = set()
    updated_products = {product_and_template[0] for product_and_template in strengths}
    old_templates_map = session_context.data_proxy.fetch_product_templates(list(updated_products))
    for product_and_template, strength in strengths.items():
        base_product = product_and_template[0]
        template_product = product_and_template[1]
        should_consolidate = True
        old_templates = old_templates_map.get(base_product)
        if old_templates is not None:
            if len(old_templates[1]) > 0:
                cutoff_strength = old_templates[1][-1][0]  # the strength of the weakest template tfidf
                old_template_ids = {t[1] for t in old_templates[1]}
                # Skip consolidation only when the new strength cannot change the cached
                # tfidf template list: it does not beat the weakest cached template, does
                # not touch a cached template, and the cache is already full.
                # BUGFIX: compared len(old_templates) (the whole templates entry, a small
                # constant) instead of len(old_template_ids) -- the collaborative path uses
                # len(old_template_ids), so the skip branch was effectively unreachable.
                if strength <= cutoff_strength and \
                        template_product not in old_template_ids and \
                        len(old_template_ids) >= 3 * session_context.product_templates_count:
                    should_consolidate = False
        if should_consolidate:
            products_to_consolidate.add(base_product)

    if len(products_to_consolidate) > 0:
        log.info("Consolidating templates of %d products..." % len(products_to_consolidate))
        consolidate_product_templates(session_context, products_list=list(products_to_consolidate),
                                      collaborative=False, tfidf=True)