def test_user_user_strengths_incremental_with_new_impressions_identified_users(self):
    """ Tests whether the user x user strengths generated on a step-by-step basis match
        exactly those created from scratch.
    """
    scenarios = [("u_esp_4", "p_nonsense_1", "p_empty_with_missing_category", "p_filter_2", "buy")]
    base_date = dateutil.parser.parse("1988-11-06 6:00:00")

    for idx, (user, product1, product2, product3, activity_type) in enumerate(scenarios):
        # Saves a couple of impressions for the chosen user
        impression_date = pytz.utc.localize(base_date + dt.timedelta(seconds=(2 * idx + 1)))
        for impressed_product in (product1, product2):
            self.db_proxy.increment_impression_summary(user_id=user,
                                                       product_id=impressed_product,
                                                       date=impression_date,
                                                       anonymous=False)
        # it is important to regenerate from scratch (with all new impressions)
        ut.generate_templates(self.session_context)

        # Saves one activity for that same user
        activity_date = pytz.utc.localize(base_date + dt.timedelta(seconds=(2 * idx + 2)))
        activity = {"external_user_id": user,
                    "external_product_id": product3,
                    "activity": activity_type,
                    "created_at": activity_date}
        ut.update_templates(self.session_context, activity)
        tasks.update_summaries(self.session_context, activity)

        self.compare_incremental_vs_from_scratch(
            target_users=[user] if self.session_context.impressions_enabled else None)
def compare_incremental_vs_from_scratch(self, target_users=None):
    """ Helper method to compare strengths generated incrementally vs from-scratch.
        When using impressions we only care about target users whose activities triggered the updates.

        :param target_users: If not None, then only target users informed in this set will be considered.
    """
    def check_strengths(incremental, from_scratch, pair_of_users):
        # Strengths are floats, so they are compared up to a small absolute tolerance.
        incremental_strength = incremental[pair_of_users]
        scratch_strength = from_scratch[pair_of_users]
        nose.tools.ok_(
            abs(incremental_strength - scratch_strength) < 0.00001,
            "Strengths do not match for %s: [incremental --> %.6f] [from scratch --> %.6f]"
            % (pair_of_users, incremental_strength, scratch_strength))

    def check_templates(incremental, from_scratch, user):
        # Templates are compared for strict equality.
        nose.tools.eq_(
            incremental[user], from_scratch[user],
            "Templates do not match for %s: [incremental --> %s] [from scratch --> %s]"
            % (user, incremental[user], from_scratch[user]))

    users = list(self.db_proxy.fetch_all_user_ids()) if target_users is None else target_users

    # saves locally the strengths and the templates that were obtained incrementally
    strengths_incremental = self.db_proxy.fetch_user_user_strengths()
    templates_incremental = self.db_proxy.fetch_user_templates(users)

    # regenerates all strengths from scratch
    ut.generate_templates(self.session_context)

    # saves locally the strengths and the templates that were obtained from scratch
    strengths_from_scratch = self.db_proxy.fetch_user_user_strengths()
    templates_from_scratch = self.db_proxy.fetch_user_templates(users)

    for pair in strengths_from_scratch:
        # The incremental (on-the-fly) updates take care of "user as target" strengths only.
        # This is ok, since "user as template" updates will be triggered indirectly
        # (by other target users).
        if target_users is not None and pair[0] not in target_users:
            continue
        check_strengths(strengths_incremental, strengths_from_scratch, pair)
        check_templates(templates_incremental, templates_from_scratch, pair[0])

    for pair in strengths_incremental:
        if target_users is not None and pair[0] not in target_users:
            continue  # Idem.
        check_strengths(strengths_incremental, strengths_from_scratch, pair)
        check_templates(templates_incremental, templates_from_scratch, pair[0])
def test_user_user_strengths_incremental_with_new_impressions_random(self):
    """ Tests whether the user x user strengths generated on a step-by-step basis match
        exactly those created from scratch. This test saves several random activities in a row,
        checking whether all strengths were correctly updated.
    """
    if not tests.INCLUDE_RANDOM_TESTS:
        return

    all_users = list(self.db_proxy.fetch_all_user_ids())
    all_products = list(self.db_proxy.fetch_all_product_ids())
    base_date = pytz.utc.localize(dateutil.parser.parse("1988-11-06 6:00:00"))

    for trial in range(100):
        user = random.choice(all_users)
        is_anonymous = config.is_anonymous(user)
        print("user: %s" % user)

        # Saves a couple of impressions for the chosen user
        impression_date = base_date + dt.timedelta(seconds=2 * trial)
        product1 = random.choice(all_products)
        product2 = random.choice(all_products)
        for impressed_product in (product1, product2):
            self.db_proxy.increment_impression_summary(user_id=user,
                                                       product_id=impressed_product,
                                                       date=impression_date,
                                                       anonymous=is_anonymous)
        print("impressions --> %s, %s" % (product1, product2))
        # it is important to regenerate from scratch (with all new impressions)
        ut.generate_templates(self.session_context)

        # Saves one activity for that same user
        product3 = random.choice(all_products)
        activity_type = random.choice(self.session_context.supported_activities)
        activity = {"external_user_id": user,
                    "external_product_id": product3,
                    "activity": activity_type,
                    "created_at": base_date + dt.timedelta(seconds=2 * trial + 1)}
        print("activity --> %s" % activity)
        ut.update_templates(self.session_context, activity)
        tasks.update_summaries(self.session_context, activity)

        self.compare_incremental_vs_from_scratch(
            target_users=[user] if self.session_context.impressions_enabled else None)
def main(argv):
    """ Entry point: (re)generates user templates for a given environment.

        :param argv: Command-line arguments; argv[0] must be the environment name.
        :return: A dict with a "success" boolean and, on failure, a "message"
                 describing the problem.
    """
    # Idiomatic emptiness check (instead of len(argv) < 1).
    if not argv:
        msg = "You must specify the environment"
        log.error(msg)
        return {"success": False, "message": msg}
    try:
        # command-line arguments
        env = argv[0]
        session = init_session(env)
        ut.generate_templates(session)
        return {"success": True}
    except Exception:
        # Top-level boundary: log the full traceback and report it to the caller.
        log.exception('Exception on {0}:'.format(__name__))
        return {"success": False, "message": traceback.format_exc()}
def setup(self):
    """ Builds the base fixture, then generates the user templates the tests rely on. """
    super().setup()
    ut.generate_templates(self.session_context)
def setup(self):
    """ Builds the base fixture, populates impressions, then generates user templates. """
    super().setup()
    dp.populate_impressions(self.session_context)
    ut.generate_templates(self.session_context)
def test_near_identical(self):
    """ Tests that two products considered 'near-identical' are not recommended at the same time
        (within the same page) when the filtering strategy is AFTER_SCORING.
    """
    target = "u_tec_1"
    id_twin_product_1 = "p_tec_TWIN_1"
    id_twin_product_2 = "p_tec_TWIN_2"

    date = self.session_context.get_present_date() - dt.timedelta(days=1)
    # The twins share identical full_content; only their titles differ (by a few words),
    # which is what the near-identical filter on 'resources.title' is meant to catch.
    twin_product_1 = {"external_id": id_twin_product_1,
                      "language": "english",
                      "date": date,
                      "expiration_date": date + dt.timedelta(days=30),
                      "resources": {"title": "Whatever Gets You Through The Night"},
                      "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                      "category": "Nonsense"}
    twin_product_2 = {"external_id": id_twin_product_2,
                      "language": "english",
                      "date": date,
                      "expiration_date": date + dt.timedelta(days=30),
                      "resources": {"title": "Whatever Gets You Through This Night is Alright"},
                      "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                      "category": "Nonsense"}

    self.db_proxy.insert_product(twin_product_1)
    tasks.process_product(self.session_context, id_twin_product_1)
    self.db_proxy.insert_product(twin_product_2)
    tasks.process_product(self.session_context, id_twin_product_2)

    # makes it so that all users consume (and have impressions on) the twins, except for the target user
    users = self.db_proxy.fetch_all_user_ids()
    for user in users:
        if user != target:
            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_1,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)
            activity = {"external_user_id": user,
                        "external_product_id": id_twin_product_2,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)
            if self.session_context.impressions_enabled:
                is_anonymous = config.is_anonymous(user)
                self.db_proxy.increment_impression_summary(user, id_twin_product_1,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)
                self.db_proxy.increment_impression_summary(user, id_twin_product_2,
                                                           date=self.session_context.get_present_date(),
                                                           anonymous=is_anonymous)

    ut.generate_templates(self.session_context)
    pt.generate_templates(self.session_context)
    pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                      # otherwise the df's of the twins will be different.

    # First, we recommend WITHOUT near-identical filtering, to check that the twins really appear consecutively.
    custom_settings = {'near_identical_filter_field': None,
                       'near_identical_filter_threshold': None}
    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()
    if not recommender.is_hybrid():  # For hybrid recommenders, this check is meaningless.
        recommendations = recommender.recommend(100)
        # twin_index = -1 means "no twin seen yet"; once the first twin is found, the
        # second one must come at the very next position.
        twin_index = -1
        for idx, recommendation in enumerate(recommendations):
            if recommendation[1].startswith("p_tec_TWIN_"):
                if twin_index >= 0:
                    nose.tools.eq_(idx - twin_index, 1,
                                   "The two near-identical products should appear consecutively without filtering")
                    break
                twin_index = idx

    # Now we recommend WITH near-identical filtering
    recommendation_page_size = 5
    custom_settings = {'near_identical_filter_field': 'resources.title',
                       'near_identical_filter_threshold': 2,
                       'recommendations_page_size': recommendation_page_size}
    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()
    recommendations = recommender.recommend(100)

    # Sanity check
    recommended_products = {r[1] for r in recommendations}
    count_recommended_twins = len({id_twin_product_1, id_twin_product_2} & recommended_products)
    nose.tools.ok_(count_recommended_twins > 0,
                   "At least one of the twins should have been recommended, otherwise the test is meaningless")

    # Actual tests
    twin_index = -1 * recommendation_page_size - 1  # initial value, so the first twin passes the test
    for idx, recommendation in enumerate(recommendations):
        if recommendation[1].startswith("p_tec_TWIN_"):
            nose.tools.ok_(idx - twin_index > 1,  # it suffices to show that the twins have been separated
                           "Two near-identical products should not appear within the same recommendations page")
            twin_index = idx
def test_product_age_decay_exponential(self):
    """ Tests the effect of applying a product age decay factor based on an exponential function
        on recommendations. It applies to all recommendation heuristics.
    """
    target = "u_tec_1"
    id_twin_product_old = "p_tec_TWIN_OLD"
    id_twin_product_new = "p_tec_TWIN_NEW"

    # makes it so that the oldest twin is 2 days (the configured half life) older
    old_date = self.session_context.get_present_date() - dt.timedelta(days=2)
    new_date = self.session_context.get_present_date()

    # The twins must differ only by age: identical content (shared literal, so they cannot
    # drift apart), identical title — only the age decay can explain a score difference.
    twin_content = """Begin. Technology. Technology. This is all we got. End."""
    twin_product_old = {"external_id": id_twin_product_old,
                        "language": "english",
                        "date": old_date,
                        "expiration_date": old_date + dt.timedelta(days=30),
                        "resources": {"title": "Whatever Gets You Through The Night"},
                        "full_content": twin_content,
                        "category": "Nonsense"}
    twin_product_new = {"external_id": id_twin_product_new,
                        "language": "english",
                        "date": new_date,
                        "expiration_date": new_date + dt.timedelta(days=30),
                        "resources": {"title": "Whatever Gets You Through The Night"},
                        "full_content": twin_content,
                        "category": "Nonsense"}

    self.db_proxy.insert_product(twin_product_old)
    tasks.process_product(self.session_context, id_twin_product_old)
    self.db_proxy.insert_product(twin_product_new)
    tasks.process_product(self.session_context, id_twin_product_new)

    # makes it so that all users consume (and have impressions on) the twins, except for the target user
    users = self.db_proxy.fetch_all_user_ids()
    for user in users:
        if user == target:
            continue
        for twin_id in (id_twin_product_old, id_twin_product_new):
            activity = {"external_user_id": user,
                        "external_product_id": twin_id,
                        "activity": "buy",
                        "created_at": self.session_context.get_present_date()}
            tasks.update_summaries(self.session_context, activity)
        if self.session_context.impressions_enabled:
            is_anonymous = config.is_anonymous(user)
            for twin_id in (id_twin_product_old, id_twin_product_new):
                self.db_proxy.increment_impression_summary(
                    user, twin_id,
                    date=self.session_context.get_present_date(),
                    anonymous=is_anonymous)

    ut.generate_templates(self.session_context)
    pt.generate_templates(self.session_context)
    pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                      # otherwise the df's of the twins will be different.

    custom_settings = {'product_age_decay_function_name': 'exponential',
                       'product_age_decay_exponential_function_halflife': 2,
                       'near_identical_filter_field': None,
                       'near_identical_filter_threshold': None}  # Disables near-identical filtering

    session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
    session.refresh()
    recommender = session.get_recommender()

    # Determines the index of the first actual value in the score tuples produced by the
    # recommender (note that hybrid recommenders use the first position to indicate the
    # algorithm number)
    start_index = 1 if recommender.is_hybrid() else 0

    recommendations = recommender.recommend(100)
    nose.tools.ok_(len(recommendations) > 0, "No recommendations were returned!")

    strength_old_twin = None
    strength_new_twin = None
    for rec in recommendations:
        if rec[1] == id_twin_product_old:
            strength_old_twin = rec[0]
        if rec[1] == id_twin_product_new:
            strength_new_twin = rec[0]

    # Sanity check (fix): if either twin was not recommended, fail with a clear message
    # instead of crashing below with a TypeError on len(None).
    nose.tools.ok_(strength_old_twin is not None and strength_new_twin is not None,
                   "Both twins should have been recommended, otherwise the test is meaningless")

    # With a half life of 2 days, each score component of the 2-day-old twin must be
    # half of the corresponding component of the new twin (up to float tolerance).
    for i in range(start_index, len(strength_old_twin)):
        old_strength_value = strength_old_twin[i]
        new_strength_value = strength_new_twin[i]
        nose.tools.ok_(abs(old_strength_value / new_strength_value - 0.5) < tests.FLOAT_DELTA,
                       "Incorrect application of the product age decay")