Example #1
def main(argv):
    if len(argv) < 2:
        msg = "You must specify the environment and the document id"
        log.error(msg)
        return {"success": False, "message": msg}
    try:
        # command-line arguments
        env = argv[0]
        product = json.loads(argv[1])
        product_id = product.get("external_id")

        if product_id is None:
            msg = "Product has no external_id"
            log.error(msg)
            return {"success": False, "message": msg}

        session = init_session(env)

        maintenance.process_product(session, product_id, product=product, force_update=True)

    except Exception:
        log.exception('Exception on {0}:'.format(__name__))
        return {"success": False, "message": traceback.format_exc()}

    return {"success": True}
Example #2
def main(argv):
    if len(argv) < 2:
        msg = "You must specify the environment, " \
              "the product id (or --all, or --resume) and the number of days (or --complete) if using --all."
        log.error(msg)
        return {"success": False, "message": msg}
    try:
        # command-line arguments
        env = argv[0]
        product_id = argv[1]

        session = init_session(env)

        if product_id == "--all":
            if argv[2] == "--complete":
                days = None
            else:
                days = int(argv[2])
            maintenance.process_products(session, days)

        elif product_id == "--resume":
            maintenance.process_products(session, resume=True)

        else:
            force = len(argv) == 3 and argv[2] == '--force'
            maintenance.process_product(session, product_id, force_update=force)

    except Exception:
        log.exception('Exception on {0}:'.format(__name__))
        return {"success": False, "message": traceback.format_exc()}

    return {"success": True}
Example #3
    def test_tfidf_repeated_calls(self):
        """ Tests the df of (language, term) pairs after processing a same document several times.
        """
        maintenance.process_products(self.session_context)
        maintenance.process_product(self.session_context, "p_mus_4")
        maintenance.process_product(self.session_context, "p_mus_4")

        tfidf_by_term = self.db_proxy.fetch_tfidf_map(self.text_field, ["p_mus_4"]).get("p_mus_4", {})
        nose.tools.ok_(abs(tfidf_by_term.get("músic") - 1) < tests.FLOAT_DELTA)
Example #4
    def test_df_repeated_calls(self):
        """ Tests the df of (language, term) pairs after processing a same document several times.
        """
        maintenance.process_product(self.session_context, "p_mus_4")
        maintenance.process_product(self.session_context, "p_mus_4")

        nose.tools.eq_(self.db_proxy.find_df("portuguese", "rock"), 4)
        nose.tools.eq_(self.db_proxy.find_df("english", "rock"), 1)
        nose.tools.eq_(self.db_proxy.find_df("english", "merc"), 0)
Example #5
    def test_tf_repeated_calls(self):
        """ Tests the tf of the terms in a document after processing the document more than once.
        """
        product = "p_aut_1"

        maintenance.process_product(self.session_context, product)
        maintenance.process_product(self.session_context, product)

        tf_map = self.db_proxy.fetch_tf_map(self.text_field, [product]).get(product)
        nose.tools.eq_(tf_map["civic"], 2)
        nose.tools.eq_(tf_map["coroll"], 2)
        nose.tools.eq_(tf_map["merc"], 2)
        nose.tools.eq_(tf_map["consum"], 1)
    def test_product_product_strengths_tfidf_from_scratch_versus_incremental(self):
        """ Tests whether the product x product strengths (TFIDF) generated on a step-by-step basis
            match exactly those created from scratch.
        """
        # inner method to compare strengths
        def compare_strengths(pair_of_products):
            strength1 = strengths_incremental.get(pair_of_products, 0.0)
            strength2 = strengths_from_scratch[pair_of_products]
            nose.tools.ok_(
                abs(strength1 - strength2) < ACCEPTABLE_ON_THE_FLY_VS_FROM_SCRATCH_DEVIATION,
                "Strengths do not match for product pair (%s, %s): " % (pair_of_products[0], pair_of_products[1]) +
                "[incremental --> %.6f] [from scratch --> %.6f]" % (strength1, strength2))
        # ---

        # inner method to compare templates tfidf
        def compare_templates(product):
            templates1 = templates_incremental.get(product, (None, []))
            templates2 = templates_from_scratch.get(product, (None, []))
            nose.tools.eq_(len(templates1[1]), len(templates2[1]),
                           "Numbers of incremental and from-scratch templates do not match")
            for idx in range(len(templates1[1])):
                strength_incremental = templates1[1][idx][0]
                strength_from_scratch = templates2[1][idx][0]
                nose.tools.ok_(
                    abs(strength_incremental - strength_from_scratch) < ACCEPTABLE_ON_THE_FLY_VS_FROM_SCRATCH_DEVIATION,
                    "Templates do not approximately match for product %s: " % product +
                    "[incremental --> %s] [from scratch --> %s]" % (str(templates1), str(templates2)))
        # ---

        all_products = list(self.db_proxy.fetch_all_product_ids())

        sentence = " produto para teste de atualização de similaridade via tfidf"
        products = [{"external_id": product[0],
                     "resources": {"title": product[0]},
                     "date": self.session_context.get_present_date(),
                     "expiration_date": self.session_context.get_present_date() + dt.timedelta(days=30),
                     "full_content": product[1],
                     "language": "portuguese"} for product in
                    [("p_new_1", "Primeiro" + sentence),
                     ("p_new_2", "Segundo" + sentence),
                     ("p_new_3", "Terceiro" + sentence),
                     ("p_new_4", "Quarto" + sentence)]]

        # updates strengths after each new product
        for product in products:
            self.db_proxy.insert_product(product)
            maintenance.process_product(self.session_context, product["external_id"])

        # saves locally the strengths and the templates that were obtained incrementally
        strengths_incremental = self.db_proxy.fetch_product_product_strengths_tfidf()
        templates_incremental = self.db_proxy.fetch_product_templates(all_products)

        # regenerates all strengths from scratch
        pttfidf.generate_templates(self.session_context)

        # saves locally the strengths and the templates that were obtained from scratch
        strengths_from_scratch = self.db_proxy.fetch_product_product_strengths_tfidf()
        templates_from_scratch = self.db_proxy.fetch_product_templates(all_products)

        nose.tools.eq_(len(strengths_incremental), len(strengths_from_scratch),
                       "Number of non-zero strengths tfidf do not match")

        for product_pair in strengths_from_scratch:
            compare_strengths(product_pair)

        for product_pair in strengths_incremental:
            compare_strengths(product_pair)

        for product in all_products:
            compare_templates(product)
    def test_near_identical(self):
        """ Tests that two products considered 'near-identical' are not recommended at the same time
            (within the same page) when the filtering strategy is AFTER_SCORING.
        """
        target = "u_tec_1"

        id_twin_product_1 = "p_tec_TWIN_1"
        id_twin_product_2 = "p_tec_TWIN_2"

        date = self.session_context.get_present_date() - dt.timedelta(days=1)

        twin_product_1 = {"external_id": id_twin_product_1,
                          "language": "english",
                          "date": date,
                          "expiration_date": date + dt.timedelta(days=30),
                          "resources": {"title": "Whatever Gets You Through The Night"},
                          "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                          "category": "Nonsense"}

        twin_product_2 = {"external_id": id_twin_product_2,
                          "language": "english",
                          "date": date,
                          "expiration_date": date + dt.timedelta(days=30),
                          "resources": {"title": "Whatever Gets You Through This Night is Alright"},
                          "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                          "category": "Nonsense"}

        self.db_proxy.insert_product(twin_product_1)
        tasks.process_product(self.session_context, id_twin_product_1)
        self.db_proxy.insert_product(twin_product_2)
        tasks.process_product(self.session_context, id_twin_product_2)

        # makes it so that all users consume (and have impressions on) the twins, except for the target user
        users = self.db_proxy.fetch_all_user_ids()
        for user in users:
            if user != target:
                activity = {"external_user_id": user,
                            "external_product_id": id_twin_product_1,
                            "activity": "buy",
                            "created_at": self.session_context.get_present_date()}
                tasks.update_summaries(self.session_context, activity)

                activity = {"external_user_id": user,
                            "external_product_id": id_twin_product_2,
                            "activity": "buy",
                            "created_at": self.session_context.get_present_date()}
                tasks.update_summaries(self.session_context, activity)

                if self.session_context.impressions_enabled:
                    is_anonymous = config.is_anonymous(user)
                    self.db_proxy.increment_impression_summary(user,
                                                               id_twin_product_1,
                                                               date=self.session_context.get_present_date(),
                                                               anonymous=is_anonymous)
                    self.db_proxy.increment_impression_summary(user,
                                                               id_twin_product_2,
                                                               date=self.session_context.get_present_date(),
                                                               anonymous=is_anonymous)
        ut.generate_templates(self.session_context)
        pt.generate_templates(self.session_context)
        pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                          # otherwise the df's of the twins will be different.

        # First, we recommend WITHOUT near-identical filtering, to check that the twins really appear consecutively.

        custom_settings = {'near_identical_filter_field': None,
                           'near_identical_filter_threshold': None}

        session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
        session.refresh()

        recommender = session.get_recommender()

        # For hybrid recommenders, this check is meaningless.
        if not recommender.is_hybrid():
            recommendations = recommender.recommend(100)

            twin_index = -1
            for idx, recommendation in enumerate(recommendations):
                if recommendation[1].startswith("p_tec_TWIN_"):
                    if twin_index >= 0:
                        nose.tools.eq_(idx - twin_index, 1,
                                       "The two near-identical products should appear consecutively without filtering")
                        break
                    twin_index = idx

        # Now we recommend WITH near-identical filtering

        recommendation_page_size = 5
        custom_settings = {'near_identical_filter_field': 'resources.title',
                           'near_identical_filter_threshold': 2,
                           'recommendations_page_size': recommendation_page_size}

        session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
        session.refresh()

        recommender = session.get_recommender()
        recommendations = recommender.recommend(100)

        # Sanity check
        recommended_products = {r[1] for r in recommendations}
        count_recommended_twins = len({id_twin_product_1, id_twin_product_2} & recommended_products)
        nose.tools.ok_(count_recommended_twins > 0,
                       "At least one of the twins should have been recommended, otherwise the test is meaningless")

        # Actual tests
        twin_index = -1 * recommendation_page_size - 1  # initial value, so the first twin passes the test
        for idx, recommendation in enumerate(recommendations):
            if recommendation[1].startswith("p_tec_TWIN_"):
                nose.tools.ok_(idx - twin_index > 1,  # it suffices to show that the twins have been separated
                               "Two near-identical products should not appear within the same recommendations page")
                twin_index = idx
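The behaviour exercised above can be pictured as a post-scoring pass that assembles pages from the ranked list, pushing an item to a later page when it is near-identical to something already placed on the current one. The sketch below only illustrates that idea and is not the project's implementation; fill_pages and too_similar are made-up names, and too_similar stands in for whatever comparison is configured through near_identical_filter_field and near_identical_filter_threshold.

def fill_pages(ranked_ids, page_size, too_similar):
    """Greedy sketch: items are taken in rank order, but an item that is
    near-identical to one already on the current page is deferred."""
    pages = []
    remaining = list(ranked_ids)
    while remaining:
        page, deferred = [], []
        for item in remaining:
            if len(page) < page_size and not any(too_similar(item, placed) for placed in page):
                page.append(item)
            else:
                deferred.append(item)
        pages.append(page)
        remaining = deferred
    return pages


too_similar = lambda a, b: a.startswith("p_tec_TWIN_") and b.startswith("p_tec_TWIN_")
pages = fill_pages(["p_tec_TWIN_1", "x", "p_tec_TWIN_2", "y", "z"], 3, too_similar)
# The second twin is pushed off the first page: [['p_tec_TWIN_1', 'x', 'y'], ['p_tec_TWIN_2', 'z']]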
    def test_product_age_decay_exponential(self):
        """ Tests the effect of applying a product age decay factor based on an exponential
            function on recommendations. It applies to all recommendation heuristics.
        """
        target = "u_tec_1"

        id_twin_product_old = "p_tec_TWIN_OLD"
        id_twin_product_new = "p_tec_TWIN_NEW"

        # makes the old twin 2 days (the configured half-life) older than the new one
        old_date = self.session_context.get_present_date() - dt.timedelta(days=2)
        new_date = self.session_context.get_present_date()

        twin_product_old = {"external_id": id_twin_product_old,
                            "language": "english",
                            "date": old_date,
                            "expiration_date": old_date + dt.timedelta(days=30),
                            "resources": {"title": "Whatever Gets You Through The Night"},
                            "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                            "category": "Nonsense"}

        twin_product_new = {"external_id": id_twin_product_new,
                            "language": "english",
                            "date": new_date,
                            "expiration_date": new_date + dt.timedelta(days=30),
                            "resources": {"title": "Whatever Gets You Through The Night"},
                            "full_content": """Begin. Technology. Technology. This is all we got. End.""",
                            "category": "Nonsense"}

        self.db_proxy.insert_product(twin_product_old)
        tasks.process_product(self.session_context, id_twin_product_old)
        self.db_proxy.insert_product(twin_product_new)
        tasks.process_product(self.session_context, id_twin_product_new)

        # makes it so that all users consume (and have impressions on) the twins, except for the target user
        users = self.db_proxy.fetch_all_user_ids()
        for user in users:
            if user != target:
                activity = {"external_user_id": user,
                            "external_product_id": id_twin_product_old,
                            "activity": "buy",
                            "created_at": self.session_context.get_present_date()}
                tasks.update_summaries(self.session_context, activity)

                activity = {"external_user_id": user,
                            "external_product_id": id_twin_product_new,
                            "activity": "buy",
                            "created_at": self.session_context.get_present_date()}
                tasks.update_summaries(self.session_context, activity)

                if self.session_context.impressions_enabled:
                    is_anonymous = config.is_anonymous(user)
                    self.db_proxy.increment_impression_summary(user,
                                                               id_twin_product_old,
                                                               date=self.session_context.get_present_date(),
                                                               anonymous=is_anonymous)
                    self.db_proxy.increment_impression_summary(user,
                                                               id_twin_product_new,
                                                               date=self.session_context.get_present_date(),
                                                               anonymous=is_anonymous)

        ut.generate_templates(self.session_context)
        pt.generate_templates(self.session_context)
        pttfidf.generate_templates(self.session_context)  # Unfortunately we need to regenerate from scratch,
                                                          # otherwise the df's of the twins will be different.

        custom_settings = {'product_age_decay_function_name': 'exponential',
                           'product_age_decay_exponential_function_halflife': 2,
                           'near_identical_filter_field': None, 'near_identical_filter_threshold': None}

        # Disables near-identical filtering
        session = tests.init_session(user_id=target, custom_settings=custom_settings, algorithm=self.algorithm)
        session.refresh()

        recommender = session.get_recommender()

        # Determines the index of the first actual value in the score tuples
        # produced by the recommender (note that hybrid recommenders use the first
        # position to indicate the algorithm number)
        if recommender.is_hybrid():
            start_index = 1
        else:
            start_index = 0

        recommendations = recommender.recommend(100)
        nose.tools.ok_(len(recommendations) > 0, "No recommendations were returned!")

        strength_old_twin = None
        strength_new_twin = None

        for rec in recommendations:
            if rec[1] == id_twin_product_old:
                strength_old_twin = rec[0]
            if rec[1] == id_twin_product_new:
                strength_new_twin = rec[0]

        for i in range(start_index, len(strength_old_twin)):
            old_strength_value = strength_old_twin[i]
            new_strength_value = strength_new_twin[i]
            nose.tools.ok_(abs(old_strength_value / new_strength_value - 0.5) < tests.FLOAT_DELTA,
                           "Incorrect application of the product age decay")