Example #1
    def gather_candidate_products(self, n_recommendations):
        """ See barbante.recommendation.Recommender.
        """
        log.info(barbante_logging.PERF_BEGIN)

        n_algorithms = len(self.algorithms_including_fill_in)

        candidate_products_by_algorithm = {}
        try:
            with concurrent.futures.ThreadPoolExecutor(max_workers=n_algorithms) as executor:
                future_to_algorithm = {
                    executor.submit(wrap(self._gather_candidate_products), algorithm, n_recommendations): algorithm
                    for algorithm in self.algorithms_including_fill_in}
                for future in concurrent.futures.as_completed(future_to_algorithm,
                                                              timeout=self.session_context.recommendation_timeout):
                    candidate_products_by_algorithm.update(future.result())

        except concurrent.futures.TimeoutError as err:
            log.error("Specialist recommender timeout error: {0}".format(str(err)))
            log.info("Specialists that returned within time limit: {0}".format(candidate_products_by_algorithm.keys()))
            log.info("Specialists that timed out: {0}".format(
                set(self.algorithms_including_fill_in) - set(candidate_products_by_algorithm.keys())))

        log.info(barbante_logging.PERF_END)
        return candidate_products_by_algorithm
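The pattern in Example #1 (submit one task per specialist, collect whatever finishes before the deadline, and log the stragglers) can be reduced to a small self-contained sketch. The `slow_task` worker and the two-second deadline below are illustrative stand-ins, not part of barbante:

import concurrent.futures
import time


def slow_task(name, delay):
    """Illustrative worker: sleeps for `delay` seconds, then returns a one-entry dict."""
    time.sleep(delay)
    return {name: delay}


def gather_with_deadline(tasks, timeout_s=2.0):
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(tasks)) as executor:
        future_to_name = {executor.submit(slow_task, name, delay): name
                          for name, delay in tasks.items()}
        try:
            for future in concurrent.futures.as_completed(future_to_name, timeout=timeout_s):
                results.update(future.result())
        except concurrent.futures.TimeoutError:
            # Same bookkeeping as in Example #1: whatever is missing from `results` timed out.
            print("Timed out: {0}".format(set(future_to_name.values()) - set(results.keys())))
    return results


print(gather_with_deadline({"fast": 0.1, "slow": 5.0}))  # -> {'fast': 0.1}

Note that leaving the `with` block still waits for the timed-out worker to finish, since ThreadPoolExecutor.shutdown defaults to wait=True; the timeout only bounds how long results are awaited, which is equally true of the barbante code above.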
def consolidate_user_templates(session_context, users_list=None):
    if users_list is None:
        users_list = [u for u in session_context.data_proxy.fetch_all_user_ids()]
    total_users = len(users_list)
    if total_users == 0:
        log.info("No users to perform templates consolidation on.")
        return

    log.info("Performing consolidation of templates on %d users..." % total_users)

    # shuffles the list to balance the workers
    shuffle(users_list)

    max_workers = session_context.max_workers_template_consolidation
    n_pages = min(max_workers, total_users)
    page_sizes = [total_users // n_pages] * n_pages
    for i in range(total_users % n_pages):
        page_sizes[i] += 1

    pages_processed = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__gather_user_templates), session_context,
                            page, users_list, page_sizes, session_context.flush_size / max_workers): page
            for page in range(n_pages) if page_sizes[page] > 0}
        for _ in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            log.info("Processed [%d] pages out of %d" % (pages_processed, n_pages))
def consolidate_product_templates(session_context, products_list=None, collaborative=True, tfidf=True):
    if products_list is None:
        cutoff_date = session_context.get_present_date() - \
            dt.timedelta(session_context.product_product_strengths_window)
        products_list = [p for p in session_context.data_proxy.fetch_date_filtered_products(
            reference_date=cutoff_date)]
    elif products_list == "--all":
        products_list = [p for p in session_context.data_proxy.fetch_all_product_ids()]

    total_products = len(products_list)
    if total_products == 0:
        log.info("No products to perform templates consolidation on.")
        return

    log.info("Performing consolidation of templates on %d products..." % total_products)

    allowed_templates = None
    if session_context.recommendable_product_start_date_field or \
            session_context.recommendable_product_end_date_field:
        allowed_templates = session_context.data_proxy.fetch_date_filtered_products(
            reference_date=session_context.get_present_date(),
            lte_date_field=session_context.recommendable_product_start_date_field,
            gte_date_field=session_context.recommendable_product_end_date_field)
        log.info("(%d templates are allowed, based on due dates)" % len(allowed_templates))
    else:
        log.info("(no restrictions will be applied to templates)")

    # shuffles the list to balance the workers
    shuffle(products_list)

    max_workers = session_context.max_workers_template_consolidation
    n_pages = min(max_workers, total_products)
    page_sizes = [total_products // n_pages] * n_pages
    for i in range(total_products % n_pages):
        page_sizes[i] += 1

    pages_processed = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__gather_product_templates), session_context,
                            page, products_list, page_sizes,
                            collaborative, tfidf, allowed_templates,
                            session_context.flush_size / max_workers): page
            for page in range(n_pages) if page_sizes[page] > 0}
        for _ in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            log.info("Processed [%d] pages out of %d" % (pages_processed, n_pages))
Example #4
    def gather_recommendation_scores(self, candidate_product_ids_by_algorithm, n_recommendations):
        """ See barbante.recommendation.Recommender.
        """
        log.info(barbante_logging.PERF_BEGIN)

        n_algorithms = len(self.algorithms_including_fill_in)

        for algorithm in self.algorithms_including_fill_in:
            if self.recommenders.get(algorithm) is None:
                recommender = self.session_context.get_recommender(algorithm)
                self.recommenders[algorithm] = recommender

        if self.session_context.supported_activities is not None:
            log.info("supported activities: " + ", ".join(self.session_context.supported_activities))
        else:
            log.info("NO SUPPORTED ACTIVITIES")
        log.info("5-star activities: " + ", ".join(self.session_context.activities_by_rating[5]))

        log.debug("Querying %d distinct recommenders" % n_algorithms)

        sorted_scores_by_algorithm = {}
        try:
            with concurrent.futures.ThreadPoolExecutor(max_workers=n_algorithms) as executor:
                future_to_algorithm = {
                    executor.submit(wrap(self._query_recommender), self.recommenders[algorithm],
                                    candidate_product_ids_by_algorithm, n_recommendations): algorithm
                    for algorithm in self.algorithms_including_fill_in}

                for future in concurrent.futures.as_completed(future_to_algorithm,
                                                              timeout=self.session_context.recommendation_timeout):
                    sorted_scores_by_algorithm[future_to_algorithm[future]] = future.result()

        except concurrent.futures.TimeoutError as err:
            log.error("Specialist recommender timeout error: {0}".format(str(err)))
            log.info("Specialists that returned within time limit: {0}".format(sorted_scores_by_algorithm.keys()))
            log.info("Specialists that timed out: {0}".format(
                set(self.algorithms_including_fill_in) - set(sorted_scores_by_algorithm.keys())))

        # Merges the contributions of different specialists.
        recommendations = self.merge_algorithm_contributions(sorted_scores_by_algorithm, n_recommendations)

        # Calls for the fill-in algorithm, when need be.
        self.include_fill_in_recommendations(recommendations, sorted_scores_by_algorithm, n_recommendations)

        log.info(barbante_logging.PERF_END)
        return recommendations
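Unlike Example #1, which merges partial dictionaries with `update`, this variant keeps each specialist's output separate by mapping every future back to the algorithm that submitted it. The reverse-lookup idiom, reduced to a toy (the algorithm names and the `score` worker are illustrative, not barbante's):

import concurrent.futures


def score(algorithm):
    # Stand-in for _query_recommender: pretend each specialist returns a sorted score list.
    return [(0.9, "p3")] if algorithm == "UBCF" else [(1.0, "p1"), (0.5, "p2")]


algorithms = ["UBCF", "IBCF", "POP"]
scores_by_algorithm = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=len(algorithms)) as executor:
    future_to_algorithm = {executor.submit(score, alg): alg for alg in algorithms}
    for future in concurrent.futures.as_completed(future_to_algorithm):
        # The dict built at submit time doubles as a reverse index from future to its key.
        scores_by_algorithm[future_to_algorithm[future]] = future.result()

assert set(scores_by_algorithm) == set(algorithms)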
Example #5
def generate_strengths(session_context):
    """ Computes product x product strengths (from scratch) based on the users' activities.
        It uses the context data proxy to read input data and write the strengths back to the database.

        :param session_context: The session context.
    """
    # drops the collections and recreates the necessary indexes
    session_context.data_proxy.reset_product_product_strength_auxiliary_data()

    # registers the start of the operation and the cutoff_date
    timestamp = session_context.get_present_date()
    cutoff_date = timestamp - dt.timedelta(session_context.product_product_strengths_window)
    real_time_start = time()

    users_list = [u for u in session_context.data_proxy.fetch_all_user_ids()]
    total_users = len(users_list)

    # shuffles the list to balance the workers
    shuffle(users_list)

    # auxiliary in-memory maps (probably ok, linear-size in the overall number of recommendable activities)
    template_products = set()
    users_by_base_product = {}
    users_by_base_product_size = 0  # let's monitor the number of users closely, just in case

    # We process the numerators first, in parallel threads.

    n_pages = total_users // session_context.page_size_product_product_numerators + 1
    max_workers = session_context.max_workers_product_product_strengths
    pages_processed = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(
                wrap(__compute_strength_numerators),
                session_context,
                page,
                users_list,
                session_context.flush_size / max_workers,
            ): page
            for page in range(n_pages)
        }
        for future in concurrent.futures.as_completed(future_to_page):
            users_by_base_product_partial, template_products_partial = future.result()
            template_products |= template_products_partial
            for product, other_users in users_by_base_product_partial.items():
                users = users_by_base_product.get(product, set())
                old_size_for_this_product = len(users)
                users |= other_users
                new_size_for_this_product = len(users)
                users_by_base_product[product] = users
                users_by_base_product_size += new_size_for_this_product - old_size_for_this_product
            pages_processed += 1
            log.info(
                "Processed [{0}] pages out of [{1}] during p-p strengths generation (numerators)".format(
                    pages_processed, n_pages
                )
            )
            log.info(
                "In-memory users_by_base_product map size = %d products, %d users"
                % (len(users_by_base_product), users_by_base_product_size)
            )

    log.info("All numerators saved")

    del users_list

    # Now that we know which product pairs have non-zero strengths, we can process the denominators and the strengths.

    template_products_list = list(template_products)
    total_products = len(template_products_list)
    n_pages = total_products // session_context.page_size_product_product_denominators + 1
    pages_processed = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(
                wrap(__compute_denominators_and_strengths),
                session_context,
                page,
                template_products_list,
                users_by_base_product,
                session_context.flush_size / max_workers,
            ): page
            for page in range(n_pages)
        }

        for _ in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            log.info(
                "Processed [{0}] pages out of [{1}] during p-p strengths generation (denominators)".format(
                    pages_processed, n_pages
                )
            )

    # Finalizes batch write.

    log.info("Persisting data about activities considered in this batch...")
    session_context.data_proxy.copy_all_latest_activities_for_product_product_strengths(cutoff_date)

    session_context.data_proxy.hotswap_pp_strengths()

    session_context.data_proxy.save_timestamp_product_product_strengths(
        timestamp, cutoff_date, time() - real_time_start
    )

    log.info("Product-product strengths generated successfully")
Example #6
def process_products_from_scratch(session_context, days=None):
    """ Processes product models and product terms for all non-deleted, valid products in the database.

        :param session_context: The customer context.
        :param days: The number of days to consider. Only products whose date
            attribute lies within the last `days` days are processed.
    """
    session_context.data_proxy.reset_all_product_content_data()

    # registers the start of the operation and the cutoff_date
    timestamp = session_context.get_present_date()
    if days is not None:
        cutoff_date = session_context.get_present_date() - dt.timedelta(days)
    else:
        cutoff_date = None
    real_time_start = time()

    required_fields = session_context.product_model_factory.get_custom_required_attributes()
    log.info("Loading product ids of all products with required fields: {0}".format(required_fields))
    product_ids_list = [p for p in session_context.data_proxy.fetch_all_product_ids(
        allow_deleted=False, required_fields=required_fields, min_date=cutoff_date)]
    total_products = len(product_ids_list)
    log.info("Loaded [%d] products" % total_products)

    skipped = 0

    # Auxiliary map of products by language.
    language_map = {}

    # The 1st stage of parallel processing: generates the product models and splits products by language.

    n_pages = total_products // session_context.page_size_batch_process_products + 1
    max_workers = session_context.max_workers_batch_process_products
    pages_processed = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__process_product_models), session_context, page, product_ids_list,
                            session_context.flush_size / max_workers): page
            for page in range(n_pages)}

        for future in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            result = future.result()

            products_by_language = result[0]

            skipped_products = result[1]
            skipped += skipped_products

            # Updates the map of products by language.
            for language, new_products in products_by_language.items():
                products = language_map.get(language, [])
                products += new_products
                language_map[language] = products

            if pages_processed % 100 == 0:
                log.info("Processed [{0}] pages out of [{1}] during product model creation".format(pages_processed,
                                                                                                   n_pages))

    session_context.data_proxy.hotswap_product_models()

    # With all product models duly created, we split the processing of terms by language.

    for language, language_product_ids_list in language_map.items():
        language_products_count = len(language_product_ids_list)
        log.info("Processing [%d] %s products..." % (language_products_count, language))

        # An auxiliary in-memory map to hold all DFs per language.
        df_map = {}
        # This is probably ok, since its size is linear in the overall number of product terms per language,
        # but I have added a safety pig just in case.
        #                          _
        #  _._ _..._ .-',     _.._(`))
        # '-. `     '  /-._.-'    ',/
        #    )         \            '.
        #   / _    _    |             \
        #  |  a    a    /              |
        #  \   .-.                     ;
        #   '-('' ).-'       ,'       ;
        #      '-;           |      .'
        #         \           \    /
        #         | 7  .__  _.-\   \
        #         | |  |  ``/  /`  /
        #        /,_|  |   /,_/   /
        #           /,_/      '`-'
        #
        df_map_size = 0  # to closely monitor the size of the in-memory map
        #
        # In case we have OOM issues, we can kill the pig altogether and use the database
        # to perform the aggregation of DFs. The drawback is that we'll have to do costly upserts,
        # instead of the current inserts.

        # The 2nd stage of parallel processing: saving TEXT-type attributes' terms' TFs (and aggregation of DFs).

        n_pages = language_products_count // session_context.page_size_batch_process_products + 1
        max_workers = session_context.max_workers_batch_process_products
        pages_processed = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(wrap(__process_product_terms), session_context, page, language_product_ids_list,
                                language, session_context.flush_size / max_workers): page
                for page in range(n_pages)}

            for future in concurrent.futures.as_completed(future_to_page):
                pages_processed += 1
                result = future.result()

                df_by_term = result[0]

                skipped_products = result[1]
                skipped += skipped_products

                # Performs an in-memory aggregation of DF's.

                for term, df in df_by_term.items():
                    previous_df = df_map.get(term)
                    if previous_df is None:
                        previous_df = 0
                        df_map_size += 1
                    df_map[term] = previous_df + df

                if pages_processed % 100 == 0:
                    log.info("Processed [{0}] pages out of [{1}] during TF processing".format(pages_processed, n_pages))
                    log.info("In-memory df_map size = [{0}] terms".format(df_map_size))

        # The 3rd stage of parallel processing: saving the TFIDF's of the top terms per attribute.

        n_pages = language_products_count // session_context.page_size_batch_process_products + 1
        max_workers = session_context.max_workers_batch_process_products
        pages_processed = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(wrap(__process_products_TFIDFs), session_context, page, language_product_ids_list,
                                total_products, language, df_map, session_context.flush_size / max_workers): page
                for page in range(n_pages)}

            for _ in concurrent.futures.as_completed(future_to_page):
                pages_processed += 1
                if pages_processed % 100 == 0:
                    log.info("Processed [{0}] pages out of [{1}] during TFIDF processing".format(pages_processed,
                                                                                                 n_pages))

        # Persists all DF's (aggregated in memory).
        log.info("Saving DF's (language: %s)..." % language)
        _flush_df_map(session_context, language, df_map)
        log.info("DF's saved")

    session_context.data_proxy.save_timestamp_product_models(
        timestamp, cutoff_date, time() - real_time_start)

    success = total_products - skipped
    log.info("Done: [%d] products were processed successfully; [%d] products were skipped." % (success, skipped))
    def test_recommend(self, test_recommendation_quality=True):
        """ Tests whether meaningful recommendations were obtained.
        """
        # pre-generates a session context and use it for all recommendation tests below
        session = tests.init_session(algorithm=self.algorithm)

        def generate_queries_for_category(category, user_count, product_count):
            for i in range(1, user_count + 1):
                target_user = '******'.format(category, str(i))
                result = {
                    'target_user': target_user,
                    'category': category,
                    'product_count': product_count
                }
                yield result

        def recommend(target_user):
            """ Returns recommendations for a certain user.
                :param target_user: user to recommend
                :return: list of recommendations
            """
            # updates the session's user context
            session.user_id = target_user
            session.refresh()
            recommender = session.get_recommender()
            return recommender.recommend(self.n_recommendations)

        def verify_recommendations(_query, _recommendations):
            """ Verify that the recommendation was successful.
                :param _query: query parameters
                :param _recommendations: recommendation result set
            """
            recent_activities = session.user_context.recent_activities
            products_consumed = list({act["external_product_id"] for act in recent_activities})
            n_products_consumed = len(products_consumed)

            nose.tools.ok_(len(_recommendations) > 0, "No recommendations were retrieved")
            if test_recommendation_quality:
                for j in range(min(_query['product_count'] - n_products_consumed, len(_recommendations))):
                    nose.tools.eq_(_recommendations[j][1][:6], "p_{0}_".format(_query['category']),
                                   "Questionable recommendations were obtained " +
                                   "for user %s: %s" % (_query['target_user'], _recommendations))
        queries = []
        # Economia
        queries += generate_queries_for_category('eco', dp.N_USR_ECONOMIA, dp.N_PROD_ECONOMIA)
        # Esportes
        queries += generate_queries_for_category('esp', dp.N_USR_ESPORTES, dp.N_PROD_ESPORTES)
        # Música
        queries += generate_queries_for_category('mus', dp.N_USR_MUSICA, dp.N_PROD_MUSICA)
        # Tecnologia
        queries += generate_queries_for_category('tec', dp.N_USR_TECNOLOGIA, dp.N_PROD_TECNOLOGIA)

        # We experimented with parallelizing the test recommendations, but there was no speedup because the
        # overhead outweighed the gains.
        n_workers = 1  # For some reason a thread pool with 1 worker is slightly faster than the nonconcurrent version
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
            future_to_query = {executor.submit(wrap(recommend), q['target_user']): q
                               for q in queries}

            for future in concurrent.futures.as_completed(future_to_query):
                query = future_to_query[future]
                recommendations = future.result()
                verify_recommendations(query, recommendations)