def gather_candidate_products(self, n_recommendations):
    """ See barbante.recommendation.Recommender.
    """
    log.info(barbante_logging.PERF_BEGIN)
    n_algorithms = len(self.algorithms_including_fill_in)
    candidate_products_by_algorithm = {}
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_algorithms) as executor:
            future_to_algorithm = {
                executor.submit(wrap(self._gather_candidate_products),
                                algorithm, n_recommendations): algorithm
                for algorithm in self.algorithms_including_fill_in}
            for future in concurrent.futures.as_completed(
                    future_to_algorithm, timeout=self.session_context.recommendation_timeout):
                candidate_products_by_algorithm.update(future.result())
    except concurrent.futures.TimeoutError as err:  # the public alias of _base.TimeoutError
        log.error("Specialist recommender timeout error: {0}".format(str(err)))
        log.info("Specialists that returned within the time limit: {0}".format(
            candidate_products_by_algorithm.keys()))
        log.info("Specialists that timed out: {0}".format(
            set(self.algorithms_including_fill_in) - set(candidate_products_by_algorithm.keys())))
    log.info(barbante_logging.PERF_END)
    return candidate_products_by_algorithm

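# The executor.submit(...) calls throughout these routines go through a `wrap` helper defined
# elsewhere in barbante. The sketch below is hypothetical and illustrative only -- it is NOT the
# actual implementation, and it is deliberately named differently so it cannot shadow the real
# one. A wrapper like this logs exceptions raised inside worker threads as soon as they happen,
# instead of only when future.result() is eventually called.
import functools

def _wrap_sketch(fn):
    """Illustrative stand-in for barbante's `wrap`: log-and-reraise around a worker callable."""
    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception:
            log.exception("Worker thread raised in %s" % getattr(fn, '__name__', repr(fn)))
            raise
    return wrapped
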
def consolidate_user_templates(session_context, users_list=None):
    if users_list is None:
        users_list = [u for u in session_context.data_proxy.fetch_all_user_ids()]
    total_users = len(users_list)
    if total_users == 0:
        log.info("No users to perform template consolidation on.")
        return

    log.info("Performing consolidation of templates on %d users..." % total_users)

    # shuffles the list to balance the workers
    shuffle(users_list)

    max_workers = session_context.max_workers_template_consolidation
    n_pages = min(max_workers, total_users)
    page_sizes = [total_users // n_pages] * n_pages
    for i in range(total_users % n_pages):
        page_sizes[i] += 1

    pages_processed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__gather_user_templates), session_context, page, users_list,
                            page_sizes, session_context.flush_size // max_workers): page
            for page in range(n_pages) if page_sizes[page] > 0}
        for _ in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            log.info("Processed [%d] pages out of %d" % (pages_processed, n_pages))

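# A quick illustration of the page balancing rule used above (and again in
# consolidate_product_templates below): the items are split into pages whose sizes differ by at
# most one, with the remainder spread over the first pages. This helper is illustrative only and
# is not called by the code above.
def _split_into_pages(total, n_pages):
    """E.g., _split_into_pages(10, 4) == [3, 3, 2, 2]."""
    sizes = [total // n_pages] * n_pages
    for i in range(total % n_pages):
        sizes[i] += 1
    return sizes
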
def consolidate_product_templates(session_context, products_list=None, collaborative=True, tfidf=True):
    if products_list is None:
        cutoff_date = session_context.get_present_date() - \
            dt.timedelta(session_context.product_product_strengths_window)
        products_list = [p for p in session_context.data_proxy.fetch_date_filtered_products(
            reference_date=cutoff_date)]
    elif products_list == "--all":
        products_list = [p for p in session_context.data_proxy.fetch_all_product_ids()]

    total_products = len(products_list)
    if total_products == 0:
        log.info("No products to perform template consolidation on.")
        return

    log.info("Performing consolidation of templates on %d products..." % total_products)

    allowed_templates = None
    if session_context.recommendable_product_start_date_field or \
            session_context.recommendable_product_end_date_field:
        allowed_templates = session_context.data_proxy.fetch_date_filtered_products(
            reference_date=session_context.get_present_date(),
            lte_date_field=session_context.recommendable_product_start_date_field,
            gte_date_field=session_context.recommendable_product_end_date_field)
        log.info("(%d templates are allowed, based on due dates)" % len(allowed_templates))
    else:
        log.info("(no restrictions will be applied to templates)")

    # shuffles the list to balance the workers
    shuffle(products_list)

    max_workers = session_context.max_workers_template_consolidation
    n_pages = min(max_workers, total_products)
    page_sizes = [total_products // n_pages] * n_pages
    for i in range(total_products % n_pages):
        page_sizes[i] += 1

    pages_processed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__gather_product_templates), session_context, page, products_list,
                            page_sizes, collaborative, tfidf, allowed_templates,
                            session_context.flush_size // max_workers): page
            for page in range(n_pages) if page_sizes[page] > 0}
        for _ in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            log.info("Processed [%d] pages out of %d" % (pages_processed, n_pages))

def gather_recommendation_scores(self, candidate_product_ids_by_algorithm, n_recommendations):
    """ See barbante.recommendation.Recommender.
    """
    log.info(barbante_logging.PERF_BEGIN)
    n_algorithms = len(self.algorithms_including_fill_in)

    for algorithm in self.algorithms_including_fill_in:
        if self.recommenders.get(algorithm) is None:
            recommender = self.session_context.get_recommender(algorithm)
            self.recommenders[algorithm] = recommender

    if self.session_context.supported_activities is not None:
        log.info("supported activities: " + ", ".join(self.session_context.supported_activities))
    else:
        log.info("NO SUPPORTED ACTIVITIES")
    log.info("5-star activities: " + ", ".join(self.session_context.activities_by_rating[5]))

    log.debug("Querying %d distinct recommenders" % n_algorithms)

    sorted_scores_by_algorithm = {}
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_algorithms) as executor:
            future_to_algorithm = {
                executor.submit(wrap(self._query_recommender), self.recommenders[algorithm],
                                candidate_product_ids_by_algorithm, n_recommendations): algorithm
                for algorithm in self.algorithms_including_fill_in}
            for future in concurrent.futures.as_completed(
                    future_to_algorithm, timeout=self.session_context.recommendation_timeout):
                sorted_scores_by_algorithm[future_to_algorithm[future]] = future.result()
    except concurrent.futures.TimeoutError as err:  # the public alias of _base.TimeoutError
        log.error("Specialist recommender timeout error: {0}".format(str(err)))
        log.info("Specialists that returned within the time limit: {0}".format(
            sorted_scores_by_algorithm.keys()))
        log.info("Specialists that timed out: {0}".format(
            set(self.algorithms_including_fill_in) - set(sorted_scores_by_algorithm.keys())))

    # Merges the contributions of the different specialists.
    recommendations = self.merge_algorithm_contributions(sorted_scores_by_algorithm, n_recommendations)

    # Calls the fill-in algorithm, when need be.
    self.include_fill_in_recommendations(recommendations, sorted_scores_by_algorithm, n_recommendations)

    log.info(barbante_logging.PERF_END)
    return recommendations

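# A minimal, self-contained illustration (hypothetical; not part of barbante) of the timeout
# pattern used by gather_candidate_products and gather_recommendation_scores above:
# as_completed() raises TimeoutError once the overall deadline expires, every result collected
# before the deadline is kept, and the set difference in the except branch identifies exactly
# the workers that timed out. Note that the executor's __exit__ still waits for stragglers.
def _demo_as_completed_timeout():
    import time
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        future_to_name = {executor.submit(time.sleep, 0.1): "fast",
                          executor.submit(time.sleep, 2.0): "slow"}
        finished = set()
        try:
            for future in concurrent.futures.as_completed(future_to_name, timeout=0.5):
                finished.add(future_to_name[future])
        except concurrent.futures.TimeoutError:
            pass
        return finished, set(future_to_name.values()) - finished  # ({"fast"}, {"slow"})
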
def generate_strengths(session_context):
    """ Computes product x product strengths (from scratch) based on the users' activities.
        It uses the context data proxy to read input data and write the strengths back to the database.

        :param session_context: The session context.
    """
    # drops the collections and recreates the necessary indexes
    session_context.data_proxy.reset_product_product_strength_auxiliary_data()

    # registers the start of the operation and the cutoff_date
    timestamp = session_context.get_present_date()
    cutoff_date = timestamp - dt.timedelta(session_context.product_product_strengths_window)
    real_time_start = time()

    users_list = [u for u in session_context.data_proxy.fetch_all_user_ids()]
    total_users = len(users_list)

    # shuffles the list to balance the workers
    shuffle(users_list)

    # auxiliary in-memory maps (probably ok, linear-size in the overall number of recommendable activities)
    template_products = set()
    users_by_base_product = {}
    users_by_base_product_size = 0  # let's monitor the number of users closely, just in case

    # We process the numerators first, in parallel threads.
    n_pages = total_users // session_context.page_size_product_product_numerators + 1
    max_workers = session_context.max_workers_product_product_strengths
    pages_processed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__compute_strength_numerators), session_context, page, users_list,
                            session_context.flush_size // max_workers): page
            for page in range(n_pages)}
        for future in concurrent.futures.as_completed(future_to_page):
            users_by_base_product_partial, template_products_partial = future.result()
            template_products |= template_products_partial
            for product, other_users in users_by_base_product_partial.items():
                users = users_by_base_product.get(product, set())
                old_size_for_this_product = len(users)
                users |= other_users
                new_size_for_this_product = len(users)
                users_by_base_product[product] = users
                users_by_base_product_size += new_size_for_this_product - old_size_for_this_product
            pages_processed += 1
            log.info("Processed [{0}] pages out of [{1}] during p-p strengths generation (numerators)".format(
                pages_processed, n_pages))
            log.info("In-memory users_by_base_product map size = %d products, %d users"
                     % (len(users_by_base_product), users_by_base_product_size))

    log.info("All numerators saved")

    del users_list

    # Now that we know the product pairs with non-zero strengths, we can process the denominators
    # and the strengths themselves.
    template_products_list = list(template_products)
    total_products = len(template_products_list)

    n_pages = total_products // session_context.page_size_product_product_denominators + 1
    pages_processed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__compute_denominators_and_strengths), session_context, page,
                            template_products_list, users_by_base_product,
                            session_context.flush_size // max_workers): page
            for page in range(n_pages)}
        for _ in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            log.info("Processed [{0}] pages out of [{1}] during p-p strengths generation (denominators)".format(
                pages_processed, n_pages))

    # Finalizes the batch write.
log.info("Persisting data about activities considered in this batch...") session_context.data_proxy.copy_all_latest_activities_for_product_product_strengths(cutoff_date) session_context.data_proxy.hotswap_pp_strengths() session_context.data_proxy.save_timestamp_product_product_strengths( timestamp, cutoff_date, time() - real_time_start ) log.info("Product-product strengths generated successfully")
def process_products_from_scratch(session_context, days=None):
    """ Processes product models and product terms for all non-deleted, valid products in the database.

        :param session_context: The customer context.
        :param days: The number of days to consider. Only products whose date attribute lies
            within the last `days` days will be processed; if None, no date filter is applied.
    """
    session_context.data_proxy.reset_all_product_content_data()

    # registers the start of the operation and the cutoff_date
    timestamp = session_context.get_present_date()
    if days is not None:
        cutoff_date = session_context.get_present_date() - dt.timedelta(days)
    else:
        cutoff_date = None
    real_time_start = time()

    required_fields = session_context.product_model_factory.get_custom_required_attributes()
    log.info("Loading product ids of all products with required fields: {0}".format(required_fields))
    product_ids_list = [p for p in session_context.data_proxy.fetch_all_product_ids(
        allow_deleted=False, required_fields=required_fields, min_date=cutoff_date)]
    total_products = len(product_ids_list)
    log.info("Loaded [%d] products" % total_products)

    skipped = 0

    # Auxiliary map of products by language.
    language_map = {}

    # The 1st stage of parallel processing: generates the product models and splits products by language.
    n_pages = total_products // session_context.page_size_batch_process_products + 1
    max_workers = session_context.max_workers_batch_process_products
    pages_processed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(wrap(__process_product_models), session_context, page, product_ids_list,
                            session_context.flush_size // max_workers): page
            for page in range(n_pages)}
        for future in concurrent.futures.as_completed(future_to_page):
            pages_processed += 1
            products_by_language, skipped_products = future.result()
            skipped += skipped_products
            # Updates the map of products by language.
            for language, new_products in products_by_language.items():
                products = language_map.get(language, [])
                products += new_products
                language_map[language] = products
            if pages_processed % 100 == 0:
                log.info("Processed [{0}] pages out of [{1}] during product model creation".format(
                    pages_processed, n_pages))

    session_context.data_proxy.hotswap_product_models()

    # With all product models duly created, we split the processing of terms by language.
    for language, language_product_ids_list in language_map.items():
        language_products_count = len(language_product_ids_list)
        log.info("Processing [%d] %s products..." % (language_products_count, language))

        # An auxiliary in-memory map to hold all DFs per language.
        df_map = {}
        # This is probably ok, since its size is linear in the overall number of product terms per language,
        # but I have added a safety pig just in case.
        #                              _
        #       _._ _..._ .-',     _.._(`))
        #      '-. `     '  /-._.-'    ',/
        #         )         \            '.
        #        / _    _    |             \
        #       |  a    a    /              |
        #       \   .-.                     ;
        #        '-('' ).-'       ,'       ;
        #           '-;           |      .'
        #              \           \    /
        #              | 7  .__  _.-\   \
        #              | |  |  ``/  /`  /
        #             /,_|  |   /,_/   /
        #                /,_/      '`-'
        df_map_size = 0  # to closely monitor the size of the in-memory map
        #
        # In case we have OOM issues, we can kill the pig altogether and use the database
        # to perform the aggregation of DFs. The drawback is that we'll have to do costly upserts,
        # instead of the current inserts.

        # The 2nd stage of parallel processing: saving TEXT-type attributes' terms' TFs (and aggregating DFs).
        n_pages = language_products_count // session_context.page_size_batch_process_products + 1
        max_workers = session_context.max_workers_batch_process_products
        pages_processed = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(wrap(__process_product_terms), session_context, page,
                                language_product_ids_list, language,
                                session_context.flush_size // max_workers): page
                for page in range(n_pages)}
            for future in concurrent.futures.as_completed(future_to_page):
                pages_processed += 1
                df_by_term, skipped_products = future.result()
                skipped += skipped_products
                # Performs an in-memory aggregation of DFs.
                for term, df in df_by_term.items():
                    previous_df = df_map.get(term)
                    if previous_df is None:
                        previous_df = 0
                        df_map_size += 1
                    df_map[term] = previous_df + df
                if pages_processed % 100 == 0:
                    log.info("Processed [{0}] pages out of [{1}] during TF processing".format(
                        pages_processed, n_pages))
                    log.info("In-memory df_map size = [{0}] terms".format(df_map_size))

        # The 3rd stage of parallel processing: saving the TFIDFs of the top terms per attribute.
        n_pages = language_products_count // session_context.page_size_batch_process_products + 1
        max_workers = session_context.max_workers_batch_process_products
        pages_processed = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(wrap(__process_products_TFIDFs), session_context, page,
                                language_product_ids_list, total_products, language, df_map,
                                session_context.flush_size // max_workers): page
                for page in range(n_pages)}
            for _ in concurrent.futures.as_completed(future_to_page):
                pages_processed += 1
                if pages_processed % 100 == 0:
                    log.info("Processed [{0}] pages out of [{1}] during TFIDF processing".format(
                        pages_processed, n_pages))

        # Persists all DFs (aggregated in memory).
        log.info("Saving DFs (language: %s)..." % language)
        _flush_df_map(session_context, language, df_map)
        log.info("DFs saved")

    session_context.data_proxy.save_timestamp_product_models(
        timestamp, cutoff_date, time() - real_time_start)

    success = total_products - skipped
    log.info("Done: [%d] products were processed successfully; [%d] products were skipped." % (success, skipped))

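# For reference, a classic TF-IDF weighting of the kind the 3rd stage presumably materializes.
# The exact variant is inside __process_products_TFIDFs (not shown here), so this formulation is
# an assumption, included only to make the df_map bookkeeping above concrete:
#
#     tfidf(term, product) = tf(term, product) * log(N / df(term))
#
# where tf is the term's frequency in the product's text, df is the number of products (in the
# same language) containing the term, and N is the total number of products.
def _tfidf_sketch(tf, df, n_products):
    """Illustrative only; returns 0.0 for unseen terms to avoid a division by zero."""
    import math
    return tf * math.log(n_products / df) if df else 0.0
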
def test_recommend(self, test_recommendation_quality=True):
    """ Tests whether meaningful recommendations were obtained.
    """
    # pre-generates a session context and uses it for all recommendation tests below
    session = tests.init_session(algorithm=self.algorithm)

    def generate_queries_for_category(category, user_count, product_count):
        for i in range(1, user_count + 1):
            target_user = '******'.format(category, str(i))
            result = {'target_user': target_user,
                      'category': category,
                      'product_count': product_count}
            yield result

    def recommend(target_user):
        """ Returns recommendations for a certain user.

            :param target_user: the user to recommend to
            :return: the list of recommendations
        """
        # updates the session's user context
        session.user_id = target_user
        session.refresh()
        recommender = session.get_recommender()
        return recommender.recommend(self.n_recommendations)

    def verify_recommendations(_query, _recommendations):
        """ Verifies that the recommendation was successful.

            :param _query: the query parameters
            :param _recommendations: the recommendation result set
        """
        recent_activities = session.user_context.recent_activities
        products_consumed = list({act["external_product_id"] for act in recent_activities})
        n_products_consumed = len(products_consumed)

        nose.tools.ok_(len(_recommendations) > 0, "No recommendations were retrieved")

        if test_recommendation_quality:
            for j in range(min(_query['product_count'] - n_products_consumed, len(_recommendations))):
                nose.tools.eq_(_recommendations[j][1][:6], "p_{0}_".format(_query['category']),
                               "Questionable recommendations were obtained "
                               "for user %s: %s" % (_query['target_user'], _recommendations))

    queries = []
    # Economia
    queries += generate_queries_for_category('eco', dp.N_USR_ECONOMIA, dp.N_PROD_ECONOMIA)
    # Esportes
    queries += generate_queries_for_category('esp', dp.N_USR_ESPORTES, dp.N_PROD_ESPORTES)
    # Música
    queries += generate_queries_for_category('mus', dp.N_USR_MUSICA, dp.N_PROD_MUSICA)
    # Tecnologia
    queries += generate_queries_for_category('tec', dp.N_USR_TECNOLOGIA, dp.N_PROD_TECNOLOGIA)

    # We experimented with parallelizing the test recommendations, but there was no speedup
    # because the overhead is too high.
    n_workers = 1  # for some reason, a thread pool with 1 worker is slightly faster than the nonconcurrent version
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        future_to_query = {executor.submit(wrap(recommend), q['target_user']): q for q in queries}
        for future in concurrent.futures.as_completed(future_to_query):
            query = future_to_query[future]
            recommendations = future.result()
            verify_recommendations(query, recommendations)