def process_product(session_context, product_id, product=None, force_update=False):
    """ Processes a single product: loads it (if not given), builds/refreshes its
        product model, stems any text attributes missing from the model, and updates
        the tf-idf templates for that product.

        :param session_context: The customer's session context.
        :param product_id: The id of the product to be processed.
        :param product: The raw product, if already loaded; when None, it is fetched
            from the db (a ValueError is raised if it does not exist).
        :param force_update: If True, reprocesses the product even when a product
            model already exists.
    """
    log.info("Processing product [%s]" % product_id)
    start_time = time()

    if product is None:
        fetched = session_context.data_proxy.fetch_products([product_id])
        product = fetched.get(product_id)
        if product is None:
            raise ValueError("No product exists in the db with id [%s]" % product_id)
        log.info("Product [{0}] loaded".format(product_id))

    product_model, had_model = prd.prepare_product_model(
        session_context, product, force_update=force_update)

    if product_model is None:
        log.error("Error while processing product [%s]: product model was not generated" % product_id)
    else:
        language = product_model.get_attribute("language")
        flat_product = None
        flat_model = None
        if force_update or not had_model:
            for field in session_context.product_text_fields:
                if flat_model is None:
                    # lazily flattens the product model
                    flat_model = utils.flatten_dict(product_model.to_dict())
                if field in flat_model:
                    continue
                if flat_product is None:
                    # lazily flattens the product
                    flat_product = utils.flatten_dict(product)
                raw_value = flat_product.get(field)
                if raw_value is not None:
                    flat_model[field] = text.parse_text_to_stems(language, raw_value)
            _, _, tfidf_by_top_term_by_attribute = prd.prepare_product_terms(
                session_context, flat_model, reprocessing_product=had_model)
            pt_tfidf.update_templates(session_context, product_id, language,
                                      tfidf_by_top_term_by_attribute)

    log.info("---Done processing product [%s] (took %.6f seconds)" % (product_id, time() - start_time))
    session_context.clear_context_filters_cache()
def from_dict(product_id, product_model_dict, validator):
    """ Converts a product model in the form of a dict into an instance of ProductModel.
        It differs from the constructor in that from_dict() expects a product model,
        whereas the constructor expects a raw product.

        :param product_id: The id of the intended product.
        :param product_model_dict: A dict of attributes (possibly nested; it is
            flattened here before being handed to ProductModel).
        :param validator: an instance of a ProductModelFactory.

        :returns: a ProductModel instance.
    """
    # (A dead "product_id = product_id" self-assignment was removed here.)
    product_model_values = utils.flatten_dict(product_model_dict)
    return ProductModel(validator, product_id, product_model_values)
def __process_product_terms(session_context, page, products_list, language, flush_size):
    """ Processes the terms of one page of products: stems any non-persisted text
        attributes, computes tf records per product, accumulates document frequencies
        per term, and flushes tf records to the db in batches of *flush_size*.

        :param session_context: The customer's session context (a fresh session is opened).
        :param page: The 0-based page index into *products_list*.
        :param products_list: The full list of product ids being batch-processed.
        :param language: The language used for stemming.
        :param flush_size: Number of accumulated tf records that triggers a db flush.

        :returns: a (df_by_term, skipped) tuple, where df_by_term maps each term to its
            document frequency within this page, and skipped is the number of products
            that could not be processed.
    """
    session_context = session_context.new_session()

    page_size = session_context.page_size_batch_process_products
    start_idx = page * page_size
    end_idx = min((page + 1) * page_size, len(products_list))
    page_product_ids = products_list[start_idx:end_idx]
    total_products = len(page_product_ids)

    tf_records = []
    df_by_term = {}

    product_models_map = session_context.data_proxy.fetch_product_models(page_product_ids)
    product_dicts_map = {p_id: utils.flatten_dict(p_model.to_dict())
                         for p_id, p_model in product_models_map.items()}
    # Products without a persisted model are skipped.
    # NOTE(review): ids missing a model may be counted again in the attributes_ok
    # branch below, double-counting them in *skipped* — confirm intended semantics.
    skipped = total_products - len(product_dicts_map)

    non_persisted_text_fields = set(session_context.product_text_fields) - \
        session_context.product_model_factory.persisted_attributes

    if len(non_persisted_text_fields) > 0:
        # Fetches the non-persisted text attributes from the raw products collection
        # and stemmizes them.
        products_map = session_context.data_proxy.fetch_products(
            product_ids=page_product_ids,
            fields_to_project=list(non_persisted_text_fields))
        for p_id, product in products_map.items():
            attributes_ok = True
            if p_id not in product_dicts_map:
                attributes_ok = False
            if attributes_ok:
                product = utils.flatten_dict(product)
                stemmed_attributes_map = {}
                for attribute in non_persisted_text_fields:
                    value = product.get(attribute)
                    if value is not None:
                        try:
                            stemmed_attributes_map[attribute] = text.parse_text_to_stems(language, value)
                        except Exception as err:
                            log.error('Exception: {0}'.format(str(err)))
                            log.error('Offending value: {0}'.format(value))
                            attributes_ok = False
                            continue  # keeps trying the remaining attributes (for logging)
            if attributes_ok:
                product_dicts_map[p_id].update(stemmed_attributes_map)
            else:
                skipped += 1
                if p_id in product_dicts_map:
                    product_dicts_map.pop(p_id)

    for product_dict in product_dicts_map.values():
        product_terms_results = prepare_product_terms(session_context, product_dict,
                                                      batch_processing=True)
        if product_terms_results is None:
            skipped += 1
            continue
        new_tf_records, new_terms, _ = product_terms_results
        tf_records += new_tf_records
        for term in new_terms:
            df_by_term[term] = df_by_term.get(term, 0) + 1
        if len(tf_records) >= flush_size:
            _flush_tf_records(session_context, tf_records)
            # BUG FIX: reset the buffer after flushing; previously the same records
            # were re-flushed on every subsequent flush (and again at the end),
            # persisting duplicates and growing the list without bound.
            tf_records = []

    if len(tf_records) > 0:
        _flush_tf_records(session_context, tf_records)

    return df_by_term, skipped