def test_conversion_from_dict():
    """ Checks that a ProductModel created from a nested dict returns, for each dotted attribute name,
        the value found at the matching nested position of the dict.
    """
    definition = {
        'language': {'type': 'fixed', 'default': 'english'},
        'a': {'type': 'fixed', 'persisted': True},
        'b.c': {'type': 'fixed', 'persisted': True},
        'b.d.e': {'type': 'text', 'persisted': True},
        'b.d.f': {'type': 'numeric', 'persisted': True},
    }
    factory = ProductModelFactory(definition)

    # The text attribute is supplied already stemmed.
    stems = text.parse_text_to_stems('english', 'a value that should be stemmed')
    innermost = {'e': stems, 'f': 54321}
    source = {'a': 'test', 'b': {'c': 'foo', 'd': innermost}}

    product = pm.ProductModel.from_dict('test_product', source, factory)

    failure_message = 'Attribute does not match'
    nose.tools.eq_(product.get_attribute('a'), source['a'], failure_message)
    nose.tools.eq_(product.get_attribute('b.c'), source['b']['c'], failure_message)
    nose.tools.assert_list_equal(product.get_attribute('b.d.e'), innermost['e'], failure_message)
    nose.tools.eq_(product.get_attribute('b.d.f'), innermost['f'], failure_message)
def test_conversion_to_dict():
    """ Checks that a ProductModel built by the factory converts into a nested dict holding the original
        values, with the text attribute replaced by its stems.
    """
    definition = {
        'language': {'type': 'fixed', 'default': 'english'},
        'a': {'type': 'fixed', 'persisted': True},
        'b.c': {'type': 'fixed', 'persisted': True},
        'b.d.e': {'type': 'text', 'persisted': True},
        'b.d.f': {'type': 'numeric', 'persisted': True},
    }
    factory = ProductModelFactory(definition)

    innermost = {'e': 'some nested stuff', 'f': 12345}
    source = {'a': 'foo', 'b': {'c': 'bar', 'd': innermost}}
    expected_stems = text.parse_text_to_stems('english', innermost['e'])

    converted = factory.build('test_product', source).to_dict()

    failure_message = 'Attribute does not match'
    nose.tools.eq_(converted['a'], source['a'], failure_message)
    nose.tools.eq_(converted['b']['c'], source['b']['c'], failure_message)
    nose.tools.assert_list_equal(converted['b']['d']['e'], expected_stems, failure_message)
    nose.tools.eq_(converted['b']['d']['f'], innermost['f'], failure_message)
def process_product(session_context, product_id, product=None, force_update=False):
    """ Processes a single product: prepares its product model and, when the model is new or an update
        is forced, recomputes the product terms and updates the tf-idf templates.

        :param session_context: The session context. Supplies the data proxy, the list of product text
            fields and the context filters cache that is cleared at the end.
        :param product_id: The id of the product to be processed.
        :param product: The raw product. If None, it is fetched from the data proxy by its id.
        :param force_update: If True, the terms are processed even if a product model already existed.
        :raises ValueError: If no product was supplied and none was found for product_id.
    """
    log.info("Processing product [%s]" % product_id)
    start = time()

    if product is None:
        product = session_context.data_proxy.fetch_products([product_id]).get(product_id)
        if product is None:
            raise ValueError("No product exists in the db with id [%s]" % product_id)

    log.info("Product [{0}] loaded".format(product_id))

    product_model, has_pre_existing_product_model = prd.prepare_product_model(
        session_context, product, force_update=force_update)

    if product_model is None:
        log.error("Error while processing product [%s]: product model was not generated" % product_id)
    else:
        language = product_model.get_attribute("language")

        if not has_pre_existing_product_model or force_update:
            # Flattened up front (rather than inside the loop) so that prepare_product_terms is always
            # handed a dict, even when the session defines no text fields at all.
            product_model_as_dict = utils.flatten_dict(product_model.to_dict())
            product_as_dict = None

            for attribute in session_context.product_text_fields:
                if attribute in product_model_as_dict:
                    continue
                # The attribute is missing from the model: take it from the raw product and stem it.
                if product_as_dict is None:
                    product_as_dict = utils.flatten_dict(product)  # lazily flattens the product
                value = product_as_dict.get(attribute)
                if value is not None:
                    product_model_as_dict[attribute] = text.parse_text_to_stems(language, value)

            product_terms_results = prd.prepare_product_terms(
                session_context, product_model_as_dict, reprocessing_product=has_pre_existing_product_model)

            # NOTE(review): prepare_product_terms presumably may return None instead of a 3-tuple when
            # it yields no terms; unpacking it blindly would raise a TypeError. Confirm against prd.
            if product_terms_results is None:
                log.error("Error while processing product [%s]: product terms were not generated" % product_id)
            else:
                _, _, tfidf_by_top_term_by_attribute = product_terms_results
                pt_tfidf.update_templates(session_context, product_id, language, tfidf_by_top_term_by_attribute)

    log.info("---Done processing product [%s] (took %.6f seconds)" % (product_id, time() - start))

    session_context.clear_context_filters_cache()
def build(self, product_id, product_fields):
    """ Creates a ProductModel from an {attribute: value} dict, validating every attribute of the model.

        Date values that are not datetime instances are parsed with dateutil. Attributes whose type is
        not fixed, list, numeric or date are stemmed with the product's language. Attributes whose
        value (after applying the default) is None are left out of the model.

        :param product_id: The id of the product the ProductModel will represent.
        :param product_fields: A dict with the product's contents.
        :return: The ProductModel holding the validated values.
        :raises ValueError: If product_id is None.
        :raises AttributeError: If a stemmed (text) attribute has a value but no language is defined.

        Any exception raised while handling an attribute is logged along with the offending product
        and then re-raised.
    """
    if product_id is None:
        raise ValueError("The Product ID is a required attribute")

    collected = {self._ID_ATTRIBUTE: product_id}

    for name, properties in self.model_attributes.items():
        fallback = self.default_values.get(name)
        value = self._get_product_attribute(product_fields, name, fallback)

        try:
            self.validate_required_field(name, value)
            if value is None:
                continue

            kind = properties['type']
            if kind in (pm.FIXED, pm.LIST, pm.NUMERIC):
                collected[name] = value
            elif kind == pm.DATE:
                collected[name] = value if isinstance(value, dt.datetime) else dateutil.parser.parse(value)
            else:
                language = self.get_language(product_fields)
                if language is None:
                    raise AttributeError("Text attributes are only supported when a language is defined")
                collected[name] = text.parse_text_to_stems(language, value)
        except Exception as err:
            log.error('Exception: {0}'.format(str(err)))
            log.error('Offending product: {0}'.format(product_fields))
            raise err

    return ProductModel(self, product_id, collected)
def __process_product_terms(session_context, page, products_list, language, flush_size):
    """ Processes the terms of one page of products and flushes the resulting tf records.

        :param session_context: The session context; the work is done on a new session spawned from it.
        :param page: The zero-based index of the page to be processed. The page size is taken from
            session_context.page_size_batch_process_products.
        :param products_list: The list of product ids being paginated.
        :param language: The language used to stem the non-persisted text attributes.
        :param flush_size: The number of accumulated tf records that triggers a flush.
        :return: A tuple (df_by_term, skipped). df_by_term counts, per term, how many times the term was
            reported by prepare_product_terms over the products of the page; skipped is the number of
            products of the page that were not processed.
    """
    session_context = session_context.new_session()

    page_size = session_context.page_size_batch_process_products
    start_idx = page * page_size
    end_idx = min((page + 1) * page_size, len(products_list))
    page_product_ids = products_list[start_idx:end_idx]

    tf_records = []
    df_by_term = {}

    product_models_map = session_context.data_proxy.fetch_product_models(page_product_ids)
    product_dicts_map = {p_id: utils.flatten_dict(p_model.to_dict())
                         for p_id, p_model in product_models_map.items()}

    # Every product of the page for which no product model was fetched counts as skipped.
    skipped = len(page_product_ids) - len(product_dicts_map)

    non_persisted_text_fields = set(session_context.product_text_fields) - \
        session_context.product_model_factory.persisted_attributes

    if non_persisted_text_fields:
        # Fetches the non-persisted text attributes from the raw products collection and stemmizes them.
        products_map = session_context.data_proxy.fetch_products(
            product_ids=page_product_ids, fields_to_project=list(non_persisted_text_fields))

        for p_id, product in products_map.items():
            if p_id not in product_dicts_map:
                # No product model for this product: it is already included in the initial skipped
                # count, so it must not be counted a second time.
                continue

            flat_product = utils.flatten_dict(product)
            stemmed_attributes_map = {}
            attributes_ok = True

            for attribute in non_persisted_text_fields:
                value = flat_product.get(attribute)
                if value is None:
                    continue
                try:
                    stemmed_attributes_map[attribute] = text.parse_text_to_stems(language, value)
                except Exception as err:
                    # Keeps going so that every offending value of the product gets logged.
                    log.error('Exception: {0}'.format(str(err)))
                    log.error('Offending value: {0}'.format(value))
                    attributes_ok = False

            if attributes_ok:
                product_dicts_map[p_id].update(stemmed_attributes_map)
            else:
                # A product with any attribute that failed to stem is dropped from the page.
                skipped += 1
                del product_dicts_map[p_id]

    for product_dict in product_dicts_map.values():
        product_terms_results = prepare_product_terms(session_context, product_dict, batch_processing=True)
        if product_terms_results is None:
            skipped += 1
            continue

        new_tf_records, new_terms, _ = product_terms_results
        tf_records += new_tf_records
        for term in new_terms:
            df_by_term[term] = df_by_term.get(term, 0) + 1

        # NOTE(review): presumably _flush_tf_records empties tf_records in place; otherwise records
        # would be flushed again on later iterations -- confirm against its definition.
        if len(tf_records) >= flush_size:
            _flush_tf_records(session_context, tf_records)

    if len(tf_records) > 0:
        _flush_tf_records(session_context, tf_records)

    return df_by_term, skipped