Example #1
0
def test_conversion_from_dict():
    """ Checks that ProductModel.from_dict rebuilds a model from a nested dict.

        Each attribute, read back through get_attribute with its dotted name, must equal
        the value found at the corresponding position of the source dict.
    """
    product_factory = ProductModelFactory({
        'language': {'type': 'fixed', 'default': 'english'},
        'a': {'type': 'fixed', 'persisted': True},
        'b.c': {'type': 'fixed', 'persisted': True},
        'b.d.e': {'type': 'text', 'persisted': True},
        'b.d.f': {'type': 'numeric', 'persisted': True}
    })

    # The text attribute is handed over as a list of stems, not as raw text.
    stems = text.parse_text_to_stems('english', 'a value that should be stemmed')
    source = {'a': 'test', 'b': {'c': 'foo', 'd': {'e': stems, 'f': 54321}}}

    product = pm.ProductModel.from_dict('test_product', source, product_factory)

    failure_message = 'Attribute does not match'
    source_b = source['b']
    source_d = source_b['d']
    nose.tools.eq_(product.get_attribute('a'), source['a'], failure_message)
    nose.tools.eq_(product.get_attribute('b.c'), source_b['c'], failure_message)
    nose.tools.assert_list_equal(product.get_attribute('b.d.e'), source_d['e'], failure_message)
    nose.tools.eq_(product.get_attribute('b.d.f'), source_d['f'], failure_message)
Example #2
0
def test_conversion_to_dict():
    """ Checks that a model built by the factory is exported by to_dict as a nested dict.

        Fixed and numeric attributes must come back unchanged; the text attribute must come
        back as the stems of the raw text.
    """
    product_factory = ProductModelFactory({
        'language': {'type': 'fixed', 'default': 'english'},
        'a': {'type': 'fixed', 'persisted': True},
        'b.c': {'type': 'fixed', 'persisted': True},
        'b.d.e': {'type': 'text', 'persisted': True},
        'b.d.f': {'type': 'numeric', 'persisted': True}
    })

    raw_product = {'a': 'foo', 'b': {'c': 'bar', 'd': {'e': 'some nested stuff', 'f': 12345}}}
    raw_b = raw_product['b']
    raw_d = raw_b['d']

    # Expected value of the text attribute after the factory has stemmed it.
    expected_stems = text.parse_text_to_stems('english', raw_d['e'])

    exported = product_factory.build('test_product', raw_product).to_dict()

    failure_message = 'Attribute does not match'
    nose.tools.eq_(exported['a'], raw_product['a'], failure_message)
    nose.tools.eq_(exported['b']['c'], raw_b['c'], failure_message)
    nose.tools.assert_list_equal(exported['b']['d']['e'], expected_stems, failure_message)
    nose.tools.eq_(exported['b']['d']['f'], raw_d['f'], failure_message)
Example #3
0
def process_product(session_context, product_id, product=None, force_update=False):
    """ Prepares the product model of a single product and, when needed, updates its tf-idf templates.

        The terms/templates step runs only when a product model was generated and either no
        product model existed beforehand or force_update is set. The context filters cache of
        the session is cleared at the end.

        :param session_context: The session context; provides the data proxy, the list of product
            text fields and the context filters cache.
        :param product_id: The id of the product to be processed.
        :param product: The raw product. When None, it is fetched through the data proxy.
        :param force_update: Forwarded to the product model preparation; when True, terms and
            templates are recomputed even if a product model already existed.

        :raises ValueError: If no product was given and none exists in the db with the given id.
    """
    log.info("Processing product [%s]" % product_id)
    start = time()

    if product is None:
        product = session_context.data_proxy.fetch_products([product_id]).get(product_id)
        if product is None:
            raise ValueError("No product exists in the db with id [%s]" % product_id)

    log.info("Product [{0}] loaded".format(product_id))

    product_model, has_pre_existing_product_model = prd.prepare_product_model(
        session_context, product, force_update=force_update)

    if product_model is None:
        # Without a model there is nothing to extract terms from: report it and skip the
        # terms/templates step instead of failing later on the missing model.
        log.error("Error while processing product [%s]: product model was not generated" % product_id)

    elif not has_pre_existing_product_model or force_update:
        language = product_model.get_attribute("language")

        # Flattened up-front so that prepare_product_terms always receives a dict, even when
        # the session defines no text fields.
        product_model_as_dict = utils.flatten_dict(product_model.to_dict())
        product_as_dict = None  # the raw product is flattened lazily, only if it is needed

        # Text fields absent from the model are stemmed from the raw product.
        for attribute in session_context.product_text_fields:
            if attribute in product_model_as_dict:
                continue
            if product_as_dict is None:
                product_as_dict = utils.flatten_dict(product)
            value = product_as_dict.get(attribute)
            if value is not None:
                product_model_as_dict[attribute] = text.parse_text_to_stems(language, value)

        _, _, tfidf_by_top_term_by_attribute = prd.prepare_product_terms(
            session_context, product_model_as_dict, reprocessing_product=has_pre_existing_product_model)

        pt_tfidf.update_templates(session_context, product_id, language, tfidf_by_top_term_by_attribute)

    log.info("---Done processing product [%s] (took %.6f seconds)" % (product_id, time() - start))

    session_context.clear_context_filters_cache()
    def build(self, product_id, product_fields):
        """ Validates and creates an instance of ProductModel from an {attribute: value} dict.

            Fixed, list and numeric attributes are stored as given. Date attributes that are not
            already datetime objects are parsed with dateutil. Attributes of any other type are
            handled as text and stored as stems in the product's language. Attributes whose value
            (after applying the default) is None are left out of the model.

            :param product_id: The id of the product to be encapsulated in a ProductModel object.
            :param product_fields: A dict with the product's contents.

            :return: A validated ProductModel object. Or raises exception if product is invalid.

            :raises ValueError: If product_id is None.
            :raises AttributeError: If a text attribute has a value but no language is defined.
            :raises Exception: Any error raised while validating or converting an attribute is
                logged together with the offending product and re-raised unchanged.
        """
        if product_id is None:
            raise ValueError("The Product ID is a required attribute")

        model_values = {self._ID_ATTRIBUTE: product_id}
        verbatim_types = (pm.FIXED, pm.LIST, pm.NUMERIC)

        for attribute_name, attribute_properties in self.model_attributes.items():
            default = self.default_values.get(attribute_name)
            value = self._get_product_attribute(product_fields, attribute_name, default)

            try:
                self.validate_required_field(attribute_name, value)
                if value is None:
                    continue

                attribute_type = attribute_properties['type']
                if attribute_type in verbatim_types:
                    model_values[attribute_name] = value
                elif attribute_type == pm.DATE:
                    if not isinstance(value, dt.datetime):
                        value = dateutil.parser.parse(value)
                    model_values[attribute_name] = value
                else:
                    language = self.get_language(product_fields)
                    if language is None:
                        raise AttributeError("Text attributes are only supported when a language is defined")
                    model_values[attribute_name] = text.parse_text_to_stems(language, value)

            except Exception as err:
                log.error('Exception: {0}'.format(str(err)))
                log.error('Offending product: {0}'.format(product_fields))
                # Bare raise re-raises the active exception with its original traceback.
                raise

        return ProductModel(self, product_id, model_values)
Example #5
0
def __process_product_terms(session_context, page, products_list, language, flush_size):
    """ Processes the terms of one page of products and accumulates their document frequencies.

        The product models of the page are loaded and flattened. Text fields that are not
        persisted in the product models are fetched from the raw products and stemmed. Then the
        terms of each product are prepared and the resulting tf records are flushed in batches.

        :param session_context: The session context; a new session is created from it and used
            for all the work done here.
        :param page: The index of the page to process; page 0 starts at the first product.
        :param products_list: The list of product ids being processed by the batch.
        :param language: The language passed to the stemmer for the non-persisted text fields.
        :param flush_size: The number of accumulated tf records that triggers a flush.

        :return: A tuple (df_by_term, skipped), where df_by_term is a dict {term: count} with one
            count per occurrence of the term in the terms reported for a product, and skipped is
            the number of products of the page that could not be processed.
    """
    session_context = session_context.new_session()
    page_size = session_context.page_size_batch_process_products
    start_idx = page * page_size
    end_idx = min((page + 1) * page_size, len(products_list))

    page_product_ids = products_list[start_idx:end_idx]
    total_products = len(page_product_ids)

    tf_records = []
    df_by_term = {}

    product_models_map = session_context.data_proxy.fetch_product_models(page_product_ids)
    product_dicts_map = {p_id: utils.flatten_dict(p_model.to_dict()) for p_id, p_model in product_models_map.items()}

    # Products of the page that have no product model are counted as skipped here, once.
    skipped = total_products - len(product_dicts_map)

    non_persisted_text_fields = set(session_context.product_text_fields) - \
        session_context.product_model_factory.persisted_attributes
    if non_persisted_text_fields:
        # Fetches the non-persisted text attributes from the raw products collection and stemmizes them.
        products_map = session_context.data_proxy.fetch_products(product_ids=page_product_ids,
                                                                 fields_to_project=list(non_persisted_text_fields))
        for p_id, product in products_map.items():
            if p_id not in product_dicts_map:
                # No product model for this product: it is already included in the initial
                # value of `skipped`, so it must not be counted a second time.
                continue

            product = utils.flatten_dict(product)
            stemmed_attributes_map = {}
            attributes_ok = True
            for attribute in non_persisted_text_fields:
                value = product.get(attribute)
                if value is None:
                    continue
                try:
                    stemmed_attributes_map[attribute] = text.parse_text_to_stems(language, value)
                except Exception as err:
                    # Keeps going so that every offending value of the product gets logged.
                    log.error('Exception: {0}'.format(str(err)))
                    log.error('Offending value: {0}'.format(value))
                    attributes_ok = False

            if attributes_ok:
                product_dicts_map[p_id].update(stemmed_attributes_map)
            else:
                # A product with a text attribute that cannot be stemmed is discarded.
                skipped += 1
                product_dicts_map.pop(p_id)

    for product_dict in product_dicts_map.values():

        product_terms_results = prepare_product_terms(session_context, product_dict, batch_processing=True)
        if product_terms_results is None:
            skipped += 1
            continue
        new_tf_records, new_terms, _ = product_terms_results

        # In-place extension: `tf_records` remains the same list object across iterations.
        tf_records += new_tf_records

        for term in new_terms:
            df_by_term[term] = df_by_term.get(term, 0) + 1

        # NOTE(review): presumably _flush_tf_records empties `tf_records` in place; otherwise the
        # same records would be flushed again on later iterations - confirm.
        if len(tf_records) >= flush_size:
            _flush_tf_records(session_context, tf_records)

    if tf_records:
        _flush_tf_records(session_context, tf_records)

    return df_by_term, skipped