Exemple #1
0
def _analyze_orders_similarity(args):
    """Compare one reference order against a window of stored orders.

    ``args`` is unpacked into ``(order, data_set, min_similarity, offset,
    limit)``. The single packed argument presumably exists so the function
    can be dispatched through a map-style worker API -- confirm against the
    caller.

    Orders are read from the repository starting at ``offset`` (at most
    ``limit - 1`` of them) and consumed in batches of 1000. Every candidate
    whose ``_id`` differs from the reference order's is scored with
    ``calculate_products_similarity``; when the score reaches
    ``min_similarity`` a similarity record is stored through
    ``repository.add_orders_similarity``. Progress and ETA are logged once
    per batch. Nothing is returned.
    """
    order, data_set, min_similarity, offset, limit = args
    logger.info("{} {} {}".format(order['_id'], offset, limit))

    repository = Repository(data_set=data_set)
    window = limit - 1
    progress = Progress(window)

    candidates = repository.get_orders(offset=offset, limit=window)
    for chunk in batch(candidates, 1000):
        for candidate in chunk:
            # The reference order may appear in its own window; skip it
            # without counting it as progress.
            if candidate['_id'] == order['_id']:
                continue

            progress.advance()
            score, shared, extra1, extra2 = calculate_products_similarity(
                order['products'], candidate['products'])
            if score >= min_similarity:
                repository.add_orders_similarity({
                    'order1_id': order['order_id'],
                    'user1_id': order['user_id'],
                    'order2_id': candidate['order_id'],
                    'user2_id': candidate['user_id'],
                    'similarity': score,
                    'common_products': shared,
                    'add_products1': extra1,
                    'add_products2': extra2,
                })

        # One progress line per processed batch.
        logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                            progress.get_estimated_time()))
Exemple #2
0
def analyze_orders_similarity(data_set, samples, similarity_threshold=0.2,
                              batch_size=100):
    """Store, for each sampled order, its most similar later order.

    The first ``samples - 1`` orders returned by the repository are each
    compared against the orders that follow them inside the ``samples``
    window (order ``i`` is compared with orders ``i + 1 .. samples - 1``),
    so every unordered pair is scored once. This assumes
    ``repository.get_orders`` yields orders in a stable sequence across
    calls -- TODO confirm.

    For every order, only the best match whose similarity is strictly
    greater than ``similarity_threshold`` is saved through
    ``repository.add_orders_similarity``; orders with no such match store
    nothing. Progress and ETA are logged after each outer order.

    Args:
        data_set: Passed through to ``Repository(data_set=...)``.
        samples: Number of orders taking part in the pairwise comparison.
        similarity_threshold: Exclusive lower bound a similarity must exceed
            to be recorded. Defaults to ``0.2``.
        batch_size: Chunk size used when iterating repository results.
            Defaults to ``100``.

    Returns:
        None.
    """
    if samples < 2:
        # Fewer than two orders means there is no pair to compare; also
        # avoids asking the repository for a zero or negative limit.
        return

    repository = Repository(data_set=data_set)
    # Total number of unordered pairs among ``samples`` orders.
    progress = Progress(math.ceil(((samples - 1) * samples) / 2))
    # Index of the first order that follows the current ``o1``.
    offset = 1
    for orders1 in batch(repository.get_orders(limit=samples - 1),
                         batch_size):
        for o1 in orders1:
            # Best score seen so far for ``o1``; starting at the threshold
            # means weaker matches are never recorded.
            max_similarity = similarity_threshold
            similar = None
            for orders2 in batch(
                    repository.get_orders(offset=offset,
                                          limit=samples - offset),
                    batch_size):
                for o2 in orders2:
                    progress.advance()
                    similarity, common, additional1, additional2 = calculate_products_similarity(
                        o1['products'], o2['products'])
                    if similarity > max_similarity:
                        max_similarity = similarity
                        similar = dict(order1_id=o1['order_id'],
                                       user1_id=o1['user_id'],
                                       order2_id=o2['order_id'],
                                       user2_id=o2['user_id'],
                                       similarity=similarity,
                                       common_products=common,
                                       add_products1=additional1,
                                       add_products2=additional2)
                        logger.info("Similarity {} {} {}".format(
                            similar['user1_id'], similar['user2_id'],
                            similarity))

            if similar is not None:
                repository.add_orders_similarity(similar)

            offset += 1
            logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                                progress.get_estimated_time()))