Example #1
0
def _analyze_users_similarity(args):
    """Worker: compare one user's products against a window of other users.

    *args* is a single tuple ``(user, data_set, min_similarity, offset,
    limit)`` (packed so it can be dispatched through a process pool).  Every
    pairing that scores at or above *min_similarity* is persisted via the
    repository.
    """
    target, data_set, min_similarity, offset, limit = args
    logger.info("{} {}".format(offset, limit))
    repository = Repository(data_set=data_set)
    progress = Progress(limit - 1)
    candidates = repository.get_users_products(offset=offset, limit=limit - 1)
    for chunk in batch(candidates, 1000):
        for other in chunk:
            # Never compare a user with themselves.
            if target['_id'] == other['_id']:
                continue

            progress.advance()
            similarity, common, extra1, extra2 = calculate_products_similarity(
                target['products'], other['products'])
            if similarity < min_similarity:
                continue

            repository.add_users_similarity(
                dict(user1_id=target['user_id'],
                     user2_id=other['user_id'],
                     similarity=similarity,
                     common_products=common,
                     add_products1=extra1,
                     add_products2=extra2))

        logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                            progress.get_estimated_time()))
Example #2
0
def analyze_orders_similarity_multi(data_set, samples, orders, last_order_id,
                                    user_id):
    """Fan the order-similarity analysis for one user out over a process pool.

    Args:
        data_set: identifies the data set the Repository reads/writes.
        samples: number of candidate orders each order is compared against;
            the range is split into ``step``-sized windows, one task each.
        orders: total order count, used only to size the progress tracker.
        last_order_id: id of the most recently processed order (logged so a
            run can be resumed); updated as orders are dispatched.
        user_id: user whose orders are analyzed.
    """
    repository = Repository(data_set=data_set)
    progress = Progress(orders)
    min_similarity = 0.2

    processes = 5
    pool = multiprocessing.Pool(processes=processes)
    step = math.ceil(samples / processes)

    logger.info("Last order {}".format(last_order_id))

    try:
        # Loop variable renamed from ``orders``: it previously shadowed the
        # ``orders`` parameter that sized the progress tracker above.
        for order_chunk in batch(
                repository.get_orders_for_user(user_id=user_id), 10):
            tasks = []
            for order in order_chunk:
                progress.advance()
                last_order_id = order['_id']
                # One task per sample window so work is spread evenly
                # across the pool's worker processes.
                for from_sample in range(0, samples, step):
                    tasks.append(
                        (order, data_set, min_similarity, from_sample, step))

            logger.info("Last order {}".format(last_order_id))
            pool.map(_analyze_orders_similarity, tasks)
            logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                                progress.get_estimated_time()))
    finally:
        # Always reap worker processes, even if a batch or task raises;
        # previously an exception would leak the pool.
        pool.close()
        pool.join()
Example #3
0
def load_products(data_set):
    """Stream products from the reader into the repository, 100 at a time."""
    reader = Reader(data_set=data_set)
    repository = Repository(data_set=data_set)

    total = 0
    logger.info("Loading products")
    for chunk in batch(reader.load_products(), 100):
        repository.add_products(chunk)
        total += len(chunk)
        logger.info("Loaded products {}".format(total))
Example #4
0
def analyze_orders_similarity(data_set, samples):
    """Pairwise-compare the first *samples* orders and store best matches.

    For each of the first ``samples - 1`` orders, scans every later order
    (via an increasing ``offset`` so each unordered pair is visited once),
    keeps the single most similar one whose similarity exceeds 0.2, and
    persists it through the repository.

    Args:
        data_set: identifies the data set the Repository reads/writes.
        samples: number of orders to include in the pairwise comparison.
    """
    repository = Repository(data_set=data_set)
    # Total comparisons for n samples: n*(n-1)/2 unordered pairs.
    progress = Progress(math.ceil(((samples - 1) * samples) / 2))
    similarity_threshold = 0.2
    offset = 1
    for orders1 in batch(repository.get_orders(limit=samples - 1), 100):
        for o1 in orders1:
            # Track the best match above the threshold for this order.
            max_similarity = similarity_threshold
            similar = None
            # Removed unused local ``count = 0`` from the original.
            for orders2 in batch(
                    repository.get_orders(offset=offset,
                                          limit=samples - offset), 100):
                for o2 in orders2:
                    progress.advance()
                    similarity, common, additional1, additional2 = calculate_products_similarity(
                        o1['products'], o2['products'])
                    if similarity > max_similarity:
                        max_similarity = similarity
                        similar = dict(order1_id=o1['order_id'],
                                       user1_id=o1['user_id'],
                                       order2_id=o2['order_id'],
                                       user2_id=o2['user_id'],
                                       similarity=similarity,
                                       common_products=common,
                                       add_products1=additional1,
                                       add_products2=additional2)
                        logger.info("Similarity {} {} {}".format(
                            similar['user1_id'], similar['user2_id'],
                            similarity))

            if similar is not None:
                repository.add_orders_similarity(similar)

            offset += 1
            logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                                progress.get_estimated_time()))
Example #5
0
def analyze_users_similarity(data_set, samples):
    """For each of the first ``samples - 1`` users, find the single most
    similar later user (similarity must exceed 0.1) and persist that pair.
    """
    repository = Repository(data_set=data_set)

    # n*(n-1)/2 unordered pairs in total.
    progress = Progress(math.ceil(((samples - 1) * samples) / 2))

    offset = 1
    for outer_chunk in batch(
            repository.get_user_products(limit=samples - 1), 100):
        for candidate in outer_chunk:
            best_score = 0.1
            best_match = None
            for inner_chunk in batch(
                    repository.get_user_products(offset=offset,
                                                 limit=samples - offset), 100):
                for other in inner_chunk:
                    progress.advance()
                    score, common, extra1, extra2 = calculate_products_similarity(
                        candidate['products'], other['products'])
                    if score > best_score:
                        best_score = score
                        best_match = dict(user1_id=candidate['user_id'],
                                          user2_id=other['user_id'],
                                          similarity=score,
                                          common_products=common,
                                          add_products1=extra1,
                                          add_products2=extra2)
                        logger.info("{} {} {}".format(best_match['user1_id'],
                                                      best_match['user2_id'],
                                                      score))

            if best_match is not None:
                repository.add_users_similarity(best_match)

            offset += 1
            logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                                progress.get_estimated_time()))
Example #6
0
def load_orders(data_set):
    """Load orders, attach each order's products, and persist in batches."""
    reader = Reader(data_set=data_set)
    repository = Repository(data_set=data_set)

    loaded = 0
    logger.info("Loading orders")
    for chunk in batch(reader.load_orders(), 100):
        enriched = []
        for order in chunk:
            products = repository.find_order_products(order['order_id'])
            products = [item.copy() for item in products]
            # Orders with no resolvable products are skipped entirely.
            if not products:
                continue

            order['products'] = products
            enriched.append(order)

        if enriched:
            repository.add_orders(enriched)

        loaded += len(enriched)
        logger.info("Loaded orders {}".format(loaded))
Example #7
0
def analyze_products_by_user(data_set):
    """Aggregate, per user, the products they bought and store the result."""
    repository = Repository(data_set=data_set)
    users = repository.get_users()
    total = len(users)
    done = 0
    for id_chunk in batch(users, 100):
        rows = []
        for uid in id_chunk:
            bought = repository.get_products_bought_by_user(uid)
            rows.append(
                dict(user_id=uid,
                     products=[
                         dict(product_id=p['_id'], count=p['count'])
                         for p in bought
                     ]))

            done += 1
            logger.info("{}/{}".format(done, total))

        repository.add_user_products(rows)