Beispiel #1
0
def _analyze_users_similarity(args):
    """Compare one user against a window of users and record close matches.

    ``args`` is one packed 5-tuple ``(user, data_set, min_similarity,
    offset, limit)``. Taking a single argument presumably lets this run as
    a pool/map worker -- confirm against the caller.

    Each entry returned by ``repository.get_users_products`` for the given
    window is compared with ``user``; entries whose ``_id`` equals the
    user's own are skipped. Pairs whose similarity is at least
    ``min_similarity`` are handed to ``repository.add_users_similarity``.
    Progress is logged once per chunk produced by ``batch(..., 1000)``.
    """
    user, data_set, min_similarity, offset, limit = args
    logger.info("{} {}".format(offset, limit))

    repository = Repository(data_set=data_set)
    # The progress total and the query limit both use ``limit - 1``.
    progress = Progress(limit - 1)
    candidates = repository.get_users_products(offset=offset, limit=limit - 1)

    for chunk in batch(candidates, 1000):
        for other in chunk:
            if user['_id'] == other['_id']:
                # Never compare the user with itself (not counted as progress).
                continue

            progress.advance()
            result = calculate_products_similarity(user['products'],
                                                   other['products'])
            similarity, common, additional1, additional2 = result
            if similarity >= min_similarity:
                repository.add_users_similarity({
                    'user1_id': user['user_id'],
                    'user2_id': other['user_id'],
                    'similarity': similarity,
                    'common_products': common,
                    'add_products1': additional1,
                    'add_products2': additional2,
                })

        percent = progress.get_progress()
        eta = progress.get_estimated_time()
        logger.info("{:.1f}% ETA {}".format(percent, eta))
Beispiel #2
0
def analyze_users_similarity(data_set, samples):
    """Store, for each sampled user, its single best match among other users.

    The outer pass walks the entries returned by
    ``repository.get_user_products(limit=samples - 1)``. For the entry at
    1-based position ``offset`` the inner pass queries
    ``get_user_products(offset=offset, limit=samples - offset)`` and keeps
    the candidate with the highest similarity, provided it is strictly
    greater than the 0.1 floor. When such a candidate exists it is passed to
    ``repository.add_users_similarity``. Every new best match and the
    overall progress (once per outer entry) are logged.
    """
    repository = Repository(data_set=data_set)

    # Total number of pairwise comparisons performed by the two passes.
    pair_count = ((samples - 1) * samples) / 2
    progress = Progress(math.ceil(pair_count))

    # Flatten the batched outer query; no per-batch work happens here.
    first_users = (
        entry
        for chunk in batch(repository.get_user_products(limit=samples - 1),
                           100)
        for entry in chunk
    )

    for offset, up1 in enumerate(first_users, start=1):
        best_similarity = 0.1
        best_match = None

        # The query is issued right here, using the current ``offset``.
        later_users = (
            entry
            for chunk in batch(
                repository.get_user_products(offset=offset,
                                             limit=samples - offset),
                100)
            for entry in chunk
        )

        for up2 in later_users:
            progress.advance()
            result = calculate_products_similarity(up1['products'],
                                                   up2['products'])
            similarity, common, additional1, additional2 = result
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = {
                    'user1_id': up1['user_id'],
                    'user2_id': up2['user_id'],
                    'similarity': similarity,
                    'common_products': common,
                    'add_products1': additional1,
                    'add_products2': additional2,
                }
                logger.info("{} {} {}".format(best_match['user1_id'],
                                              best_match['user2_id'],
                                              similarity))

        if best_match is not None:
            repository.add_users_similarity(best_match)

        percent = progress.get_progress()
        eta = progress.get_estimated_time()
        logger.info("{:.1f}% ETA {}".format(percent, eta))