def _analyze_users_similarity(args):
    """Compare one user against a slice of users and store the close matches.

    ``args`` is a single 5-item sequence, unpacked in this order:

    * ``user`` -- mapping read through its ``'_id'``, ``'user_id'`` and
      ``'products'`` keys.
    * ``data_set`` -- forwarded to ``Repository(data_set=...)``.
    * ``min_similarity`` -- inclusive threshold; a pair is stored when its
      similarity is ``>=`` this value.
    * ``offset``, ``limit`` -- forwarded to
      ``repository.get_users_products`` as ``offset=offset`` and
      ``limit=limit - 1``.

    Every matching pair is written with ``repository.add_users_similarity``.
    Nothing is returned.

    NOTE(review): the single packed argument presumably exists so the
    function can be handed to a pool-style ``map`` -- confirm with the caller.
    """
    user, data_set, min_similarity, offset, limit = args
    logger.info("{} {}".format(offset, limit))

    repository = Repository(data_set=data_set)
    candidate_count = limit - 1
    progress = Progress(candidate_count)
    candidates = repository.get_users_products(
        offset=offset, limit=candidate_count)

    for chunk in batch(candidates, 1000):
        for other in chunk:
            # Never compare the user with their own record; such a record
            # does not advance the progress counter either.
            if user['_id'] == other['_id']:
                continue

            progress.advance()
            score, shared, extra1, extra2 = calculate_products_similarity(
                user['products'], other['products'])

            if score >= min_similarity:
                repository.add_users_similarity({
                    'user1_id': user['user_id'],
                    'user2_id': other['user_id'],
                    'similarity': score,
                    'common_products': shared,
                    'add_products1': extra1,
                    'add_products2': extra2,
                })

        # NOTE(review): progress is reported once per processed chunk; the
        # original nesting of this statement should be confirmed.
        percent_done = progress.get_progress()
        eta = progress.get_estimated_time()
        logger.info("{:.1f}% ETA {}".format(percent_done, eta))
def analyze_users_similarity(data_set, samples):
    """Find and store the most similar partner for each sampled user.

    The outer query is ``repository.get_user_products(limit=samples - 1)``.
    For each user it yields, the inner query
    ``repository.get_user_products(offset=offset, limit=samples - offset)``
    supplies the users to compare against; ``offset`` starts at 1 and grows
    by one per outer user. The partner with the highest similarity strictly
    above ``0.1`` is written with ``repository.add_users_similarity``; when no
    partner exceeds that floor nothing is stored for the user.

    Args:
        data_set: forwarded to ``Repository(data_set=...)``.
        samples: number of users to consider; it sizes both queries and the
            progress total ``ceil((samples - 1) * samples / 2)``.

    Returns:
        None.
    """
    repository = Repository(data_set=data_set)
    pair_count = math.ceil(((samples - 1) * samples) / 2)
    progress = Progress(pair_count)
    offset = 1

    outer_users = repository.get_user_products(limit=samples - 1)
    for outer_chunk in batch(outer_users, 100):
        for up1 in outer_chunk:
            # Best match so far; 0.1 is the floor a pair must exceed.
            best_score = 0.1
            best_pair = None

            inner_users = repository.get_user_products(
                offset=offset, limit=samples - offset)
            for inner_chunk in batch(inner_users, 100):
                for up2 in inner_chunk:
                    progress.advance()
                    score, shared, extra1, extra2 = (
                        calculate_products_similarity(
                            up1['products'], up2['products']))

                    if score > best_score:
                        best_score = score
                        best_pair = {
                            'user1_id': up1['user_id'],
                            'user2_id': up2['user_id'],
                            'similarity': score,
                            'common_products': shared,
                            'add_products1': extra1,
                            'add_products2': extra2,
                        }
                        logger.info("{} {} {}".format(
                            best_pair['user1_id'],
                            best_pair['user2_id'],
                            score))

            if best_pair is not None:
                repository.add_users_similarity(best_pair)

            offset += 1
            # NOTE(review): progress is reported once per outer user; the
            # original nesting of this statement should be confirmed.
            percent_done = progress.get_progress()
            eta = progress.get_estimated_time()
            logger.info("{:.1f}% ETA {}".format(percent_done, eta))