import logging
import math
import multiprocessing
from itertools import islice

# Repository and Reader are this project's data-access classes; they are
# defined elsewhere in the repo and their import path is not shown in this
# section.

logger = logging.getLogger(__name__)


def _analyze_users_similarity(args):
    # Pool worker: compares one user against a slice of the sample and
    # stores every pair whose similarity clears the threshold.
    user, data_set, min_similarity, offset, limit = args
    logger.info("{} {}".format(offset, limit))
    repository = Repository(data_set=data_set)
    progress = Progress(limit - 1)
    for users in batch(
            repository.get_users_products(offset=offset, limit=limit - 1),
            1000):
        for user2 in users:
            if user['_id'] == user2['_id']:
                continue
            progress.advance()
            similarity, common, additional1, additional2 = calculate_products_similarity(
                user['products'], user2['products'])
            if similarity >= min_similarity:
                similar = dict(user1_id=user['user_id'],
                               user2_id=user2['user_id'],
                               similarity=similarity,
                               common_products=common,
                               add_products1=additional1,
                               add_products2=additional2)
                repository.add_users_similarity(similar)
        logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                            progress.get_estimated_time()))
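
# calculate_products_similarity is assumed by the analyzers in this module
# but its body is not part of this section. A minimal Jaccard-style sketch of
# the expected contract (a score plus common/extra product lists), assuming
# each product is a dict with a 'product_id' key; the repo's actual
# implementation may weight purchase counts differently:
def calculate_products_similarity(products1, products2):
    ids1 = {p['product_id'] for p in products1}
    ids2 = {p['product_id'] for p in products2}
    common = ids1 & ids2
    union = ids1 | ids2
    similarity = len(common) / len(union) if union else 0.0
    additional1 = sorted(ids1 - ids2)  # bought only by the first side
    additional2 = sorted(ids2 - ids1)  # bought only by the second side
    return similarity, sorted(common), additional1, additional2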

def analyze_orders_similarity_multi(data_set, samples, orders, last_order_id,
                                    user_id):
    # Fans the pairwise comparison out over a small process pool: each of a
    # user's orders is compared against the sample in step-sized slices.
    repository = Repository(data_set=data_set)
    progress = Progress(orders)
    min_similarity = 0.2
    processes = 5
    pool = multiprocessing.Pool(processes=processes)
    step = math.ceil(samples / processes)
    logger.info("Last order {}".format(last_order_id))
    # Note: the loop variable is named user_orders so it does not shadow the
    # `orders` parameter used as the Progress total above.
    for user_orders in batch(repository.get_orders_for_user(user_id=user_id),
                             10):
        tasks = []
        for order in user_orders:
            progress.advance()
            last_order_id = order['_id']
            for from_sample in range(0, samples, step):
                tasks.append(
                    (order, data_set, min_similarity, from_sample, step))
        logger.info("Last order {}".format(last_order_id))
        pool.map(_analyze_orders_similarity, tasks)
        logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                            progress.get_estimated_time()))
    pool.close()
    pool.join()
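
# _analyze_orders_similarity is the pool worker referenced above; its body is
# not shown in this section. A minimal sketch mirroring
# _analyze_users_similarity, assuming the same get_orders(offset=..., limit=...)
# accessor used by analyze_orders_similarity below; the real worker may differ:
def _analyze_orders_similarity(args):
    order, data_set, min_similarity, offset, limit = args
    repository = Repository(data_set=data_set)
    for others in batch(repository.get_orders(offset=offset, limit=limit),
                        1000):
        for order2 in others:
            if order['_id'] == order2['_id']:
                continue
            similarity, common, additional1, additional2 = calculate_products_similarity(
                order['products'], order2['products'])
            if similarity >= min_similarity:
                repository.add_orders_similarity(
                    dict(order1_id=order['order_id'],
                         user1_id=order['user_id'],
                         order2_id=order2['order_id'],
                         user2_id=order2['user_id'],
                         similarity=similarity,
                         common_products=common,
                         add_products1=additional1,
                         add_products2=additional2))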

def load_products(data_set):
    # Streams products from the raw data set into the repository in
    # batches of 100.
    reader = Reader(data_set=data_set)
    repository = Repository(data_set=data_set)
    loaded = 0
    logger.info("Loading products")
    for products in batch(reader.load_products(), 100):
        repository.add_products(products)
        loaded += len(products)
        logger.info("Loaded products {}".format(loaded))
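
# The batch() helper used throughout this module is not shown in this
# section. A minimal sketch of the expected behaviour (yield lists of up to
# `size` items from any iterable), built on itertools.islice:
def batch(iterable, size):
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk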

def analyze_orders_similarity(data_set, samples):
    # Single-process pairwise comparison over a sample of orders; for each
    # order only the best match above the threshold is stored.
    repository = Repository(data_set=data_set)
    progress = Progress(math.ceil(((samples - 1) * samples) / 2))
    similarity_threshold = 0.2
    offset = 1
    for orders1 in batch(repository.get_orders(limit=samples - 1), 100):
        for o1 in orders1:
            max_similarity = similarity_threshold
            similar = None
            for orders2 in batch(
                    repository.get_orders(offset=offset,
                                          limit=samples - offset), 100):
                for o2 in orders2:
                    progress.advance()
                    similarity, common, additional1, additional2 = calculate_products_similarity(
                        o1['products'], o2['products'])
                    if similarity > max_similarity:
                        max_similarity = similarity
                        similar = dict(order1_id=o1['order_id'],
                                       user1_id=o1['user_id'],
                                       order2_id=o2['order_id'],
                                       user2_id=o2['user_id'],
                                       similarity=similarity,
                                       common_products=common,
                                       add_products1=additional1,
                                       add_products2=additional2)
                        logger.info("Similarity {} {} {}".format(
                            similar['user1_id'], similar['user2_id'],
                            similarity))
            if similar is not None:
                repository.add_orders_similarity(similar)
            offset += 1
            logger.info("{:.1f}% ETA {}".format(
                progress.get_progress(), progress.get_estimated_time()))
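
# Sanity check for the Progress total used by the analyzers: with the
# sliding offset, sample i is only compared with samples i+1..n, so the
# number of comparisons is (n - 1) + (n - 2) + ... + 1 = n * (n - 1) / 2.
# For example, samples = 4 gives 3 + 2 + 1 = 6 == math.ceil((4 - 1) * 4 / 2).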

def analyze_users_similarity(data_set, samples):
    # Same sliding-offset scheme as analyze_orders_similarity, but over the
    # aggregated per-user baskets and with a lower similarity floor of 0.1.
    repository = Repository(data_set=data_set)
    progress = Progress(math.ceil(((samples - 1) * samples) / 2))
    offset = 1
    for user_products1 in batch(
            repository.get_user_products(limit=samples - 1), 100):
        for up1 in user_products1:
            max_similarity = 0.1
            similar = None
            for user_products2 in batch(
                    repository.get_user_products(offset=offset,
                                                 limit=samples - offset),
                    100):
                for up2 in user_products2:
                    progress.advance()
                    similarity, common, additional1, additional2 = calculate_products_similarity(
                        up1['products'], up2['products'])
                    if similarity > max_similarity:
                        max_similarity = similarity
                        similar = dict(user1_id=up1['user_id'],
                                       user2_id=up2['user_id'],
                                       similarity=similarity,
                                       common_products=common,
                                       add_products1=additional1,
                                       add_products2=additional2)
                        logger.info("{} {} {}".format(similar['user1_id'],
                                                      similar['user2_id'],
                                                      similarity))
            if similar is not None:
                repository.add_users_similarity(similar)
            offset += 1
            logger.info("{:.1f}% ETA {}".format(
                progress.get_progress(), progress.get_estimated_time()))
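
# Progress is used across this module through a three-method contract:
# advance(), get_progress() (percent done) and get_estimated_time(). A
# minimal time-based sketch of that contract; the repo's own class may
# compute the ETA differently:
import time
from datetime import timedelta


class Progress:
    def __init__(self, total):
        self.total = max(total, 1)
        self.done = 0
        self.started = time.time()

    def advance(self):
        self.done += 1

    def get_progress(self):
        # Percent complete, as consumed by the "{:.1f}%" log format.
        return 100.0 * self.done / self.total

    def get_estimated_time(self):
        # Linear extrapolation of the remaining time from the average
        # per-item cost so far.
        if self.done == 0:
            return "unknown"
        elapsed = time.time() - self.started
        remaining = elapsed * (self.total - self.done) / self.done
        return str(timedelta(seconds=int(remaining)))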

def load_orders(data_set):
    # Streams orders from the raw data set, attaches each order's product
    # list, and skips orders that have no products.
    reader = Reader(data_set=data_set)
    repository = Repository(data_set=data_set)
    loaded = 0
    logger.info("Loading orders")
    for orders in batch(reader.load_orders(), 100):
        orders_products = []
        for order in orders:
            order_products = repository.find_order_products(order['order_id'])
            order_products = [p.copy() for p in order_products]
            if not order_products:
                continue
            order['products'] = order_products
            orders_products.append(order)
        if orders_products:
            repository.add_orders(orders_products)
            loaded += len(orders_products)
            logger.info("Loaded orders {}".format(loaded))

def analyze_products_by_user(data_set):
    # Aggregates, per user, the products they have bought (with counts) and
    # stores the result for the user-similarity analysis.
    repository = Repository(data_set=data_set)
    users = repository.get_users()
    count = 0
    total = len(users)
    for user_ids in batch(users, 100):
        users_products = []
        for user_id in user_ids:
            bought = repository.get_products_bought_by_user(user_id)
            users_products.append(
                dict(user_id=user_id,
                     products=[dict(product_id=p['_id'], count=p['count'])
                               for p in bought]))
            count += 1
            logger.info("{}/{}".format(count, total))
        repository.add_user_products(users_products)
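
# A hedged end-to-end usage sketch. The pipeline order is implied by the
# functions above (products and orders must be loaded and aggregated before
# the similarity passes run); the data-set name and sample size below are
# assumptions, not values taken from this repo. The __main__ guard also
# keeps the multiprocessing entry points safe under spawn-based platforms.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    data_set = "example"  # hypothetical data-set name
    load_products(data_set)
    load_orders(data_set)
    analyze_products_by_user(data_set)
    analyze_users_similarity(data_set, samples=1000)   # sample size assumed
    analyze_orders_similarity(data_set, samples=1000)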