def find_for_users(self, users_plays):
    """Score every entry of ``users_plays`` and write the results in batches.

    Each entry is passed to ``self.find_for_user``; the result is stored
    together with the entry's ``'_id'`` value.  The collected results are
    handed to ``self.write_scores_to_file`` (through
    ``invoke_measurable_task``) after every 10000th entry, and once more
    after the loop for whatever is left -- which may be an empty list.

    A progress line is printed for the first entry and for every 100th one.
    The method has no return statement.
    """
    scores = []

    def flush():
        # ``scores`` is read from the enclosing scope when flush() runs, so
        # the message and the written data always refer to the current batch.
        invoke_measurable_task(
            lambda: self.write_scores_to_file(scores),
            "Writing %d scores to file..." % len(scores))

    for current_user_number, user_plays in enumerate(users_plays, 1):
        if current_user_number == 1 or current_user_number % 100 == 0:
            print('Looking for knn for %s. user' % current_user_number)
        scores.append({
            'user_id': user_plays['_id'],
            'scores': self.find_for_user(user_plays),
        })
        if current_user_number % 10000 == 0:
            flush()
            scores = []  # start a fresh batch after writing

    # Write the remainder of the last (partial) batch.
    flush()
for score in scores[0]['scores']: file.write(' ' + str(score['user_id'])) return scores with MongoClient('localhost', MONGODB_PORT) as client: db = client.local plays_for_all_users = [1] def load_plays_for_all_users(): plays_for_all_users[0] = list(db.plays_by_user_filtered_t.find()) invoke_measurable_task(load_plays_for_all_users, 'Load plays for all users') plays_for_all_users = plays_for_all_users[0] plays_for_validated_users = [1] def load_plays_for_all_users(): plays_for_validated_users[0] = list( db.plays_by_user_filtered_v.find().limit(NUMBER_OF_USERS)) invoke_measurable_task(load_plays_for_all_users, 'Load plays for validated users') plays_for_validated_users = plays_for_validated_users[0] knn = JaccardBasedKnn(plays_for_all_users, KNN_K) invoke_measurable_task( lambda: knn.find_for_users(plays_for_validated_users),
insert_batch(plays_batch, target_collection) plays_batch = [] if len(plays_batch) > 0: insert_batch(plays_batch, target_collection) with MongoClient('localhost', MONGODB_PORT) as client: db = client.local most_popular_songs = [1] def load_most_popular_songs(): most_popular_songs[0] = list(db.play_count_by_song_t.find().sort( 'value', DESCENDING).limit(NUMBER_OF_MOST_POPULAR_SONGS)) invoke_measurable_task( load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS) most_popular_songs = most_popular_songs[0] invoke_measurable_task( lambda: create_plays_by_user_filtered( db.plays_by_user_t, most_popular_songs, db.plays_by_user_filtered_t ), "Create plays_by_user_filtered collection for train set") invoke_measurable_task( lambda: create_plays_by_user_filtered(db.plays_by_user_v, most_popular_songs, db.plays_by_user_filtered_v, True), "Create plays_by_user_filtered collection for validation set")
for line in knn_results: print i knn_map.append(get_data_from_fileline(line)) i += 1 with MongoClient('localhost', MONGODB_PORT) as client: db = client.local most_popular_songs = [1] def load_most_popular_songs(): most_popular_songs[0] = list( db.play_count_by_song_t.find() .sort('value', DESCENDING) .limit(NUMBER_OF_MOST_POPULAR_SONGS) ) invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS) most_popular_songs = most_popular_songs[0] load_knn_results() with open(KAGGLE_USERS_MAPPING_FILE_PATH) as kaggle_users: with open(KNN_CONVERTED_PATH, 'w') as my_file: my_file.write('Id,Expected\n') j = 1 for index, user_id in enumerate(kaggle_users): my_file.write(user_id.strip() + ',') result = knn_map[index] index += 1 predicted_songs = 0 for song in result[1]:
print 'Query created' songs_group = plays_by_user_binary_t.aggregate(pipeline) with MongoClient('localhost', MONGODB_PORT) as client: db = client.local most_popular_songs = [1] def load_most_popular_songs(): most_popular_songs[0] = list( db.play_count_by_song_t.find() .sort('value', DESCENDING) .limit(NUMBER_OF_MOST_POPULAR_SONGS) ) invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS) most_popular_songs = most_popular_songs[0] print NUMBER_OF_MOST_POPULAR_SONGS, ' most popular songs selected\n' invoke_measurable_task( lambda: one_by_one(most_popular_songs, db.plays_by_user_binary_t), 'Group by most popular song one by one') invoke_measurable_task( lambda: pairs(most_popular_songs, db.plays_by_user_binary_t), 'Group by pairs of most popular songs') invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_SONGS_AT_ONE) most_popular_songs = most_popular_songs[0] print NUMBER_OF_SONGS_AT_ONE, 'most popular songs selected\n'
def plays_by_user_filtered():
    """Run ``create_plays_by_user_filtered.py`` with ``python`` via os.system."""
    os.system('python create_plays_by_user_filtered.py')


def delete_map_reduce_contetn():
    """Delete the sub-directories of ``./map_reduce/`` and the collections log.

    Every directory entry of the folder is removed recursively.  An exception
    raised while handling one entry is printed and the loop continues with
    the next entry.  Afterwards ``created_collections.txt`` is removed; any
    ``OSError`` from that removal (e.g. the file is missing) is ignored.
    """
    folder = './map_reduce/'
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:  # best-effort cleanup: report and carry on
            print(e)
    # NOTE(review): the flattened source does not show whether this removal
    # belongs to the function or to module level -- confirm the nesting.
    try:
        os.remove('./map_reduce/created_collections.txt')
    except OSError:
        pass  # deliberately ignored


# Script body: run every initialisation step in order.  The zero-argument
# step functions are passed directly instead of being wrapped in a lambda.
print('INITIALIZING DATABASE')
invoke_measurable_task(import_triplets_to_mongo_db,
                       'Import triplets to mongo db')
invoke_measurable_task(create_map_reduce_collections,
                       'Create Map Reduce collections')
invoke_measurable_task(plays_by_user_simple,
                       'Create collection plays_by_user_simple')
invoke_measurable_task(plays_by_user_binary,
                       'Create collection plays_by_user_binary')
invoke_measurable_task(plays_by_user_filtered,
                       'Create collection plays_by_user_filtered')
def get_triplet_from_fileline(line):
    """Turn one whitespace-separated text line into a triplet dict.

    The first field is a user id.  If it is not yet a key of ``users_map``,
    ``last_index[0]`` is incremented and the new value is registered as that
    user's index.  The second field is looked up in ``songs_map`` and the
    third field is converted with ``int``.  ``users_map``, ``last_index`` and
    ``songs_map`` are names defined outside this block.

    Returns a dict with the keys ``'user_index'``, ``'song_index'`` and
    ``'play_count'``.
    """
    parts = line.strip().split()
    user_id = parts[0]

    # Register users seen for the first time before resolving the index.
    if user_id not in users_map:
        last_index[0] += 1
        users_map[user_id] = last_index[0]
    user_index = users_map[user_id]

    return {
        'user_index': user_index,
        'song_index': songs_map[parts[1]],
        'play_count': int(parts[2]),
    }


# Script body: load both mappings first, then import the triplet files.
invoke_measurable_task(load_kaggle_users_mapping,
                       'Load Kaggle users mapping')
invoke_measurable_task(load_kaggle_songs_mapping,
                       'Load Kaggle songs mapping')

with MongoClient('localhost', MONGODB_PORT) as client:
    db = client.local
    # Both imports write to collections of the ``local`` database.
    invoke_measurable_task(
        lambda: import_triplets_from_file(TRAIN_TRIPLETS_FILE_PATH,
                                          db.triplets_t),
        'Import train triplets')
    invoke_measurable_task(
        lambda: import_triplets_from_file(TEST_TRIPLETS_FILE_PATH,
                                          db.triplets_v),
        'Import validation triplets')
reduce = reduce_file.read() finalize_path = '%s/finalize.js' % directory if exists(finalize_path): with open(finalize_path) as finalize_file: finalize = finalize_file.read() else: finalize = None collection_path = '%s/source_collection.txt' % directory if exists(collection_path): with open(collection_path) as collection_file: source_collection_name = collection_file.read() else: source_collection_name = 'triplets' with MongoClient('localhost', MONGODB_PORT) as client: db = client.local invoke_measurable_task( lambda: db[source_collection_name + '_t'].map_reduce( map, reduce, directory + '_t', finalize=finalize), "Create map reduce collection %s for train set" % directory) invoke_measurable_task( lambda: db[source_collection_name + '_v'].map_reduce( map, reduce, directory + '_v', finalize=finalize), "Create map reduce collection %s for validation set" % directory) with open(CREATED_COLLECTIONS_FILE_PATH, 'a') as file: file.write('%s\n' % directory)
current_user_number = 1 for user_play in users_plays: print 'Looking for knn for %s. user' % current_user_number scores.append({'user_id': user_play['_id'], 'scores': self.find_for_user(set(user_play['value']))}) current_user_number += 1 with open('../results.txt', 'w') as file: file.write(str(scores[0]['user_id'])) for score in scores[0]['scores']: file.write(' ' + str(score['user_id'])) return scores with MongoClient('localhost', MONGODB_PORT) as client: db = client.local plays_for_all_users = [1] def load_plays_for_all_users(): plays_for_all_users[0] = list(db.plays_by_user_filtered_t.find()) invoke_measurable_task(load_plays_for_all_users, 'Load plays for all users') plays_for_all_users = plays_for_all_users[0] plays_for_validated_users = [1] def load_plays_for_all_users(): plays_for_validated_users[0] = list(db.plays_by_user_filtered_v.find().limit(NUMBER_OF_USERS)) invoke_measurable_task(load_plays_for_all_users, 'Load plays for validated users') plays_for_validated_users = plays_for_validated_users[0] knn = JaccardBasedKnn(plays_for_all_users, KNN_K) invoke_measurable_task(lambda: knn.find_for_users(plays_for_validated_users), 'Find knn for %d users' % NUMBER_OF_USERS)
most_often_played_song = plays[0] if most_often_played_song in buckets: bucket = buckets[most_often_played_song] else: bucket = [] buckets[most_often_played_song] = bucket bucket.append({'_id': user_id, 'value': plays}) i += 1 print "Number of buckets: %d" % len(buckets) target_collection = db[target_collection_name] batch = [{'_id': bucket_id, 'value': value} for bucket_id, value in buckets.iteritems()] print "Buckets batch ready" start = 0 end = BATCH_SIZE while start < len(batch): insert_batch(batch[start:end], target_collection) start += BATCH_SIZE end += BATCH_SIZE with MongoClient('localhost', MONGODB_PORT) as client: db = client.local invoke_measurable_task( lambda: create_plays_by_most_often_played_song(db, 'plays_by_user_simple_t', 'plays_by_most_often_played_song_t'), 'Create plays_by_most_often_played_song collection for train set')
else: bucket = [] buckets[most_often_played_song] = bucket bucket.append({'_id': user_id, 'value': plays}) i += 1 print "Number of buckets: %d" % len(buckets) target_collection = db[target_collection_name] batch = [{ '_id': bucket_id, 'value': value } for bucket_id, value in buckets.iteritems()] print "Buckets batch ready" start = 0 end = BATCH_SIZE while start < len(batch): insert_batch(batch[start:end], target_collection) start += BATCH_SIZE end += BATCH_SIZE with MongoClient('localhost', MONGODB_PORT) as client: db = client.local invoke_measurable_task( lambda: create_plays_by_most_often_played_song( db, 'plays_by_user_simple_t', 'plays_by_most_often_played_song_t'), 'Create plays_by_most_often_played_song collection for train set')
for plays_by_user in plays_by_user.find(): entry = { '_id': plays_by_user['_id'], 'value': [song_id for song_id in plays_by_user['value']] } # sort by plays count entry['value'].sort(key=lambda x: plays_by_user['value'][x], reverse=True) if len(entry['value']) > 1: plays_batch.append(entry) if len(plays_batch) > 0 and len(plays_batch) % BATCH_SIZE == 0: insert_batch(plays_batch, target_collection) plays_batch = [] if len(plays_batch) > 0: insert_batch(plays_batch, target_collection) with MongoClient('localhost', MONGODB_PORT) as client: db = client.local invoke_measurable_task( lambda: create_plays_by_user_filtered_simple(db.plays_by_user_t, db.plays_by_user_simple_t), "Create plays_by_user_simple_filtered collection for train set") invoke_measurable_task( lambda: create_plays_by_user_filtered_simple(db.plays_by_user_v, db.plays_by_user_simple_v), "Create plays_by_user_simple_filtered collection for validation set")
print 'Query created' songs_group = plays_by_user_binary_t.aggregate(pipeline) with MongoClient('localhost', MONGODB_PORT) as client: db = client.local most_popular_songs = [1] def load_most_popular_songs(): most_popular_songs[0] = list(db.play_count_by_song_t.find().sort( 'value', DESCENDING).limit(NUMBER_OF_MOST_POPULAR_SONGS)) invoke_measurable_task( load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS) most_popular_songs = most_popular_songs[0] print NUMBER_OF_MOST_POPULAR_SONGS, ' most popular songs selected\n' invoke_measurable_task( lambda: one_by_one(most_popular_songs, db.plays_by_user_binary_t), 'Group by most popular song one by one') invoke_measurable_task( lambda: pairs(most_popular_songs, db.plays_by_user_binary_t), 'Group by pairs of most popular songs') invoke_measurable_task( load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_SONGS_AT_ONE)
def plays_by_user_filtered():
    """Run ``create_plays_by_user_filtered.py`` with ``python`` via os.system."""
    os.system('python create_plays_by_user_filtered.py')


def delete_map_reduce_contetn():
    """Delete the sub-directories of ``./map_reduce/`` and the collections log.

    Every directory entry of the folder is removed recursively.  An exception
    raised while handling one entry is printed and the loop continues with
    the next entry.  Afterwards ``created_collections.txt`` is removed; any
    ``OSError`` from that removal (e.g. the file is missing) is ignored.
    """
    folder = './map_reduce/'
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:  # best-effort cleanup: report and carry on
            print(e)
    # NOTE(review): the flattened source does not show whether this removal
    # belongs to the function or to module level -- confirm the nesting.
    try:
        os.remove('./map_reduce/created_collections.txt')
    except OSError:
        pass  # deliberately ignored


# Script body: run every initialisation step in order.  The zero-argument
# step functions are passed directly instead of being wrapped in a lambda.
print('INITIALIZING DATABASE')
invoke_measurable_task(import_triplets_to_mongo_db,
                       'Import triplets to mongo db')
invoke_measurable_task(create_map_reduce_collections,
                       'Create Map Reduce collections')
invoke_measurable_task(plays_by_user_simple,
                       'Create collection plays_by_user_simple')
invoke_measurable_task(plays_by_user_binary,
                       'Create collection plays_by_user_binary')
invoke_measurable_task(plays_by_user_filtered,
                       'Create collection plays_by_user_filtered')
if len(plays_batch) % BATCH_SIZE == 0: insert_batch(plays_batch, target_collection) plays_batch = [] if len(plays_batch) > 0: insert_batch(plays_batch, target_collection) with MongoClient('localhost', MONGODB_PORT) as client: db = client.local most_popular_songs = [1] def load_most_popular_songs(): most_popular_songs[0] = list( db.play_count_by_song_t.find() .sort('value', DESCENDING) .limit(NUMBER_OF_MOST_POPULAR_SONGS) ) invoke_measurable_task(load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS) most_popular_songs = most_popular_songs[0] invoke_measurable_task( lambda: create_plays_by_user_binary(db.plays_by_user_t, most_popular_songs, db.plays_by_user_binary_t), "Create plays_by_user_binary collection for train set") invoke_measurable_task( lambda: create_plays_by_user_binary(db.plays_by_user_v, most_popular_songs, db.plays_by_user_binary_v), "Create plays_by_user_binary collection for validation set")
song_number = 1 for song_id in songs.iterkeys(): if song_number >= 500: break file.write(str(' ' + str(song_id))) song_number += 1 file.write('\n') buckets = [1] plays_for_validated_users = [1] with MongoClient('localhost', MONGODB_PORT) as client: db = client.local def load_buckets(): buckets[0] = {x['_id']: x['value'] for x in list(db.plays_by_most_often_played_song_t.find())} invoke_measurable_task(load_buckets, 'Load buckets') def load_plays_for_all_users(): plays_for_validated_users[0] = list(db.plays_by_user_simple_v.find().limit(NUMBER_OF_USERS)) invoke_measurable_task(load_plays_for_all_users, 'Load plays for validated users') buckets = buckets[0] plays_for_validated_users = plays_for_validated_users[0] def find_knn_scores(): knn = LshOptimizedJaccardBasedKnn(buckets, KNN_K, MIN_SIMILARITY) knn.find_for_users(plays_for_validated_users) invoke_measurable_task(find_knn_scores, 'Find knn for %d users' % NUMBER_OF_USERS)
naive_bayes(arguments) def save_to_file(collection): f = open(WORK_FILE, 'a') f.truncate() for song in collection: for song_id in song[u'value']: val = (song[u'value'][song_id]) f.seek(0) f.write(str(val)) f.write(' ') f.write('\n') f.close() with MongoClient('localhost', MONGODB_PORT) as client: collection = [1] def load_collection(): collection[0] = list(db.plays_by_user_binary_t.find().limit(COUNT)) db = client.local invoke_measurable_task(lambda: load_collection(), 'Get collection') invoke_measurable_task(lambda: save_to_file(collection[0]), 'Save collection to file') invoke_measurable_task(lambda: learn_bayes(WORK_FILE), 'Teach Naive Bayes')
f.truncate() for song in collection: for song_id in song[u'value']: val = (song[u'value'][song_id]) f.seek(0) f.write(str(val)) f.write(' ') f.write('\n') f.close() with MongoClient('localhost', MONGODB_PORT) as client: collection = [1] def load_collection(): collection[0] = list( db.plays_by_user_binary_t.find().limit(COUNT) ) db = client.local invoke_measurable_task( lambda: load_collection(), 'Get collection') invoke_measurable_task( lambda: save_to_file(collection[0]), 'Save collection to file') invoke_measurable_task( lambda: learn_bayes(WORK_FILE), 'Teach Naive Bayes')
for line in knn_results: print i knn_map.append(get_data_from_fileline(line)) i += 1 with MongoClient('localhost', MONGODB_PORT) as client: db = client.local most_popular_songs = [1] def load_most_popular_songs(): most_popular_songs[0] = list(db.play_count_by_song_t.find().sort( 'value', DESCENDING).limit(NUMBER_OF_MOST_POPULAR_SONGS)) invoke_measurable_task( load_most_popular_songs, 'Load %d most popular songs' % NUMBER_OF_MOST_POPULAR_SONGS) most_popular_songs = most_popular_songs[0] load_knn_results() with open(KAGGLE_USERS_MAPPING_FILE_PATH) as kaggle_users: with open(KNN_CONVERTED_PATH, 'w') as my_file: my_file.write('Id,Expected\n') j = 1 for index, user_id in enumerate(kaggle_users): my_file.write(user_id.strip() + ',') result = knn_map[index] index += 1 predicted_songs = 0
'_id': plays_by_user['_id'], 'value': [song_id for song_id in plays_by_user['value']] } # sort by plays count entry['value'].sort(key=lambda x: plays_by_user['value'][x], reverse=True) if len(entry['value']) > 1: plays_batch.append(entry) if len(plays_batch) > 0 and len(plays_batch) % BATCH_SIZE == 0: insert_batch(plays_batch, target_collection) plays_batch = [] if len(plays_batch) > 0: insert_batch(plays_batch, target_collection) with MongoClient('localhost', MONGODB_PORT) as client: db = client.local invoke_measurable_task( lambda: create_plays_by_user_filtered_simple(db.plays_by_user_t, db. plays_by_user_simple_t), "Create plays_by_user_simple_filtered collection for train set") invoke_measurable_task( lambda: create_plays_by_user_filtered_simple(db.plays_by_user_v, db. plays_by_user_simple_v), "Create plays_by_user_simple_filtered collection for validation set")