if __name__ == "__main__": #chunks = file_io.read_lastfm_user_art_file("data/userid-timestamp-artid-artname-traid-traname.tsv") chunks = file_io.read_lastfm_user_art_file("data/test_shorter.tsv") # read songs vectorizer = FeatureHasher() pre = Preprocessor(chunks, vectorizer) songs = pre.read_songs(20) print(songs) # reset file reader #chunks = file_io.read_lastfm_user_art_file("data/tmp.tsv") #pre.reset_file_reader(chunks) # read user song mapping pre.read_user_songs(1000) # convert to user-song matrix X = pre.get_user_song_matrix() start_time = time.time() clf = KNeighborsClassifier(n_neighbors=1) clf.fit(X, list(range(X.shape[0]))) print(clf.predict(pre.user_song_dict["user_000001"])) print("training and predict using {0}".format(time.time() - start_time)) #song_content = [ (artist,song) for (artist,song) in zip(list(songs['artname']), list(songs['traname']))] # temporary song_list song_content = [('Underworld', 'Boy, Boy, Boy'), ('Underworld', 'Crocodile'), ('Led Zeppelin', 'Stairway to heaven'),
if __name__ == "__main__":
    # Script entry point: build a user-song matrix from the Last.fm training
    # split, run the user-based predictor on it and print the top
    # recommendations for the first user, with coarse timing information.

    # Local import: only this entry point needs it.
    import sys

    start_time = time.time()

    # Print arrays without summarisation. NaN is not a valid threshold
    # (NumPy raises "threshold must be non-NAN"); sys.maxsize is the
    # documented value for "never truncate".
    np.set_printoptions(threshold=sys.maxsize)

    vectorizer = DictVectorizer()

    # reset file reader
    chunks = file_io.read_lastfm_user_art_file("data/halfid_20%_train.tsv")

    # Songs handed to the preprocessor as its valid-song list.
    # NOTE(review): the original also had a dead `valid_songs = []`
    # ("don't filter with valid songs") that was immediately overwritten;
    # presumably an empty list disables filtering -- confirm against
    # Preprocessor before relying on that.
    valid_songs = file_io.get_all_valid_songs(
        "data/song_word2vec_whole_truncate_60000_new.csv"
    )

    pre = Preprocessor(chunks, vectorizer, valid_songs)
    # NOTE(review): hands the same reader back right after construction;
    # kept as in the original -- confirm whether it is still required.
    pre.reset_file_reader(chunks)

    # read user song mapping
    pre.read_user_songs(3000000)

    # convert to user-song matrix
    X = pre.get_user_song_matrix()
    print("non zeros: {0}".format(X.count_nonzero()))
    print("pre-processed in {0:.2f} sec".format(time.time() - start_time))

    # Optional clustering step, currently disabled.
    #cluster_cf.cluster_usr(X, k=5)
    # Printed a second time so the count can be compared once the
    # clustering step above is re-enabled.
    print("non zeros: {0}".format(X.count_nonzero()))

    pred = recommendation.predict_by_user(X)
    # Alternative predictor, currently disabled.
    #pred = recommendation.predict_by_factorize(X)
    recommended = recommendation.recommend_all(X, pred, masked=False)

    recommend_numbers = 5
    songs = pre.get_songs_by_indices(recommended, recommend_numbers)
    print("user 1 top recommended songs: {0}".format(songs[0]))

    # Elapsed time is measured from start_time, so this figure is cumulative
    # (it includes the pre-processing time reported above).
    print("predict in {0:.2f} sec".format(time.time() - start_time))