# --- Fragment of a __main__ preprocessing script (continues from earlier, unseen lines). ---
# Combine the title-based and description-based item similarity matrices into a
# single item similarity matrix, using mixing weights (theta1, theta2) fitted by
# a MATLAB nonlinear regression, then slice the result into train/test
# sub-matrices and allocate the user-cluster rating matrices filled later.
# NOTE(review): `title_sim_matrix`, `iu_rating_matrix_train`,
# `iu_rating_matrix_test` and the `finput_*`/`foutput_*` paths are defined
# earlier in this script, outside the visible fragment.
description_sim_matrix = rw.readffile(finput_description_sim_matrix)
user_cluster_set = rw.readffile(finput_user_cluster_set)
train_item_id = rw.readffile(finput_train_item_id)
test_item_id = rw.readffile(finput_test_item_id)

# run matlab script and get parameters for title and description
print("call matlab script....")
cur_path = os.getcwd()
# BUGFIX: the original "D:\GitCode\..." literal relied on invalid escape
# sequences (\G, \D, \S) — a DeprecationWarning today and a SyntaxError in
# future Python. The raw string below has the exact same runtime value.
# TODO(review): hard-coded absolute path — should come from config/CLI.
os.chdir(r"D:\GitCode\Dissertation\Step1-Preprocessing")
eng = matlab.engine.start_matlab()
# my_fitnlm returns (theta1, theta2, RMSE) — nargout=3 requests all three.
x = eng.my_fitnlm(finput_nonlinreg, finput_init_tp, finput_init_dp, nargout=3)
theta1, theta2, RMSE = x[0], x[1], x[2]
eng.quit()
# Weighted blend of the two per-source similarity matrices.
sim_matrix = theta1 * title_sim_matrix + theta2 * description_sim_matrix
os.chdir(cur_path)
rw.write2file(sim_matrix, foutput_item_sim_matrix)
print("theta1 = ", theta1)
print("theta2 = ", theta2)
print("RMSE = ", RMSE)
print("matlab finished")

# extract similarity matrix for training and test item
# resort_id = list(train_item_id.keys()) + list(test_item_id.keys())
sim_matrix_train = sim_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())].values
sim_matrix_test = sim_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())].values

# user cluster - item rating matrix (zero-initialized; populated later in the script)
iuclst_rating_matrix_train = np.zeros((len(train_item_id), len(user_cluster_set)))
iuclst_rating_matrix_test = np.zeros((len(test_item_id), len(user_cluster_set)))
item_in_node_train = list(range(iu_rating_matrix_train.shape[0]))
item_in_node_test = list(range(iu_rating_matrix_test.shape[0]))
# --- Fragment: continuation of the item-user rating-matrix build script. ---
# NOTE(review): this chunk begins MID-STATEMENT — the leading
# "user_id_dict[row['reviewerID']]] = int(row['overall'])" is the tail of an
# assignment into iu_matrix_train opened on an earlier, unseen line, and the
# source newlines/indentation were stripped, so the code is kept byte-identical.
# What it does, read left to right: finishes the training-matrix fill loop,
# converts iu_matrix_train to a CSR sparse matrix, prints its density and saves
# it via scipy.sparse.save_npz(foutput1, ...); then builds the test item-user
# matrix (int8 dense, rows = test items, cols = users), skipping reviewers not
# present in user_id_dict, converts/saves it to foutput2; finally persists the
# user / train-item / test-item id dictionaries with rw.write2file.
# NOTE(review): "lenght" is a typo for "length" (local variable only — renaming
# would be a code change, left as-is here).
# NOTE(review): whether "cnt += 1" sits inside the `if` or at loop level cannot
# be recovered from the collapsed text; the progress print suggests loop level.
user_id_dict[row['reviewerID']]] = int(row['overall']) cnt += 1 iu_sparse_matrix_train = scipy.sparse.csr_matrix(iu_matrix_train) print( "density of iu train matrix is: %.4f%%" % (100 * len(find(iu_sparse_matrix_train)[0]) / (iu_sparse_matrix_train.shape[0] * iu_sparse_matrix_train.shape[1]))) scipy.sparse.save_npz(foutput1, iu_sparse_matrix_train) # test iu_matrix_test = np.zeros((test_row, col), dtype=np.int8) cnt = 0 lenght = df_test.shape[0] for index, row in df_test.iterrows(): print("iu test matrix: %d / %d" % (cnt, lenght), end="\r") if row['reviewerID'] in user_id_dict.keys(): iu_matrix_test[test_item_id_dict[row['asin']], user_id_dict[row['reviewerID']]] = int( row['overall']) cnt += 1 iu_sparse_matrix_test = scipy.sparse.csr_matrix(iu_matrix_test) print("density of iu test matrix is: %.4f%%" % (100 * len(find(iu_sparse_matrix_test)[0]) / (iu_sparse_matrix_test.shape[0] * iu_sparse_matrix_test.shape[1]))) scipy.sparse.save_npz(foutput2, iu_sparse_matrix_test) print("iu matrix generated done!") # write uid, train_item_id and test_item_id into files rw.write2file(user_id_dict, foutput_uid) rw.write2file(train_item_id_dict, foutput_train_item_id) rw.write2file(test_item_id_dict, foutput_test_item_id) print("write done!")
import read2df as rdf
import read_write as rw
'''
Extract per-item title and description dictionaries from an Amazon metadata dump.

Input:
    input path ("../Dataset/All_Beauty/meta_All_Beauty.json.gz")
    output path ("Data/title" && "Data/description")
output:
    files
'''
if __name__ == '__main__':
    #### data path (CLI: metadata file, title output, description output)
    finput = sys.argv[1]
    foutput_title = sys.argv[2]
    foutput_description = sys.argv[3]

    #### read data into dataframe
    df = rdf.getDF(finput)

    #### delete rows where title or description is nan
    subdf = df[~(df['title'].isin([np.nan]) | df['description'].isin([np.nan]))]

    # Build asin -> title and asin -> description maps. zip over the columns is
    # a single O(n) pass and replaces the original per-row chained
    # subdf.loc[indexs]['asin'] lookups (slow label-based indexing, and a
    # chained-indexing smell). On duplicate asin values the last row wins,
    # exactly as in the original assignment loop.
    dict_title = dict(zip(subdf['asin'], subdf['title']))
    dict_description = dict(zip(subdf['asin'], subdf['description']))

    #### write generated dictionary into files
    rw.write2file(dict_title, foutput_title)
    rw.write2file(dict_description, foutput_description)
    print("Write Done!")
    # kept rows / total rows
    print("Info: %d/%d" % (subdf.shape[0], df.shape[0]))
# --- Fragment: user-user similarity computation (source newlines restored;
# every code token is unchanged from the original collapsed line). ---
# Mean-center each user's observed ratings, then compute, column-pair by
# column-pair, a Pearson-style similarity restricted to co-rated items.
# Layout (inferred from the axis usage): rows = items, columns = users;
# axis=0 sums therefore aggregate over items for each user.
# NOTE(review): a user column with no nonzero ratings makes
# np.sum(rating_matrix_train != 0, axis=0) zero — division emits a warning and
# yields NaN for that column. Presumably such users were filtered upstream;
# TODO confirm.
rating_matrix_train = (
    rating_matrix_train - np.sum(rating_matrix_train, axis=0) /
    np.sum(rating_matrix_train != 0, axis=0)) * (rating_matrix_train != 0)
# Element-wise squares, reused for every denominator below.
rating_matrix_train_2 = rating_matrix_train**2
# user_similarity_matrix = np.dot(rating_matrix_train.T, rating_matrix_train) / (np.dot(rating_matrix_train_2.T, rating_matrix_train_2)**0.5 + 1e-9)
row_num = rating_matrix_train.shape[0]
col_num = rating_matrix_train.shape[1]
user_similarity_matrix = np.zeros((col_num, col_num))
# "nominatorM" = numerator matrix (typo kept): all pairwise dot products of
# centered user columns, computed once up front.
nominatorM = np.dot(rating_matrix_train.T, rating_matrix_train)
print("nominator done!")
cnt = 0
for i in range(col_num):
    cnt += 1
    print("progress: %d / %d" % (cnt, col_num), end="\r")
    # flag[r, j] is True iff item r was rated by BOTH user i and user j:
    # restricts each denominator to co-rated items only.
    flag = ((rating_matrix_train[:, i] != 0).reshape(
        row_num, 1)) * (rating_matrix_train != 0)
    # Row i of the similarity matrix: numerator / (||x_i|| * ||x_j||) where the
    # norms are taken over the co-rated support; 1e-9 guards the zero-overlap case.
    user_similarity_matrix[i] = nominatorM[i] / (
        (np.dot(rating_matrix_train_2[:, i].T, flag)**0.5) *
        (np.sum(rating_matrix_train_2 * flag, axis=0)**0.5) + 1e-9)
    # or it will be 0 for some users
# np.fill_diagonal(user_similarity_matrix, 1)
print("\ndone!")
# transfer to dataframe and save to file
# rw.write2file(user_similarity_matrix, "Data/test")
# Label rows/columns with the user ids (uid dict comes from earlier, unseen code).
df_user_similarity_matrix = pd.DataFrame(user_similarity_matrix,
                                         index=list(uid.keys()),
                                         columns=list(uid.keys()))
# Free the large ndarray before writing; the DataFrame holds its own reference.
del user_similarity_matrix
rw.write2file(df_user_similarity_matrix, foutput_user_similarity)
print("file saved done!")
# --- Fragment: LDA-based similarity script. ---
# NOTE(review): this chunk begins MID-CALL — the leading
# "num_topics=finput_topic_num)" closes an lda.LDA(...) call for the TITLE
# similarity that was opened on an earlier, unseen line; the source newlines
# were stripped, so the code is kept byte-identical.
# What it does: computes description similarity with the same lda.LDA helper,
# wraps both similarity lists in square DataFrames indexed by item_tt_id_lst on
# both axes, and writes them to foutput_title_similarity /
# foutput_description_similarity via rw.write2file. The commented-out section
# shows an abandoned train/test slicing step (now done in a later script).
num_topics=finput_topic_num) description_similarity = lda.LDA(texts=list_description_preprocessed, index_lst=index_lst, num_topics=finput_topic_num) print("lda similarity calculated done!") #### generate train/test item similarity matrix df_title_similarity_matrix = pd.DataFrame(np.array(title_similarity), index=item_tt_id_lst, columns=item_tt_id_lst) df_description_similarity_matrix = pd.DataFrame( np.array(description_similarity), index=item_tt_id_lst, columns=item_tt_id_lst) # train_item_id = rw.readffile(finput_train_item_id) # test_item_id = rw.readffile(finput_test_item_id) # #### title/train # df_title_similarity_matrix_train = df_title_similarity_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())] # #### title/test # df_title_similarity_matrix_test = df_title_similarity_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())] # #### description/train # df_description_similarity_matrix_train = df_description_similarity_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())] # #### description/test # df_description_similarity_matrix_test = df_description_similarity_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())] print("similarity matrix generated done!") #### write data into files rw.write2file(df_title_similarity_matrix, foutput_title_similarity) rw.write2file(df_description_similarity_matrix, foutput_description_similarity) print("file saved done!")
import k_medoids as km
'''
Cluster users with k-medoids on a precomputed user-user similarity matrix.

finput_user_similarity = "Data/user_similarity_matrix"
finput_cluster_number = 200
foutput_user_cluster_set = "Data/user_cluster_set"
'''
if __name__ == '__main__':
    # data path / parameters (CLI: similarity matrix file, K, output file)
    finput_user_similarity = sys.argv[1]
    finput_cluster_number = int(sys.argv[2])
    foutput_user_cluster_set = sys.argv[3]

    # read into user similarity matrix (a DataFrame — .values feeds the array
    # to the clustering routine)
    user_similarity_matrix = rw.readffile(finput_user_similarity)

    # k-medoids clustering; result is a list of clusters (lists of members)
    user_cluster_set = km.k_medoids(user_similarity_matrix.values,
                                    K=finput_cluster_number,
                                    max_iterations=20)
    print("\ndone!")
    rw.write2file(user_cluster_set, foutput_user_cluster_set)
    print("file saved done!")

    # Report the sizes of the largest 20% of clusters. sorted() over a
    # generator replaces the original append-loop + in-place list.sort()
    # (same values, same descending order).
    print("top 20% of user cluster:")
    length = sorted((len(lst) for lst in user_cluster_set), reverse=True)
    print(length[0:int(len(length) * 0.2)])