def evaluate_minhash(self, n, p): mean_accuracy = 0 headers = [] for i in range(1, self.n_folds + 1): headers.append("fold {}".format(i)) headers.append("Mean") accuracy_list = [] for i in range(1, self.n_folds + 1): indices = self.data_folds[i] training = self.input.drop(self.input.index[indices]) training_y = self.output.drop(self.output.index[indices]) test = self.input.loc[self.input.index[indices], :] test_y = self.output.loc[self.output.index[indices], :] lsh = LSH.MinHash(training, training_y, n, p) lsh.train() lsh.predict(test, 5, 1) correct, counter, accuracy = lsh.accuracy(test_y) accuracy_list.append(accuracy) mean_accuracy += accuracy accuracy_list.append(float(mean_accuracy) / self.n_folds) accuracy_table = pd.DataFrame([accuracy_list], columns=headers) accuracy_table = accuracy_table.rename(index={0: "result"}) print(accuracy_table) return accuracy_table
def evaluate_minhash(self, b, r): """ evaluates Min-hash """ mean_accuracy = 0 mean_coverage = 0 headers = [] for i in range(1, self.n_folds + 1): headers.append("fold {}".format(i)) headers.append("Mean") accuracy_list = [] coverage_list = [] for i in range(1, self.n_folds + 1): print("fold {}".format(i)) d = [] p = [] indices = self.data_folds[i] training = self.input.drop(self.input.index[indices]) training_y = self.output.drop(self.output.index[indices]) test = self.input.loc[self.input.index[indices], :] test_y = self.output.loc[self.output.index[indices], :] '''train without fold i''' lsh = LSH.MinHash(training, training_y, b, r) lsh.train() '''test on fold i''' courses = lsh.predict(test) '''calculate rmse''' rmse = lsh.accuracy(test_y, d, p) accuracy_list.append(rmse) mean_accuracy += rmse '''calculate coverage. We have defined coverage as follows: coverage = # of unique items we have recommended on the test set / # of all items ''' for item in courses: if item not in self.recommended: self.recommended.append(item) c = len(self.recommended) / float(lsh.item_num) mean_coverage += c coverage_list.append(c) coverage_list.append(float(mean_coverage) / self.n_folds) accuracy_list.append(float(mean_accuracy) / self.n_folds) accuracy_table = pd.DataFrame([accuracy_list, coverage_list], columns=headers) accuracy_table = accuracy_table.rename(index={ 0: "RMSE", 1: "Coverage" }) print(accuracy_table) return accuracy_table
def main(): df = pd.read_csv("old.csv", names=['user', 'rating']) df2 = pd.read_csv("new.csv", names=['user', 'rating']) df['rating'] = df.apply(lambda row: ast.literal_eval(row['rating']), axis=1) df2['rating'] = df2.apply(lambda row: ast.literal_eval(row['rating']), axis=1) item_column = [] item_column2 = [] for i in range(df.shape[0]): item_column.append(list(df.iloc[i]['rating'])) for i in range(df2.shape[0]): item_column2.append(list(df2.iloc[i]['rating'])) df['item'] = pd.Series(item_column) df2['item'] = pd.Series(item_column2) # df['item'] = df.apply(lambda row: list(row['rating'].keys()), axis=1) output = df[['rating']] input = df[['item']] input2 = df2[['item']] # print('data loaded') # data = Accuracy.CrossValidate(input, output, n_folds=5) # data.split() # print('data preprocessed') # tuned_param = list() # for i in range(4, 5): # for j in range(3, 4): # print(i, j) # accuracy = data.evaluate_minhash(i, j) # mean_score = accuracy['Mean'][0] # if len(tuned_param) == 0: # tuned_param = [i, j, mean_score] # elif tuned_param[2] > mean_score: # tuned_param = [i, j, mean_score] # print("best param: ", tuned_param[0], tuned_param[1]) nrows = input.shape[0] numbers = list(range(nrows)) each_fold_size = math.floor(float(nrows) / 5) indices = np.random.choice(numbers, each_fold_size, replace=False).tolist() nrows2 = input2.shape[0] numbers2 = list(range(nrows2)) each_fold_size2 = math.floor(float(nrows2) / 5) indices2 = np.random.choice(numbers2, each_fold_size2, replace=False).tolist() training = input.drop(input.index[indices]) training_y = output.drop(output.index[indices]) test1 = input.loc[input.index[indices], :] test2 = input2.loc[input2.index[indices2], :] lsh = LSH.MinHash(training, training_y, 4, 3) lsh.train() '''test on fold i''' courses = lsh.predict(test1) courses2 = lsh.predict(test2) counter = 0 for i in courses2: if i not in courses: counter += 1 print(counter / float(len(courses2)))