def test():
    """Scratch driver that prints the results of a few helper calls.

    Exercises ``get_table_of_confussion``, ``get_marks``,
    ``is_same_interval`` and ``get_label`` on fixed inputs, and prints a
    couple of NumPy indexing experiments. Returns nothing; everything goes
    to stdout.
    """
    # Confusion table for a fixed actual/predicted pair.
    actual = [0, 0, 1, 1, 0]
    predicted = [0, 1, 0, 1, 1]
    print("table:", get_table_of_confussion(actual, predicted))

    # Column 2 of every row from row 2 onwards.
    grid = np.arange(15).reshape(5, 3)
    tail_of_last_column = grid[2:, 2]
    print(grid)
    print(tail_of_last_column)

    marks = get_marks(count=10, lower=0, upper=1)
    print('marks:', marks)

    # Two values near the opposite ends of the [0, 1] range.
    low = 0.00000001
    high = 0.999999
    same = is_same_interval(low, high, marks)
    print("is_same_interval({}, {}): {}".format(low, high, same))

    # A loop variable keeps its last value after the loop finishes.
    last_index = range(10)[-1]
    print("i:", last_index)

    print("label:", get_label(1.2341, marks))

    # Single element: row 2, last column.
    grid = np.arange(15).reshape(5, 3)
    corner = grid[2, -1]
    print(grid)
    print(corner)

    # Augmented assignment with a conditional expression on the right.
    step = 1 if True else 0
    value = 0
    value -= step
    print(value)
def predict_bivar_judge_with_error(in_file, in_filename, out_address):
    """Train a two-stage skip predictor on one listening log and save a report.

    Each row is reduced to two numeric features (a fold of the song field
    and a fold of the artist field) plus the ``percentage`` target. Rows
    whose percentage is greater than 1 are dropped. The first two thirds of
    the rows are used for training and the rest for testing.

    Stage 1 fits a ``RandomForestRegressor`` on a binary target: the label
    returned by ``get_label`` when it equals the highest label (Non-Skip),
    otherwise 0 (Skip). Its predictions are thresholded at half of the
    highest label. Stage 2 fits a ``RandomForestClassifier`` on the
    training rows that are not Non-Skip and predicts the label of every
    test row that stage 1 predicted as Skip.

    NOTE: this file defines the same function name again further down; the
    later definition replaces this one when the module is imported.

    Args:
        in_file: Path of a tab-separated file; every line must have exactly
            8 fields (userid, lt, tt, percentage, artid, artist, traid,
            song).
        in_filename: File name used to build the report name; its last 9
            characters are dropped (presumably a ``_time.tsv`` suffix --
            confirm against the caller).
        out_address: Prefix for the report path. It is concatenated as-is,
            so it must already end with a path separator if it is a
            directory.

    Returns:
        None. The report is printed and written to
        ``out_address + in_filename[:-9] + '_bi_predict_with_partition.txt'``.
        If a training or prediction step fails, a message is printed and
        the function returns without writing the file.

    Raises:
        ValueError: If a line does not have exactly 8 fields or its
            percentage is not a number.
    """

    def _xor_fold(text):
        # Every character contributes the digits of its binary code point,
        # read back as a base-10 integer (int() is called without a base),
        # and the contributions are XOR-ed together.
        folded = 0
        for ch in text:
            folded ^= int(bin(ord(ch))[2:])
        return float(folded) / 100

    rows = []
    with open(in_file, 'r') as f:
        for line in f:
            # The line is not stripped, so the last field (song) keeps its
            # trailing newline and that character takes part in the fold.
            userid, lt, tt, percentage, artid, artist, traid, song = \
                line.split('\t')
            pct = float(percentage)
            if pct > 1:
                continue
            rows.append([_xor_fold(song), _xor_fold(artist), pct])

    data = np.asarray(rows, dtype='float')

    ###################################################
    # 1st model training (Non-Skip vs Skip)
    ###################################################
    train_end = int(np.floor(data.shape[0] * 2 / 3))
    marks = get_marks(count=10, lower=0, upper=1)
    label_max = len(marks) - 1  # highest label means Non-Skip

    zero_y = data.copy()
    for row in zero_y:  # rows are views, so the assignment is in place
        is_non_skip = get_label(row[2], marks) == label_max
        row[2] = label_max if is_non_skip else 0  # 0 means Skip

    estimator = RandomForestRegressor(n_estimators=100)
    try:
        estimator.fit(data[:train_end, :2], y=zero_y[:train_end, 2])
    except Exception as exc:  # best effort: report and give up on this file
        print("Exception: the 1st training failed.", exc)
        return

    ###################################################
    # 2nd model training (exact label of skipped rows)
    ###################################################
    data_labeled = data.copy()
    for row in data_labeled:
        row[2] = get_label(row[2], marks)

    # The 2nd model must not see Non-Skip rows.
    train_index2 = [i for i in range(train_end)
                    if data_labeled[i, 2] != label_max]
    estimator2 = RandomForestClassifier(n_estimators=100)
    try:
        estimator2.fit(data_labeled[train_index2, :2],
                       y=data_labeled[train_index2, 2])
    except Exception as exc:
        print("Exception: the 2st training failed.", exc)
        return

    ###################################################
    # 1st prediction
    ###################################################
    test_start = train_end
    test_end = data.shape[0]
    try:
        result1 = estimator.predict(data[test_start:test_end, :2])
    except Exception as exc:
        print("Exception: the 1st prediction failed.", exc)
        return

    # Turn the regression output into the two labels.
    non_skip_mask = result1.astype(float) >= label_max / 2
    result1 = np.where(non_skip_mask, label_max, 0).astype(float)

    ###################################################
    # Metrics of the 1st prediction
    ###################################################
    A = zero_y[test_start:test_end, 2]
    P = result1
    predicted_non_skip = P == label_max
    good_count1 = int(np.count_nonzero(predicted_non_skip))
    true_count1 = int(np.count_nonzero(predicted_non_skip & (A == P)))

    tc = get_table_of_confussion(A, P)
    tp = tc['TP']  # True Positive
    tn = tc['TN']  # True Negative
    fp = tc['FP']  # False Positive
    fn = tc['FN']  # False Negative
    accuracy = (tp + tn) / len(P)
    # -1 marks a metric whose denominator is zero.
    precision = -1 if (tp == 0 and fp == 0) else tp / (tp + fp)
    recall = -1 if tp + fn == 0 else tp / (tp + fn)
    f1_score = -1 if 2 * tp + fp + fn == 0 else 2 * tp / (2 * tp + fp + fn)

    tmp_str = '0-1 Judge: Accuracy: ' + str(accuracy) + '\n' + \
        'Precision: ' + str(precision) + '\n' + \
        'Recall: ' + str(recall) + '\n' + \
        'F1 Score: ' + str(f1_score)
    print(tmp_str)
    discript = tmp_str

    ###################################################
    # 2nd prediction (only rows predicted as Skip)
    ###################################################
    test_index2 = [test_start + i for i, pred in enumerate(result1)
                   if pred != label_max]
    try:
        result2 = estimator2.predict(data_labeled[test_index2, :2])
    except Exception as exc:
        print("Exception: the 2nd prediction failed.", exc)
        return

    ###################################################
    # Combined metrics
    ###################################################
    A = data_labeled[test_index2, 2]
    P = result2
    true_count2 = int(np.count_nonzero(A == P))
    # A predicted label at or above the actual one counts as "good".
    good_count2 = int(np.count_nonzero(P >= A))

    test_amount = test_end - test_start
    accuracy_all = (true_count1 + true_count2) / test_amount
    tmp_str = "Accuracy of all: {}".format(accuracy_all)
    print(tmp_str)
    discript += '\n' + tmp_str

    user_exp = (good_count1 + good_count2) / test_amount
    tmp_str = "User experience: {}".format(user_exp)
    print(tmp_str)
    discript += '\n' + tmp_str

    ###################################################
    # Save the report
    ###################################################
    file_name = in_filename[:-9]
    out_file_text = out_address + file_name + '_bi_predict_with_partition.txt'
    with open(out_file_text, 'w') as output:
        output.write(discript)
def predict_bivar_judge_with_error(in_file, in_filename, out_address):
    """Train a two-stage skip predictor on one listening log and save a report.

    Each row is reduced to two numeric features (a fold of the song field
    and a fold of the artist field) plus the ``percentage`` target. The
    first two thirds of the rows are used for training and the rest for
    testing.

    Stage 1 fits a ``RandomForestRegressor`` on a binary target (1 where
    the percentage equals 1, i.e. Non-Skip, otherwise 0) and thresholds its
    predictions at 0.5. Stage 2 replaces every percentage by the label
    returned by ``get_label``, fits a ``RandomForestClassifier`` on the
    training rows, and predicts the label of every test row that stage 1
    did not predict as Non-Skip.

    NOTE: this file defines the same function name again further down; the
    later definition replaces this one when the module is imported.

    Args:
        in_file: Path of a tab-separated file; every line must have exactly
            8 fields (userid, lt, tt, percentage, artid, artist, traid,
            song).
        in_filename: File name used to build the report name; its last 9
            characters are dropped (presumably a ``_time.tsv`` suffix --
            confirm against the caller).
        out_address: Prefix for the report path. It is concatenated as-is,
            so it must already end with a path separator if it is a
            directory.

    Returns:
        None. The report is printed and written to
        ``out_address + in_filename[:-9] + '_bi_predict_with_partition.txt'``.
        If a prediction step fails, a message is printed and the function
        returns without writing the file.

    Raises:
        ValueError: If a line does not have exactly 8 fields or its
            percentage is not a number.
    """

    def _xor_fold(text):
        # Every character contributes the digits of its binary code point,
        # read back as a base-10 integer (int() is called without a base),
        # and the contributions are XOR-ed together.
        folded = 0
        for ch in text:
            folded ^= int(bin(ord(ch))[2:])
        return float(folded) / 100

    rows = []
    with open(in_file, 'r') as f:
        for line in f:
            # The line is not stripped, so the last field (song) keeps its
            # trailing newline and that character takes part in the fold.
            userid, lt, tt, percentage, artid, artist, traid, song = \
                line.split('\t')
            rows.append(
                [_xor_fold(song), _xor_fold(artist), float(percentage)])

    data = np.asarray(rows, dtype='float')

    ###################################################
    # 1st model training (Non-Skip vs Skip)
    ###################################################
    zero_y = data.copy()
    zero_y[data[:, 2] != 1, 2] = 0  # 1 means Non-Skip, 0 means Skip

    train_end = int(np.floor(data.shape[0] * 2 / 3))
    estimator = RandomForestRegressor(n_estimators=100)
    estimator.fit(data[:train_end, :2], y=zero_y[:train_end, 2])

    ###################################################
    # 1st prediction
    ###################################################
    test_start = train_end
    test_end = data.shape[0]
    try:
        result = estimator.predict(data[test_start:test_end, :2])
    except Exception as exc:  # best effort: report and give up on this file
        print("Exception: the 1st prediction failed.", exc)
        return

    # Turn the regression output into the two labels.
    non_skip_mask = result.astype(float) >= 0.5
    result = np.where(non_skip_mask, 1, 0).astype(float)

    ###################################################
    # Metrics of the 1st prediction
    ###################################################
    # Debug trace of every test row: actual label next to the prediction.
    for offset, item in enumerate(result):
        print("@95 zero_y[{},2]: {}".format(offset + test_start,
                                            zero_y[offset + test_start, 2]),
              "item:", item)

    A = zero_y[test_start:, 2]
    P = result
    tc = get_table_of_confussion(A, P)
    tp = tc['TP']  # True Positive
    tn = tc['TN']  # True Negative
    fp = tc['FP']  # False Positive
    fn = tc['FN']  # False Negative
    accuracy = (tp + tn) / len(P)
    # -1 marks a metric whose denominator is zero, instead of raising
    # ZeroDivisionError when nothing is predicted or labelled positive.
    precision = -1 if (tp == 0 and fp == 0) else tp / (tp + fp)
    recall = -1 if tp + fn == 0 else tp / (tp + fn)
    f1_score = -1 if 2 * tp + fp + fn == 0 else 2 * tp / (2 * tp + fp + fn)

    tmp_str = '0-1 Judge: Accuracy: ' + str(accuracy) + '\n' + \
        'Precision: ' + str(precision) + '\n' + \
        'Recall: ' + str(recall) + '\n' + \
        'F1 Score: ' + str(f1_score)
    print(tmp_str)
    discript = tmp_str

    ###################################################
    # 2nd model: predict the skip label
    ###################################################
    marks = get_marks(count=10, lower=0, upper=1)
    labeled = data.copy()
    for row in labeled:  # rows are views, so the assignment is in place
        row[2] = get_label(row[2], marks)

    estimator2 = RandomForestClassifier(n_estimators=100)
    estimator2.fit(labeled[:train_end, :2], y=labeled[:train_end, 2])

    # Only the test rows that stage 1 did not predict as Non-Skip.
    index = [test_start + offset
             for offset, is_non_skip in enumerate(non_skip_mask)
             if not is_non_skip]
    try:
        result2 = estimator2.predict(labeled[index, :2])
    except Exception as exc:
        print("Exception: the 2nd prediction failed.", exc)
        return

    ###################################################
    # Metrics of the 2nd prediction
    ###################################################
    A = labeled[index, 2]
    P = result2
    test_amount = len(P)

    true_count = int(np.count_nonzero(A == P))
    accuracy = true_count / test_amount
    tmp_str = "Precision of skip judge: {}%".format(accuracy * 100)
    print(tmp_str)
    discript += '\n' + tmp_str

    # A predicted label at or above the actual one counts as "good".
    good_count = int(np.count_nonzero(P >= A))
    user_exp = good_count / test_amount
    tmp_str = "User experience: {}%".format(user_exp * 100)
    print(tmp_str)
    discript += '\n' + tmp_str

    ###################################################
    # Save the report
    ###################################################
    file_name = in_filename[:-9]
    out_file_text = out_address + file_name + '_bi_predict_with_partition.txt'
    with open(out_file_text, 'w') as output:
        output.write(discript)
def predict_bivar_judge_with_error(in_file, in_filename, out_address):
    """Fit the Non-Skip/Skip classifier and report its fit on the training set.

    Each row is reduced to three numeric features (a fold of the song
    field, a fold of the artist field, and the value returned by
    ``get_start_time_label`` for the start field) plus the ``percentage``
    target in the last column. Rows whose percentage is greater than 1 are
    dropped.

    A ``RandomForestClassifier`` is fitted on the first two thirds of the
    rows against a binary target: the label returned by ``get_label`` when
    it equals the highest label (Non-Skip), otherwise 0 (Skip). The model
    is then evaluated on those same training rows.

    Args:
        in_file: Path of a tab-separated file; every line must have exactly
            9 fields (userid, start, lt, tt, percentage, artid, artist,
            traid, song).
        in_filename: File name used to build the report name; only its
            first 11 characters are kept.
        out_address: Prefix for the report path. It is concatenated as-is,
            so it must already end with a path separator if it is a
            directory.

    Returns:
        None. The report is printed and written to
        ``out_address + in_filename[:11] + '_training_set1_tc.txt'``. If
        training or prediction fails, a message is printed and the function
        returns without writing the file.

    Raises:
        ValueError: If a line does not have exactly 9 fields or its
            percentage is not a number.
    """

    def _xor_fold(text):
        # Every character contributes the digits of its binary code point,
        # read back as a base-10 integer (int() is called without a base),
        # and the contributions are XOR-ed together.
        folded = 0
        for ch in text:
            folded ^= int(bin(ord(ch))[2:])
        return float(folded) / 100

    rows = []
    with open(in_file, 'r') as f:
        for line in f:
            # The line is not stripped, so the last field (song) keeps its
            # trailing newline and that character takes part in the fold.
            (userid, start, lt, tt, percentage,
             artid, artist, traid, song) = line.split('\t')
            pct = float(percentage)
            if pct > 1:
                continue
            start_label = get_start_time_label(start)
            rows.append([_xor_fold(song), _xor_fold(artist),
                         float(start_label), pct])

    data = np.asarray(rows, dtype='float')

    ###################################################
    # 1st classifier training
    ###################################################
    train_end = int(np.floor(data.shape[0] * 2 / 3))
    marks = get_marks(count=10, lower=0, upper=1)
    label_max = len(marks) - 1  # highest label means Non-Skip

    zero_y = data.copy()
    for row in zero_y:  # rows are views, so the assignment is in place
        is_non_skip = get_label(row[-1], marks) == label_max
        row[-1] = label_max if is_non_skip else 0  # 0 means Skip

    estimator = RandomForestClassifier(n_estimators=100)
    try:
        estimator.fit(data[:train_end, :-1], y=zero_y[:train_end, -1])
    except Exception as exc:  # best effort: report and give up on this file
        print("Exception: the 1st training failed.", exc)
        return

    ###################################################
    # Prediction on the training set itself
    ###################################################
    try:
        result_training1 = estimator.predict(zero_y[:train_end, :-1])
    except Exception as exc:
        print("Exception: the testing for 1st training set failed.", exc)
        return

    ###################################################
    # Confusion-table metrics for the training set
    ###################################################
    A = zero_y[:train_end, -1]
    P = result_training1
    tc = get_table_of_confussion(A, P)
    tp = tc['TP']  # True Positive
    tn = tc['TN']  # True Negative
    fp = tc['FP']  # False Positive
    fn = tc['FN']  # False Negative
    accuracy = (tp + tn) / len(P)
    # -1 marks a metric whose denominator is zero.
    precision = -1 if (tp == 0 and fp == 0) else tp / (tp + fp)
    recall = -1 if tp + fn == 0 else tp / (tp + fn)
    f1_score = -1 if 2 * tp + fp + fn == 0 else 2 * tp / (2 * tp + fp + fn)

    tmp_str = 'Training set1: Accuracy: ' + str(accuracy) + '\n' + \
        'Precision: ' + str(precision) + '\n' + \
        'Recall: ' + str(recall) + '\n' + \
        'F1 Score: ' + str(f1_score)
    print(tmp_str)
    discript = tmp_str

    ###################################################
    # Save the report
    ###################################################
    file_name = in_filename[:11]
    out_file_text = out_address + file_name + '_training_set1_tc.txt'
    with open(out_file_text, 'w') as output:
        output.write(discript)