def process(person_list):
    # Pairwise cosine similarity between users: dot product over the shared
    # items, divided by the two rating-vector norms in len_list.
    file_count = 0
    for tmp_i in range(0, all_person):
        for tmp_j in range(0, all_person):
            # Iterate over the smaller dict of the two for speed.
            flag = len(person_list[tmp_i]) <= len(person_list[tmp_j])
            cos_sum = 0
            sim = 0
            if len_list[tmp_i] != 0 and len_list[tmp_j] != 0:
                if flag:
                    for key in person_list[tmp_i].keys():
                        if key in person_list[tmp_j]:
                            cos_sum += person_list[tmp_i][key] * person_list[tmp_j][key]
                else:
                    for key in person_list[tmp_j].keys():
                        if key in person_list[tmp_i]:
                            cos_sum += person_list[tmp_i][key] * person_list[tmp_j][key]
                sim = cos_sum / (len_list[tmp_i] * len_list[tmp_j])
            sim_list.append(sim)
        if tmp_i % 100 == 99:
            # Flush the similarities of every 100 users to their own file.
            files.writepkl('data/train_res%d.pkl' % file_count, sim_list)
            file_count += 1
            sim_list.clear()
            print((tmp_i, tmp_j, sim))
    files.writepkl('data/train_res%d.pkl' % file_count, sim_list)
    print("all files finished")
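# `all_person`, `len_list`, and `sim_list` are module-level globals not shown in
# this excerpt. Since process() divides the dot product by
# len_list[tmp_i] * len_list[tmp_j] to obtain a cosine similarity, len_list is
# presumably the Euclidean norm of each user's rating vector. A minimal sketch
# of how they could be built, assuming person_list is the list of {item: score}
# dicts produced by initial(train_stat) further below (an assumption, not shown
# in the source):
import math

person_list = initial(train_stat)
all_person = len(person_list)
sim_list = []  # flushed to disk every 100 users by process()
len_list = [math.sqrt(sum(v * v for v in p.values())) for p in person_list]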
def loadtrain_xy():
    # loadtrain()
    train = files.readpkl("pro_data/train.pkl")
    train_X, train_Y = [], []
    for i in range(len(train)):
        train_X.append(train[i][:, :-1])
        train_Y.append(train[i][:, -1])
    files.writepkl('pro_data/train_X.pkl', train_X)
    # files.writetxt('pro_data/train_X.txt', train_X)
    files.writepkl('pro_data/train_Y.pkl', train_Y)
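# The `files` module (aliased as `f` in other scripts here) is not included in
# this excerpt. A minimal sketch of the pickle helpers it would need, assuming
# plain pickle serialization; the function names come from the call sites, the
# bodies are assumptions:
import pickle

def readpkl(path):
    # Load one pickled object from `path`.
    with open(path, 'rb') as fp:
        return pickle.load(fp)

def writepkl(path, obj):
    # Pickle `obj` to `path`, overwriting any existing file.
    with open(path, 'wb') as fp:
        pickle.dump(obj, fp)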
def load_rmsedata():
    train = files.readpkl("pro_data/train_X.pkl")
    each_label_num = np.array([len(data) for data in train])
    train_count = each_label_num * 0.7
    train_count = train_count.astype(int)  # training-set size of each label stratum
    rmse_train, rmse_test = [], []
    for i in range(101):
        rmse_train.append(train[i][:train_count[i]])
        rmse_test.append(train[i][train_count[i]:])
    files.writepkl("rmse_data/rmse_train.pkl", rmse_train)
    files.writepkl("rmse_data/rmse_test.pkl", rmse_test)
def loadtest():
    data = files.readpkl("pro_data/test_stat.pkl")
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        if i == 0:
            matrix = np.concatenate((user_id, item_id), axis=1)
        else:
            m = np.concatenate((user_id, item_id), axis=1)
            matrix = np.vstack((matrix, m))
    # files.writetxt('pro_data/test.txt', matrix)
    files.writepkl('pro_data/test.pkl', matrix)
def get_scores():
    train_X = files.readpkl("pro_data/train_X.pkl")
    each_label_num = np.array([len(data) for data in train_X])
    prior_pr = each_label_num / np.sum(each_label_num)  # prior probability of each label
    ave = np.array([np.mean(train_X[i], axis=0) for i in range(len(train_X))])  # per-label feature means
    std = np.array([np.std(train_X[i], axis=0) for i in range(len(train_X))])  # per-label feature standard deviations
    test = files.readpkl("pro_data/test.pkl")  # 2-D matrix
    scores = []
    for t in test:
        scores.append(NaiveBayes_Classifier(ave, std, t, prior_pr))
    scores = np.array(scores)
    files.writepkl("scores.pkl", scores)
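# NaiveBayes_Classifier is not defined in this excerpt. A minimal sketch of
# what it plausibly does, assuming Gaussian class-conditional densities built
# from the per-label `ave`/`std` above and returning the most probable label
# (the name and argument order come from the call in get_scores(); the body is
# an assumption):
import numpy as np

def NaiveBayes_Classifier(ave, std, sample, prior_pr, eps=1e-9):
    # Log-likelihood of `sample` under each label's Gaussian model,
    # assuming feature independence (naive Bayes).
    var = std ** 2 + eps  # guard against zero variance
    log_lik = -0.5 * np.sum(np.log(2 * np.pi * var) + (sample - ave) ** 2 / var, axis=1)
    log_post = log_lik + np.log(prior_pr + eps)  # unnormalized log-posterior
    return int(np.argmax(log_post))  # predicted label, i.e. a score in 0..100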
def get_similarity(item_cnt):
    cnt = 0
    for ids in range(int(maxid / 20) + 1):
        filename = "new/sim_matrix/" + str(ids) + ".pkl"
        matrix_file = "new/item_matrix/" + str(ids) + ".pkl"
        try:
            matrix = f.readpkl(matrix_file)
            sim_matrix = f.readpkl(filename)
            for pair in matrix:
                item1 = pair[0]
                item2 = pair[1]
                if item1 not in sim_matrix:
                    sim_matrix[item1] = {}
                # Normalize the co-occurrence count by each item's rating count.
                sim_matrix[item1][item2] = matrix[pair] / (item_cnt[item1] * item_cnt[item2])
            f.writepkl(filename, sim_matrix)
            cnt = cnt + 1
            print("finish", cnt)
        except FileNotFoundError:
            continue
def get_item_item_matrix(item_record):
    cnt = 0
    for ids in range(1, user_max, 10):
        # Load the co-occurrence matrix block stored for this chunk of users.
        filename = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"
        try:
            item_matrix = f.readpkl(filename)
            for user in range(ids, ids + 10):
                # Iterate over the items this user rated, forming pairs (item1, item2).
                for item1 in item_record[user]:
                    for item2 in item_record[user]:
                        item_pair = (item1, item2)
                        if item_pair not in item_matrix:
                            item_matrix[item_pair] = 0  # initialize the dictionary entry
                        item_matrix[item_pair] = item_matrix[item_pair] + 1  # increment the co-occurrence count
            f.writepkl(filename, item_matrix)  # save the current block
            cnt = cnt + 1
            print("finish", cnt)
        except EOFError:
            continue
        except FileNotFoundError:
            continue
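# A minimal driver sketch for the presumed workflow (an assumption: the
# "data/item_record.pkl" path is taken from the __main__ block further below).
# Initializing the block files first matters because get_item_item_matrix()
# silently skips any block whose file does not exist yet:
if __name__ == "__main__":
    item_record = f.readpkl("data/item_record.pkl")  # user id -> rated items
    ini_item_item()                    # create empty block files (defined below)
    get_item_item_matrix(item_record)  # accumulate co-occurrence counts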
def loadtrain():
    data = files.readpkl("pro_data/train_stat.pkl")
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        score = np.array(list(data[i][2].values()), dtype=int).reshape(-1, 1)
        if i == 0:
            matrix = np.concatenate((user_id, item_id, score), axis=1)
        else:
            m = np.concatenate((user_id, item_id, score), axis=1)
            matrix = np.vstack((matrix, m))
    _score = matrix[:, 2]
    train = []
    for i in range(101):
        index = np.argwhere(_score == i).reshape(-1)
        train.append(matrix[index])
    # files.writetxt('pro_data/train.txt', train)
    files.writepkl('pro_data/train.pkl', train)
def run(data, fea_num, method, noise=None):
    # method=0 -> ReliefF || method=1 -> MRMR
    sampling_times = 50  # number of sampling rounds for ReliefF
    k_samples = 10       # number of nearest-neighbor samples
    k_cross = 10         # number of cross-validation folds
    accuracy, auc = np.zeros((len(fea_num), 4)), np.zeros((len(fea_num), 4))
    for i in range(len(fea_num)):
        if method == 0:
            features = ReliefF(data, sampling_times, fea_num[i], k_samples)
        elif method == 1:
            features = MRMR(data, fea_num[i])
        # Samples after feature selection: selected columns plus the label column.
        _data = np.concatenate((data[:, features], data[:, -1].reshape(-1, 1)), axis=1)
        # Cross-validation loop (no stratification; reshuffling each round makes
        # this repeated random subsampling rather than strict k-fold).
        for j in range(k_cross):
            np.random.shuffle(_data)
            train, test = _data[int(len(_data) / k_cross):], _data[:int(len(_data) / k_cross)]
            clf = Classifier(train, test)
            _knn, _nb, _svm, _rf = clf.knn(), clf.NaiveBayes(), clf.SVM(), clf.RandomForest()
            accuracy[i][0] += _knn['score']
            accuracy[i][1] += _nb['score']
            accuracy[i][2] += _svm['score']
            accuracy[i][3] += _rf['score']
            auc[i][0] += _knn['auc']
            auc[i][1] += _nb['auc']
            auc[i][2] += _svm['auc']
            auc[i][3] += _rf['auc']
    accuracy, auc = accuracy / k_cross, auc / k_cross
    if noise is None:
        if method == 0:
            f.writepkl("result/acc_ReliefF.pkl", accuracy)
            f.writepkl("result/auc_ReliefF.pkl", auc)
        elif method == 1:
            f.writepkl("result/acc_MRMR.pkl", accuracy)
            f.writepkl("result/auc_MRMR.pkl", auc)
    else:
        if method == 0:
            f.writepkl("result/acc_ReliefF_noise" + str(noise) + ".pkl", accuracy)
            f.writepkl("result/auc_ReliefF_noise" + str(noise) + ".pkl", auc)
        elif method == 1:
            f.writepkl("result/acc_MRMR_noise" + str(noise) + ".pkl", accuracy)
            f.writepkl("result/auc_MRMR_noise" + str(noise) + ".pkl", auc)
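# A minimal usage sketch for run(); the pickle path, the feature-count grid,
# and the data layout (rows = samples, last column = class label, as implied by
# how run() indexes `data`) are assumptions, not taken from the source:
if __name__ == "__main__":
    data = f.readpkl("data/dataset.pkl")     # hypothetical path
    fea_num = [5, 10, 15, 20]                # candidate feature-subset sizes
    run(data, fea_num, method=0)             # ReliefF, no label noise
    run(data, fea_num, method=1, noise=0.1)  # MRMR, noise level tags the output filename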
import codecs

def stat_loadtest(filename):  # header inferred from the __main__ call below
    data = []
    with codecs.open(filename, encoding="utf-8") as f:
        _data = f.readlines()
    for line in _data:
        line = line.rstrip('\r\n')
        if '|' in line:
            data.append([int(item) for item in line.split('|')])
            data[-1].append({})
        else:
            # Blank lines must be removed from the dataset first, otherwise this raises an error.
            data[-1][2][line] = -1
    return data


if __name__ == "__main__":
    data1 = stat_loadtrain('data/train.txt')
    files.writepkl('pro_data/train_stat.pkl', data1)
    data2 = stat_loadtest('data/test.txt')
    files.writepkl('pro_data/test_stat.pkl', data2)
    '''
    #data = files.readpkl('data/test_stat.pkl')
    #data = files.readpkl('data/train_stat.pkl')
    print(len(data), data[-1][0])
    items, scores = [], []
    for e in data:
        items += list(e[2].keys())
        scores += list(e[2].values())
    items = list(set(items))
    items.sort()
    '''
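# For reference, the input format this parser expects (inferred from the
# parsing logic above; the concrete values are illustrative, not from the
# dataset):
#
#   0|3
#   8
#   453
#   16
#
# i.e. a "<user id>|<number of rated items>" header line followed by one item
# id per line, with blank lines stripped beforehand.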
# Predict each test user's item scores as a similarity-weighted average over
# the training users who rated that item.
counter = 0
for i in range(0, len(test_stat)):
    for key in test_stat[i][2].keys():
        scored_people, scored_sim = find_relative_people(i, key)
        if len(scored_people) == 0:
            test_stat[i][2][key] = 0
            continue
        sim_sum = 0
        for j in range(0, len(scored_sim)):
            sim_sum += scored_sim[j]
        # print(coefficience)
        # print('sim sum:%f' % sim_sum)
        weighted_sum = 0
        for j in range(0, len(scored_people)):
            weighted_sum += train_stat[scored_people[j]][2][key] * scored_sim[j]
        # print('item sum:%f' % weighted_sum)
        if sim_sum != 0:
            weighted_sum = weighted_sum / sim_sum
        # print('modified sum:%f' % weighted_sum)
        test_stat[i][2][key] = round(weighted_sum, 2)
    # break
    # print(test_stat[i])
    if i % 100 == 99:
        print("Finish %d*100 users" % counter)
        files.writepkl('data/rmse_stat%d.pkl' % counter, test_stat[counter * 100:counter * 100 + 100])
        files.writepkl('data/rmse_origin%d.pkl' % counter, frag[counter * 100:counter * 100 + 100])
        counter += 1
files.writepkl('data/rmse_stat.pkl', test_stat)
files.writepkl('data/rmse_origin.pkl', frag)
if __name__ == "__main__":
    person_list = initial(train_stat)
    counter = 0
    for i in range(0, len(test_stat)):
        for key in test_stat[i][2].keys():
            scored_people, scored_sim = find_relative_people(i, key)
            if len(scored_people) == 0:
                test_stat[i][2][key] = 0
                continue
            sim_sum = 0
            for j in range(0, len(scored_sim)):
                sim_sum += scored_sim[j]
            # print(coefficience)
            # print('sim sum:%f' % sim_sum)
            weighted_sum = 0
            for j in range(0, len(scored_people)):
                weighted_sum += train_stat[scored_people[j]][2][key] * scored_sim[j]
            # print('item sum:%f' % weighted_sum)
            if sim_sum != 0:
                weighted_sum = weighted_sum / sim_sum
            # print('modified sum:%f' % weighted_sum)
            test_stat[i][2][key] = round(weighted_sum, 2)
        # break
        # print(test_stat[i])
        if i % 100 == 99:
            print("Finish %d*100 users" % counter)
            counter += 1
    files.writepkl('data/test_res.pkl', test_stat)
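# find_relative_people is used above but not defined in this excerpt. A
# minimal sketch of the contract implied by the call sites, assuming the
# train_stat layout [user_id, n_items, {item: score}] and a hypothetical
# precomputed user-user similarity lookup sim_lookup(i, j):
def find_relative_people(i, key):
    # Return (indices of training users who scored item `key`,
    #         their similarity to test user i).
    scored_people, scored_sim = [], []
    for j in range(len(train_stat)):
        if key in train_stat[j][2]:
            scored_people.append(j)
            scored_sim.append(sim_lookup(i, j))  # hypothetical similarity lookup
    return scored_people, scored_sim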
def ini_item_item():
    for ids in range(1, user_max, 10):
        filename = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"
        item_matrix = {}
        f.writepkl(filename, item_matrix)
    print("ini ok")
def ini_sim():
    for ids in range(int(maxid / 20) + 1):
        filename = "new/sim_matrix/" + str(ids) + ".pkl"
        sim_matrix = {}
        f.writepkl(filename, sim_matrix)
cnt = 0
for user in item_record:
    for item_predict in item_record[user]:
        predict_score = 0
        # Read the item-similarity matrix block holding this item's row.
        filename = "data/sim_matrix/" + str(int(int(item_predict) / 10)) + ".pkl"
        sim_matrix = f.readpkl(filename)
        try:
            for item in user_item_record[user]:
                if item in sim_matrix[item_predict]:
                    # Accumulate the similarity-weighted score.
                    predict_score = predict_score + user_item_record[user][item] * sim_matrix[item_predict][item]
            item_record[user][item_predict] = predict_score  # store the corresponding predicted score
        except KeyError:
            continue
    cnt = cnt + 1
    print("finish", cnt)
f.writepkl("new/result.pkl", item_record)


if __name__ == '__main__':
    train_record = f.readpkl("data/item_record.pkl")
    test_record = f.readpkl("data/test_record.pkl")
    item_cnt = f.readpkl("data/item_cnt.pkl")
    item_dict = {}
    predict2(train_record=train_record, test_record=test_record, item_cnt=item_cnt, item_dict=item_dict)
    print("ok")