def process(person_list):
    """Compute pairwise cosine-style similarity between all persons and
    dump the results to pickles in 100-row batches.

    Relies on module globals: `all_person` (person count), `len_list`
    (per-person vector norms — TODO confirm semantics), `sim_list`
    (accumulator list, cleared per batch) and `files` (pickle helper).

    Args:
        person_list: sequence of dicts mapping item key -> rating.
    """
    file_count = 0
    for i in range(all_person):
        for j in range(all_person):
            sim = 0
            if len_list[i] != 0 and len_list[j] != 0:
                # Iterate over the smaller dict for speed; the dot product
                # is symmetric, so the order of operands does not matter.
                # (The original duplicated this loop in two identical
                # if/else branches.)
                if len(person_list[i]) <= len(person_list[j]):
                    small, big = person_list[i], person_list[j]
                else:
                    small, big = person_list[j], person_list[i]
                cos_sum = 0
                for key in small:
                    if key in big:
                        cos_sum += small[key] * big[key]
                sim = cos_sum / (len_list[i] * len_list[j])
            sim_list.append(sim)
        if i % 100 == 99:
            files.writepkl('data/train_res%d.pkl' % file_count, sim_list)
            file_count += 1
            sim_list.clear()
            # j/sim hold the values from the last inner iteration only;
            # dropped the stray Python-2 trailing comma after print(...).
            print((i, j, sim))
    files.writepkl('data/train_res%d.pkl' % file_count, sim_list)
    print("all files finish")  # fixed typo "finishi"
def loadtrain_xy():
    """Split each per-label matrix in pro_data/train.pkl into features
    (all columns but the last) and labels (the last column), and pickle
    the two resulting lists.
    """
    train = files.readpkl("pro_data/train.pkl")
    train_X = [block[:, :-1] for block in train]
    train_Y = [block[:, -1] for block in train]
    files.writepkl('pro_data/train_X.pkl', train_X)
    files.writepkl('pro_data/train_Y.pkl', train_Y)
def load_rmsedata():
    """Split every score level's samples 70/30 into train/test portions
    and pickle the two lists (101 levels, indices 0..100).
    """
    train = files.readpkl("pro_data/train_X.pkl")
    # per-level training size: 70% of that level's sample count
    split_at = (np.array([len(level) for level in train]) * 0.7).astype(int)

    rmse_train = [train[i][:split_at[i]] for i in range(101)]
    rmse_test = [train[i][split_at[i]:] for i in range(101)]

    files.writepkl("rmse_data/rmse_train.pkl", rmse_train)
    files.writepkl("rmse_data/rmse_test.pkl", rmse_test)
def loadtest():
    """Assemble the (id, item_id) test matrix and pickle it.

    Each record data[i] is (id, count, {item_id: ...}); the id is repeated
    `count` times and paired column-wise with the record's item ids
    (assumes count == len of the dict — TODO confirm against the writer
    of test_stat.pkl).
    """
    data = files.readpkl("pro_data/test_stat.pkl")

    chunks = []
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)  # progress marker
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        chunks.append(np.concatenate((user_id, item_id), axis=1))

    # Single vstack at the end instead of re-stacking the growing matrix
    # every iteration (the original was accidentally O(n^2) in copies).
    matrix = np.vstack(chunks)

    files.writepkl('pro_data/test.pkl', matrix)
def get_scores():
    """Score every test row with NaiveBayes_Classifier and pickle the
    results.

    Class priors, per-class feature means and standard deviations are
    estimated from the per-label training matrices in
    pro_data/train_X.pkl.
    """
    train_X = files.readpkl("pro_data/train_X.pkl")
    counts = np.array([len(block) for block in train_X])
    prior_pr = counts / np.sum(counts)  # prior probability of each label

    ave = np.array([np.mean(block, axis=0) for block in train_X])  # means
    std = np.array([np.std(block, axis=0) for block in train_X])   # std devs

    test = files.readpkl("pro_data/test.pkl")  # 2-D matrix
    scores = np.array([NaiveBayes_Classifier(ave, std, row, prior_pr)
                       for row in test])

    files.writepkl("scores.pkl", scores)
def get_similarity(item_cnt):
    """Normalise co-occurrence counts into similarity scores, chunk by chunk.

    For each 20-id chunk file, every pair count is divided by the product
    of the two items' individual counts and the result written back in
    place.  Chunks whose pickle files are missing are skipped.

    Args:
        item_cnt: mapping item id -> total occurrence count.
    """
    finished = 0
    for chunk in range(int(maxid / 20) + 1):
        sim_path = "new/sim_matrix/" + str(chunk) + ".pkl"
        matrix_path = "new/item_matrix/" + str(chunk) + ".pkl"
        try:
            matrix = f.readpkl(matrix_path)
            sim_matrix = f.readpkl(sim_path)
            for item1, item2 in matrix:  # keys are (item1, item2) pairs
                row = sim_matrix.setdefault(item1, {})
                row[item2] = (matrix[(item1, item2)]
                              / (item_cnt[item1] * item_cnt[item2]))
            f.writepkl(sim_path, sim_matrix)
            finished += 1
            print("finish", finished)
        except FileNotFoundError:
            continue
def get_item_item_matrix(item_record):
    """Accumulate item-item co-occurrence counts, 10 users per chunk file.

    For every user, each ordered pair (item1, item2) of items the user
    rated bumps that pair's counter in the chunk's dict — note the
    diagonal pairs (item, item) are counted too.  Chunks whose pickle is
    missing or truncated are skipped.

    Args:
        item_record: mapping user id -> iterable of rated item ids.
    """
    finished = 0
    for start in range(1, user_max, 10):
        path = "data/item_matrix2/" + str(int(start / 10)) + ".pkl"
        try:
            # load this chunk's co-occurrence dict
            item_matrix = f.readpkl(path)
            for user in range(start, start + 10):
                for item1 in item_record[user]:
                    for item2 in item_record[user]:
                        pair = (item1, item2)
                        item_matrix[pair] = item_matrix.get(pair, 0) + 1
            f.writepkl(path, item_matrix)  # persist the chunk
            finished += 1
            print("finish", finished)
        except (EOFError, FileNotFoundError):
            continue
def loadtrain():
    """Build the (id, item_id, score) training matrix grouped by score
    level and pickle it.

    Each record data[i] is (id, count, {item_id: score}); rows are
    bucketed into 101 lists by their integer score (0..100) before saving.
    """
    data = files.readpkl("pro_data/train_stat.pkl")

    chunks = []
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)  # progress marker
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        score = np.array(list(data[i][2].values()), dtype=int).reshape(-1, 1)
        chunks.append(np.concatenate((user_id, item_id, score), axis=1))

    # Single vstack at the end instead of re-stacking the growing matrix
    # every iteration (the original was accidentally O(n^2) in copies).
    matrix = np.vstack(chunks)

    _score = matrix[:, 2]
    train = []
    for level in range(101):
        index = np.argwhere(_score == level).reshape(-1)
        train.append(matrix[index])

    files.writepkl('pro_data/train.pkl', train)
def run(data,
        fea_num,
        method,
        noise=None):  # method=0 -> ReliefF || method=1 -> MRMR
    """Feature-select with ReliefF or MRMR, k-fold-evaluate four
    classifiers for every feature count, and pickle mean accuracy/AUC.

    Args:
        data: 2-D sample matrix; the last column is the label.
        fea_num: list of feature counts to evaluate.
        method: 0 for ReliefF, 1 for MRMR (any other value raises
            NameError at feature selection, as in the original).
        noise: optional tag appended to the result file names.

    NOTE(review): the original body ended with a grafted fragment that
    re-read an undefined `filename` with codecs and shadowed `f`; it could
    only raise NameError and has been removed.
    """
    sampling_times = 50  # ReliefF sampling rounds
    k_samples = 10       # nearest-neighbour count for ReliefF
    k_cross = 10         # cross-validation folds
    # One row per feature count, one column per classifier.
    # (The original sized these with len(delta), an undefined name.)
    accuracy = np.zeros((len(fea_num), 4))
    auc = np.zeros((len(fea_num), 4))

    for i in range(len(fea_num)):
        if method == 0:
            features = ReliefF(data, sampling_times, fea_num[i], k_samples)
        elif method == 1:
            features = MRMR(data, fea_num[i])
        # samples restricted to the selected features, label re-appended
        _data = np.concatenate((data[:, features], data[:, -1].reshape(-1, 1)),
                               axis=1)

        # k-fold cross validation (no stratification)
        for _ in range(k_cross):
            np.random.shuffle(_data)
            cut = int(len(_data) / k_cross)
            train, test = _data[cut:], _data[:cut]
            clf = Classifier(train, test)
            results = (clf.knn(), clf.NaiveBayes(), clf.SVM(),
                       clf.RandomForest())
            for col, res in enumerate(results):
                accuracy[i][col] += res['score']
                auc[i][col] += res['auc']

    # fold sums -> fold means
    accuracy, auc = accuracy / k_cross, auc / k_cross

    # method is guaranteed 0 or 1 here (anything else crashed above)
    method_name = "ReliefF" if method == 0 else "MRMR"
    suffix = "" if noise is None else "_noise" + str(noise)
    f.writepkl("result/acc_" + method_name + suffix + ".pkl", accuracy)
    f.writepkl("result/auc_" + method_name + suffix + ".pkl", auc)


if __name__ == "__main__":
    # Precompute the per-user statistics pickles consumed by the other
    # loading stages.
    data1 = stat_loadtrain('data/train.txt')
    files.writepkl('pro_data/train_stat.pkl', data1)

    data2 = stat_loadtest('data/test.txt')
    files.writepkl('pro_data/test_stat.pkl', data2)
    # NOTE(review): the ''' literal below was never terminated in the
    # original (a SyntaxError) and was followed by stray scrap lines
    # ("Beispiel #12" / "0").  The literal is now closed and the scrap
    # removed; the dead experimental code inside is kept for reference.
    '''
    #data = files.readpkl('data/test_stat.pkl')
    #data = files.readpkl('data/train_stat.pkl')
    print(len(data), data[-1][0])

    items, scores = [], []
    for e in data:
        items += list(e[2].keys())
        scores += list(e[2].values())

    items = list(set(items))
    items.sort()
    for i in range(0, len(test_stat)):
        for key in test_stat[i][2].keys():
            scored_people, scored_sim = find_relative_people(i, key)
            if len(scored_people) == 0:
                test_stat[i][2][key] = 0
                continue
            sim_sum = 0
            for j in range(0, len(scored_sim)):
                sim_sum += scored_sim[j]
            sum = 0
            for j in range(0, len(scored_people)):
                sum += train_stat[scored_people[j]][2][key] * scored_sim[j]
            if sim_sum != 0:
                sum = sum / sim_sum
            test_stat[i][2][key] = round(sum, 2)
        if i % 100 == 99:
            print("Finish %d*100 users" % counter)
            files.writepkl('data/rmse_stat%d.pkl' % counter,
                           test_stat[counter * 100:counter * 100 + 100])
            files.writepkl('data/rmse_origin%d.pkl' % counter,
                           frag[counter * 100:counter * 100 + 100])
            counter += 1
    files.writepkl('data/rmse_stat.pkl', test_stat)
    files.writepkl('data/rmse_origin.pkl', frag)
    '''

if __name__ == "__main__":
    # Collaborative-filtering prediction pass: for every test user/item,
    # average the scores of similar people weighted by their similarity.
    person_list = initial(train_stat)
    counter = 0
    for i in range(len(test_stat)):
        for key in test_stat[i][2].keys():
            scored_people, scored_sim = find_relative_people(i, key)
            if len(scored_people) == 0:
                test_stat[i][2][key] = 0
                continue
            # total similarity mass of the neighbours
            sim_total = 0
            for s in scored_sim:
                sim_total += s
            # similarity-weighted sum of the neighbours' scores
            # (renamed from `sum`, which shadowed the builtin)
            weighted = 0
            for j in range(len(scored_people)):
                weighted += train_stat[scored_people[j]][2][key] * scored_sim[j]
            if sim_total != 0:
                weighted = weighted / sim_total
            test_stat[i][2][key] = round(weighted, 2)
        if i % 100 == 99:
            print("Finish %d*100 users" % counter)
            counter += 1
    files.writepkl('data/test_res.pkl', test_stat)
def ini_item_item():
    """Create an empty co-occurrence dict pickle for every 10-user chunk."""
    for start in range(1, user_max, 10):
        path = "data/item_matrix2/" + str(int(start / 10)) + ".pkl"
        f.writepkl(path, {})
    print("ini ok")
def ini_sim():
    """Initialise empty similarity-matrix pickles, then fill the global
    item_record with predicted scores and pickle the result.

    NOTE(review): this function mixes two jobs — creating the
    new/sim_matrix chunk files and running the prediction pass over the
    globals `item_record` / `user_item_record` — the second half was
    presumably meant to be its own function; kept together to preserve
    the original call sites.
    """
    for ids in range(int(maxid / 20) + 1):
        filename = "new/sim_matrix/" + str(ids) + ".pkl"
        f.writepkl(filename, {})
    cnt = 0  # was read before assignment in the original (NameError)
    for user in item_record:
        for item_predict in item_record[user]:
            predict_score = 0
            # similarity chunk holding item_predict's row
            filename = "data/sim_matrix/" + str(int(int(item_predict) / 10)) + ".pkl"
            sim_matrix = f.readpkl(filename)
            try:
                for item in user_item_record[user]:
                    if item in sim_matrix[item_predict]:
                        # Weighted vote: the user's score on `item` times
                        # the item-item similarity.  The original split
                        # this expression across two lines without a
                        # continuation — a SyntaxError — fixed here.
                        predict_score += (user_item_record[user][item]
                                          * sim_matrix[item_predict][item])
                item_record[user][item_predict] = predict_score  # store prediction
            except KeyError:
                continue
        cnt += 1
        print("finish", cnt)
    f.writepkl("new/result.pkl", item_record)
def ini_item_item():
    # NOTE(review): byte-for-byte duplicate of the ini_item_item defined
    # earlier in this file; this later definition silently shadows the
    # first one.  Left in place pending confirmation it can be deleted.
    """Create an empty item-matrix pickle for every 10-user chunk file."""
    for ids in range(1,user_max,10):
        filename = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"
        item_matrix = {}
        f.writepkl(filename,item_matrix)
    print("ini ok")

if __name__ == '__main__':
    # Load the pickled inputs and run the item-based prediction pass.
    train_record = f.readpkl("data/item_record.pkl")
    test_record = f.readpkl("data/test_record.pkl")
    item_cnt = f.readpkl("data/item_cnt.pkl")
    predict2(train_record=train_record,
             test_record=test_record,
             item_cnt=item_cnt,
             item_dict={})
    print("ok")