else: d[row[2]] = d.get(row[2]) + '===' + row[4] item_review_l = list(d.items()) print(item_review_l[0][1]) '''
# Planned text pipeline:
#   tokenize, remove stop words, stem,
#   build the dictionary,
#   build the bag-of-words,
#   output the LDA result.

# Load the training ratings through the project's preprocess helpers.
# The second argument to readdata is presumably the field delimiter -- TODO confirm.
# NOTE(review): the '[email protected]' file names look like the result of an
# e-mail obfuscation filter rather than real paths -- confirm the intended names.
trainfile = '../data/[email protected]'
train_df = preprocess.readdata(trainfile, ',')
train_list = preprocess.create_train_list(train_df)
# The test set is read directly with pandas; header=None means the file has no header row.
testset_df = pd.read_csv('../data/[email protected]', header=None)

# Build two dictionaries: user -> sentiment vector and item -> topic vector.
# 1. Build the user-pref dictionary.
uisv_names = ['user', 'item', 'sentiment', 'vector', 'pref']
uisv_df = pd.read_csv('../data/out3/uisv3.csv', header=None, names=uisv_names)
user_l = uisv_df['user'].tolist()
pref_l = uisv_df['pref'].tolist()
# Map each user to its 'pref' value; when a user appears on several rows,
# the value from the last such row is the one that is kept.
u_dict = dict(zip(user_l, pref_l))
# 2. Build the item-vector matrix.
# Make the parent directory importable so the project packages below resolve.
sys.path.append("..")
from utils.logger import get_logger
from preprocess import preprocess
from model import bmf
import pandas as pd
import numpy as np

logger = get_logger('e_BMF_list')

# Input ratings file; dataname is its last path component and is reused
# when naming the output files.
# NOTE(review): '[email protected]' looks like the result of an e-mail obfuscation
# filter rather than a real file name -- confirm the intended path.
filename = 'j:/amazon/output/[email protected]'
dataname = filename.split('/')[-1]
# Output directory
outpath = 'j:/amazon/result/result4/'

# 0. Read the data (the second argument is presumably the field delimiter -- TODO confirm).
ratings = preprocess.readdata(filename, ',')
# 1. Check for duplicate entries and drop them if present.
ratings_d = preprocess.drop_duplicate(ratings)
# 2. Replace user_id and item_id (the helper also returns the users and items collections).
ratings_r, users, items = preprocess.replace_user_and_item(ratings_d)

# Basic dataset description: total number of ratings, number of users, number of items.
m = len(users)  # number of users
n = len(items)  # number of items
# The user count was missing from this message (a masked token had left the
# expression syntactically invalid); log m alongside n.
logger.info('dataset:' + dataname + ',ratings:' + str(len(ratings_r)) + ',user:' + str(m) + ',item:' + str(n))

# 3. Split the data (0.8 is presumably the training fraction -- TODO confirm),
# then persist the training part without index or header.
trainset, testset = preprocess.split_data(ratings_r, 0.8)
trainset.to_csv(outpath + 'trainset' + '_' + dataname, index=None, header=None)
import sys
# Make the parent directory importable so the project package below resolves.
sys.path.append('../')
from preprocess import preprocess
import numpy as np

# Load the ratings and replace the user/item ids via the project helpers.
# NOTE(review): '[email protected]' looks like the result of an e-mail obfuscation
# filter rather than a real file name -- confirm the intended path.
ratings_df = preprocess.readdata('j:/amazon/output/[email protected]',',')
ratings_r_df,users,items = preprocess.replace_user_and_item(ratings_df)

# Flatten the frame into a list of [(user_id, item_id), value] entries.
l = []
#row[0]=index,row[1]=user_id,row[2]=item_id
# row[3] is presumably the rating -- TODO confirm against the data layout.
for row in ratings_r_df.itertuples():
    t = [(row[1],row[2]),row[3]]
    l.append(t)

# Debug output: show the first entry and how its parts are addressed.
row = l[0]
print(row)
print(row[0][0],row[0][1],row[1])

# English rendering of the description string below:
#   @desc:   training
#   @param:  trainMatrix, k (int, the number of factors)
#   @return: u, v (matrices of size m*k and n*k)
''' @desc:训练 @param:trainMatrix,k(int,代表factors数量) @return:u,v(m*k和n*k的矩阵) '''
def train(l,k):
    # m and n are taken from the module-level users / items collections.
    m = len(users)
    n = len(items)
    # alpha is presumably the learning rate and lamda the regularisation
    # weight -- TODO confirm against how they are used.
    alpha = 0.01
    lamda = 0.01
    # u starts as an m-by-k array of uniform random values in [0, 1).
    u = np.random.rand(m,k)