def fout(dict, embedDim, interval, distance):
    """Predict every song of every artist, plot the results and export them.

    ``dict`` is iterated as ``artistId -> artist``.  For each artist the
    per-song GBRT and extra-random-forest predictions are plotted, summed up,
    plotted again as an artist total and finally handed to ``writecsv``
    (``gbrt.csv`` / ``erf.csv`` below ``finalResultPath``).

    Parameters:
        dict: mapping of artist id to artist object (must offer
            ``getSongsOwned()``).
        embedDim: embedding dimension forwarded to the model builders.
        interval: interval forwarded to the model builders.
        distance: length of the accumulated prediction vectors; also
            forwarded to ``itergenModel``.

    Returns:
        None.  Output is written as PNG figures and CSV rows.
    """
    print('Run task %s...' % (os.getpid(), ))
    plt.figure(figsize=(6, 4))
    for artistCount, (artistId, artist) in enumerate(dict.items(), 1):
        print(artistCount)
        artistDir = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(artistDir):
            os.makedirs(artistDir)

        # The SVR (1) and random-forest (2) variants are disabled; only the
        # GBRT (3) and extra-random-forest (4) predictions are accumulated.
        gbrtTotal = np.zeros(distance)
        erfTotal = np.zeros(distance)

        for songId, song in artist.getSongsOwned().items():
            traceLength = np.array(song.getTrace()).shape[1]
            # Training-set length.
            # NOTE(review): the horizon passed here is the literal 7, not
            # ``distance`` -- confirm this is intended.
            trainLength = pp.XTrainLength(traceLength, embedDim, interval, 7)
            lengths = str(traceLength) + ' ' + str(trainLength)

            if trainLength < 10:
                # Training set shorter than 10: use the iterated models.
                print('iterated ' + lengths)
                gbrtPredict = itergenModel(artist, song, itergbrtModel,
                                           embedDim, interval, distance)
                erfPredict = itergenModel(artist, song, itererfModel,
                                          embedDim, interval, distance)
            else:
                print('mix2 ' + lengths)
                gbrtPredict = genModel(artist, song, gbrtModel,
                                       embedDim, interval)
                erfPredict = genModel(artist, song, erfModel,
                                      embedDim, interval)

            plotResult(gbrtPredict, erfPredict)
            songFigure = os.path.join(artistDir, 'song ' + songId + ".png")
            plt.savefig(songFigure)
            plt.clf()

            gbrtTotal += gbrtPredict
            erfTotal += erfPredict

        plotResult(gbrtTotal, erfTotal)
        artistFigure = os.path.join(artistDir, 'artist ' + artistId + ".png")
        plt.savefig(artistFigure)
        plt.clf()

        gbrtCsv = os.path.join(finalResultPath, 'gbrt.csv')
        writecsv(gbrtCsv, artistId, gbrtTotal)
        erfCsv = os.path.join(finalResultPath, 'erf.csv')
        writecsv(erfCsv, artistId, erfTotal)
def train(artist, embedDim, interval):
    """Cluster an artist's songs and predict their summed play counts.

    Songs whose training-set length (``pp.XTrainLength``) is below 10 are set
    aside and predicted by ``shortSongsPredict``.  The remaining songs are
    clustered on their play traces with affinity propagation; for every
    cluster one tuned ``GradientBoostingRegressor`` per target column is
    selected by randomized search and then re-fitted over 9 rounds, where the
    predictions of each round are appended to the traces before the next
    round.

    Parameters:
        artist: artist object offering ``getSongsOwned()``.
        embedDim: initial embedding dimension; grown by ``distance`` after
            every round and reset for every cluster.
        interval: interval forwarded to the folding helpers.

    Returns:
        A length-60 numpy array: the sum of ``clusterSum`` over all clusters
        plus the result of ``shortSongsPredict`` for the short songs.
    """
    embedDimInit = embedDim
    distance = 7

    playArray = []
    songsList = []
    shortSongsList = []
    for song in artist.getSongsOwned().values():
        playTrace = song.getTrace()[0]
        traceLength = len(playTrace)
        # Training-set length.
        trainLength = pp.XTrainLength(traceLength, embedDim, interval,
                                      distance)
        if trainLength < 10:
            # Short songs do not take part in the cluster training.
            shortSongsList.append(song)
            continue
        if traceLength < utils.days:
            # Left-pad with zeros so the trace has utils.days entries.
            playTrace = np.hstack(
                (np.zeros(utils.days - traceLength), playTrace))
        playArray.append(playTrace)
        songsList.append(song)

    # Group the songs by cluster label.  Clustering is skipped when every
    # song was classified as short: fitting on an empty sample list raises
    # inside scikit-learn.
    clusterDict = {}
    if playArray:
        apc = cluster.AffinityPropagation(damping=0.5, max_iter=500,
                                          convergence_iter=20,
                                          preference=None,
                                          affinity='euclidean')
        clusterIndex = apc.fit_predict(playArray)
        for index, song in zip(clusterIndex, songsList):
            clusterDict.setdefault(index, []).append(song)

    yPredictSum = np.zeros(60)
    for index, songList in clusterDict.items():
        print('cluster' + str(index))
        embedDim = embedDimInit  # restart from the caller's value
        tracelist, meanList, varList = makeTraceList(songList)
        XTrainCluster, yTrainCluster = foldTrain(tracelist, embedDim,
                                                 interval, distance)
        kfold = cross_validation.KFold(len(XTrainCluster), n_folds=5,
                                       shuffle=False)
        params = {
            'n_estimators': randint(20, 200),
            'loss': ['ls', 'lad', 'huber'],
            'learning_rate': uniform(0.01, 0.19),
            'subsample': uniform(0.5, 0.5),
            'max_depth': randint(1, 5),
            'min_samples_split': randint(1, 3),
            'min_samples_leaf': randint(1, 3),
            'max_features': randint(1, len(XTrainCluster[0])),
        }

        # One tuned model per target column.
        bestModels = []
        for i in range(len(yTrainCluster[0])):
            gbrt = GradientBoostingRegressor()
            clf = grid_search.RandomizedSearchCV(
                gbrt, param_distributions=params, n_iter=30,
                scoring='mean_squared_error', cv=kfold, n_jobs=-1)
            clf.fit(XTrainCluster, yTrainCluster[:, i])
            bestModels.append(clf.best_estimator_)

        for _ in range(9):
            XTrainCluster, yTrainCluster = foldTrain(tracelist, embedDim,
                                                     interval, distance)
            XPredictCluster = foldPredict(tracelist, embedDim, interval,
                                          distance)
            # Every song of the cluster is predicted with the same models.
            for k in range(len(songList)):
                XPredict = XPredictCluster[k]
                subyPredict = []
                for j in range(len(yTrainCluster[0])):
                    # NOTE(review): the model is re-fitted for every song
                    # although the training data is the same within a round;
                    # kept as-is because the fit may not be deterministic.
                    bestModels[j].fit(XTrainCluster, yTrainCluster[:, j])
                    subyPredict.append(bestModels[j].predict(XPredict))
                tracelist[k] = np.hstack(
                    (tracelist[k], np.array(subyPredict)))
            # The predictions of this round become known data for the next
            # round, so the embedding grows accordingly.
            embedDim += distance

        yPredictSum += clusterSum(tracelist, meanList, varList)

    yPredictSum += shortSongsPredict(shortSongsList, embedDimInit, interval)
    return yPredictSum
def fout(embedDim, interval, distance):
    """Predict every clustered song of every artist, plot and export them.

    The artists are loaded from ``artistsObjectDict.pkl`` below
    ``utils.allResultPath``.  For each artist the per-song GBRT and
    extra-random-forest predictions are plotted, summed up, plotted again as
    an artist total and handed to ``writecsv`` (``gbrt.csv`` / ``erf.csv``
    below ``finalResultPath``).

    Parameters:
        embedDim: embedding dimension forwarded to ``genModel``.
        interval: interval forwarded to ``genModel``.
        distance: length of the accumulated prediction vectors.

    Returns:
        None.  Output is written as PNG figures and CSV rows.
    """
    artistObjectFile = os.path.join(utils.allResultPath,
                                    'artistsObjectDict.pkl')
    # Close the handle deterministically instead of leaking it.
    # NOTE: unpickling executes code from the file -- only load trusted data.
    with open(artistObjectFile, 'r') as pickleFile:
        artistsObjectDict = cPickle.load(pickleFile)

    plt.figure(figsize=(6, 4))
    artistNum = 0
    for artistId, artist in artistsObjectDict.items():
        artistNum += 1
        print(artistNum)
        savePath = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(savePath):
            os.makedirs(savePath)

        # The SVR (1) and random-forest (2) variants are disabled; only the
        # GBRT (3) and extra-random-forest (4) predictions are accumulated.
        yPredictSum3 = np.zeros(distance)
        yPredictSum4 = np.zeros(distance)

        for song in utils.clusterSongs(artist):
            traceLength = np.array(song.getTrace()).shape[1]
            # Training-set length.
            # NOTE(review): the horizon passed here is the literal 7, not
            # ``distance`` -- confirm this is intended.
            trainLength = pp.XTrainLength(traceLength, embedDim, interval, 7)
            lengths = str(traceLength) + ' ' + str(trainLength)

            # The three cases are mutually exclusive.  The first one used to
            # be a stand-alone ``if`` followed by an ``if/else``, so its
            # predictions were always recomputed and overwritten.
            if traceLength <= embedDim:
                # Trace no longer than the embedding: fall back to an
                # embedding dimension of 1.
                print('iterated ' + lengths)
                yPredict3 = genModel(artist, song, itergbrtModel, 1,
                                     interval)
                yPredict4 = genModel(artist, song, itererfModel, 1,
                                     interval)
            elif trainLength < 10:
                # Training set shorter than 10: use the iterated models.
                print('iterated ' + lengths)
                yPredict3 = genModel(artist, song, itergbrtModel, embedDim,
                                     interval)
                yPredict4 = genModel(artist, song, itererfModel, embedDim,
                                     interval)
            else:
                print('clustermix3 ' + lengths)
                yPredict3 = genModel(artist, song, gbrtModel, embedDim,
                                     interval)
                yPredict4 = genModel(artist, song, erfModel, embedDim,
                                     interval)

            plotResult(yPredict3, yPredict4)
            # Only the first 20 elements of the id go into the file name.
            songLabel = ''.join(list(song.getId())[:20])
            plt.savefig(
                os.path.join(savePath, 'song ' + songLabel + ".png"))
            plt.clf()

            yPredictSum3 += yPredict3
            yPredictSum4 += yPredict4

        plotResult(yPredictSum3, yPredictSum4)
        plt.savefig(os.path.join(savePath, 'artist ' + artistId + ".png"))
        plt.clf()

        finalResultFile3 = os.path.join(finalResultPath, 'gbrt.csv')
        writecsv(finalResultFile3, artistId, yPredictSum3)
        finalResultFile4 = os.path.join(finalResultPath, 'erf.csv')
        writecsv(finalResultFile4, artistId, yPredictSum4)
def _rmseAndNvar(yTest, yPredict):
    """Return ``(rmse, nvar)`` of a prediction measured against ``yTest``."""
    rmse = np.sqrt(mean_squared_error(yTest, yPredict))
    nvar = utils.normalizedVariation(yTest, yPredict)
    return rmse, nvar


def _scoreTitle(gbrtScore, erfScore, xgbScore):
    """Build the three-line plot title from ``(rmse, nvar)`` pairs."""
    return (
        'GBRT=RMSE:' + str(gbrtScore[0]) + '-nvar:' + str(gbrtScore[1]) + '\n' +
        'erf=RMSE:' + str(erfScore[0]) + '-nvar:' + str(erfScore[1]) + '\n' +
        'xgb=RMSE:' + str(xgbScore[0]) + '-nvar:' + str(xgbScore[1])
    )


def test(embedDim, interval, distance):
    """Evaluate the GBRT, extra-random-forest and xgboost models per artist.

    The artists are loaded from ``artistsObjectDict.pkl`` below
    ``utils.allResultPath``.  Each eligible song is predicted with all three
    models; predictions and test values are plotted per song and summed per
    artist.  A weighted score ``artistWeight * (1 - nvar)`` is collected for
    every artist and model and appended to the ``F1Score`` file.

    Parameters:
        embedDim: embedding dimension forwarded to the model builders.
        interval: interval forwarded to the model builders.
        distance: prediction length, which is also the test-set length.

    Returns:
        None.  Output is written as PNG figures and to the score file.
    """
    artistObjectFile = os.path.join(utils.allResultPath,
                                    'artistsObjectDict.pkl')
    # Close the handle deterministically instead of leaking it.
    # NOTE: unpickling executes code from the file -- only load trusted data.
    with open(artistObjectFile, 'r') as pickleFile:
        artistsObjectDict = cPickle.load(pickleFile)

    # The SVR (1) and random-forest (2) variants are disabled.
    artistF1Score3 = []
    artistF1Score4 = []
    artistF1Score5 = []
    plt.figure(figsize=(6, 8))
    artistNum = 0
    for artistId, artist in artistsObjectDict.items():
        artistNum += 1
        print(artistNum)
        savePath = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(savePath):
            os.makedirs(savePath)

        yTestSum = np.zeros(distance)
        yPredictSum3 = np.zeros(distance)
        yPredictSum4 = np.zeros(distance)
        yPredictSum5 = np.zeros(distance)

        for songId, song in artist.getSongsOwned().items():
            traceLength = np.array(song.getTrace()).shape[1]
            trainLength = pp.XTrainLength(traceLength, embedDim, interval,
                                          distance)
            if trainLength <= 8:
                # Too little training data; the song is not tested at all.
                # (Original note: songs released after the predicted days
                # are not tested.)
                continue

            lengths = str(traceLength) + ' ' + str(trainLength)
            if trainLength < 8 + distance:
                # Short training set: use the iterated models.
                print('iterated ' + lengths)
                predict = itergenModel
                gbrt, erf, xgb = itergbrtModel, itererfModel, iterxgbModel
            else:
                print('direct ' + lengths)
                predict = genModel
                gbrt, erf, xgb = gbrtModel, erfModel, xgbModel

            # GBRT model
            yPredict3, yTest = predict(artist, song, gbrt, embedDim,
                                       interval, distance)
            score3 = _rmseAndNvar(yTest, yPredict3)
            # Extra-random-forest model
            yPredict4, yTest = predict(artist, song, erf, embedDim,
                                       interval, distance)
            score4 = _rmseAndNvar(yTest, yPredict4)
            # xgboost model
            yPredict5, yTest = predict(artist, song, xgb, embedDim,
                                       interval, distance)
            score5 = _rmseAndNvar(yTest, yPredict5)

            plotResult(yPredict3, yPredict4, yPredict5, yTest)
            plt.title(_scoreTitle(score3, score4, score5))
            plt.savefig(os.path.join(savePath, 'song ' + songId + ".png"))
            plt.clf()

            yTestSum += yTest
            yPredictSum3 += yPredict3
            yPredictSum4 += yPredict4
            yPredictSum5 += yPredict5

        scoreSum3 = _rmseAndNvar(yTestSum, yPredictSum3)
        scoreSum4 = _rmseAndNvar(yTestSum, yPredictSum4)
        scoreSum5 = _rmseAndNvar(yTestSum, yPredictSum5)
        plotResult(yPredictSum3, yPredictSum4, yPredictSum5, yTestSum)
        plt.title(_scoreTitle(scoreSum3, scoreSum4, scoreSum5))
        plt.savefig(os.path.join(savePath, 'artist ' + artistId + ".png"))
        plt.clf()

        # Weight: square root of the artist's total over the last
        # ``distance`` entries of row 0 of the total trace.
        artistWeight = np.sqrt(np.sum(artist.getTotalTrace()[0, -distance:]))
        artistF1Score3.append(artistWeight * (1 - scoreSum3[1]))
        artistF1Score4.append(artistWeight * (1 - scoreSum4[1]))
        artistF1Score5.append(artistWeight * (1 - scoreSum5[1]))

    f1ScoreFile = os.path.join(utils.allResultPath, 'F1Score')
    # ``scoreFile`` rather than ``file`` to avoid shadowing the builtin.
    with open(f1ScoreFile, 'a') as scoreFile:
        scoreFile.write(time.asctime())
        scoreFile.write('embedDim=' + str(embedDim) + ', interval=' +
                        str(interval) + ', distance=' + str(distance))
        scoreFile.write('\n')
        scoreFile.write('-GBRT:' + str(np.sum(artistF1Score3)) + '\n' +
                        str(artistF1Score3) + '\n')
        scoreFile.write('-erf:' + str(np.sum(artistF1Score4)) + '\n' +
                        str(artistF1Score4) + '\n')
        scoreFile.write('-xgb:' + str(np.sum(artistF1Score5)) + '\n' +
                        str(artistF1Score5) + '\n')
        scoreFile.write('\n')
def fout(embedDim, interval, distance):
    """Predict every song of every artist with three models and export them.

    The artists are loaded from ``artistsObjectDict.pkl`` below
    ``utils.allResultPath``.  For each artist the per-song GBRT,
    extra-random-forest and xgboost predictions are plotted, summed up,
    plotted again as an artist total and handed to ``writecsv``
    (``gbrt.csv`` / ``erf.csv`` / ``xgb.csv`` below ``finalResultPath``).

    Parameters:
        embedDim: embedding dimension forwarded to the model builders.
        interval: interval forwarded to the model builders.
        distance: prediction length; also the length of the accumulated
            prediction vectors.

    Returns:
        None.  Output is written as PNG figures and CSV rows.
    """
    artistObjectFile = os.path.join(utils.allResultPath,
                                    'artistsObjectDict.pkl')
    # Close the handle deterministically instead of leaking it.
    # NOTE: unpickling executes code from the file -- only load trusted data.
    with open(artistObjectFile, 'r') as pickleFile:
        artistsObjectDict = cPickle.load(pickleFile)

    plt.figure(figsize=(6, 4))
    artistNum = 0
    for artistId, artist in artistsObjectDict.items():
        artistNum += 1
        print(artistNum)
        savePath = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(savePath):
            os.makedirs(savePath)

        # The SVR (1) and random-forest (2) variants are disabled.
        yPredictSum3 = np.zeros(distance)
        yPredictSum4 = np.zeros(distance)
        yPredictSum5 = np.zeros(distance)

        for songId, song in artist.getSongsOwned().items():
            traceLength = np.array(song.getTrace()).shape[1]
            trainLength = pp.XTrainLength(traceLength, embedDim, interval,
                                          distance)
            lengths = str(traceLength) + ' ' + str(trainLength)

            if trainLength < 8:
                # Training set shorter than 8: the song is not skipped but
                # predicted with the iterated models.
                print('iterated ' + lengths)
                predict = itergenModel
                gbrt, erf, xgb = itergbrtModel, itererfModel, iterxgbModel
            else:
                print('direct ' + lengths)
                predict = genModel
                gbrt, erf, xgb = gbrtModel, erfModel, xgbModel

            # GBRT model
            yPredict3 = predict(artist, song, gbrt, embedDim, interval,
                                distance)
            # Extra-random-forest model
            yPredict4 = predict(artist, song, erf, embedDim, interval,
                                distance)
            # xgboost model
            yPredict5 = predict(artist, song, xgb, embedDim, interval,
                                distance)

            plotResult(yPredict3, yPredict4, yPredict5)
            plt.savefig(os.path.join(savePath, 'song ' + songId + ".png"))
            plt.clf()

            yPredictSum3 += yPredict3
            yPredictSum4 += yPredict4
            yPredictSum5 += yPredict5

        plotResult(yPredictSum3, yPredictSum4, yPredictSum5)
        plt.savefig(os.path.join(savePath, 'artist ' + artistId + ".png"))
        plt.clf()

        finalResultFile3 = os.path.join(finalResultPath, 'gbrt.csv')
        writecsv(finalResultFile3, artistId, yPredictSum3)
        finalResultFile4 = os.path.join(finalResultPath, 'erf.csv')
        writecsv(finalResultFile4, artistId, yPredictSum4)
        finalResultFile5 = os.path.join(finalResultPath, 'xgb.csv')
        writecsv(finalResultFile5, artistId, yPredictSum5)