def save(DATASET, NUM_FOLD, ENSEMBLE, ALGORITHM, sparse=False):
    """Fit a model on one training fold and write its per-query evaluation to disk.

    The file ``./dataset/<DATASET>/Fold<NUM_FOLD>/Norm.train.txt`` is loaded
    with an all-ones mask of length ``NUM_GENES``, a model is fitted on it and
    then scored on that same training data. The second value returned by
    ``l2rCodesSerial.getEvaluation`` is written, one item per line, to
    ``./baselines/<DATASET>/Fold<NUM_FOLD>/<ALGORITHM>train.txt``.

    Args:
        DATASET: Dataset name. Must be one of '2003_td_dataset', 'web10k',
            'yahoo', 'movielens', 'lastfm', 'bibsonomy' or 'youtube'; it
            selects the mask length used when loading the file.
        NUM_FOLD: Fold identifier. Must be a ``str`` because it is
            concatenated into the input and output paths.
        ENSEMBLE: Not used by this function (the model is always requested
            with ensemble code 1). Kept so existing callers keep working.
        ALGORITHM: String prefix of the output file name.
        sparse: Forwarded unchanged to ``l2rCodesSerial.load_L2R_file``.

    Raises:
        ValueError: If ``DATASET`` is not a known dataset name. Previously the
            function only printed a message and then failed later with an
            unrelated ``TypeError`` while building the mask.
    """
    SEED = 1313
    NTREES = 300
    METRIC = 'NDCG'

    genes_per_dataset = {
        '2003_td_dataset': 64,
        'web10k': 136,
        'yahoo': 700,
        'movielens': 13,
        'lastfm': 13,
        'bibsonomy': 13,
        'youtube': 13,
    }
    NUM_GENES = genes_per_dataset.get(DATASET)
    if NUM_GENES is None:
        # Keep the original console message, but stop here instead of letting
        # "'1' * None" blow up with a confusing TypeError further down.
        print('DATASET INVĂLIDO')
        raise ValueError('unknown DATASET: %r' % (DATASET,))

    X_train, y_train, query_id_train = l2rCodesSerial.load_L2R_file(
        './dataset/' + DATASET + '/Fold' + NUM_FOLD + '/Norm.' + 'train' + '.txt', '1' * NUM_GENES, sparse)

    model = l2rCodesSerial.getTheModel(1, NTREES, 0.3, SEED, DATASET)
    model.fit(X_train, y_train)
    resScore = model.predict(X_train)

    # Copy the predictions into a plain list sized after y_train, exactly as
    # the original counter loop did.
    scoreTest = [0] * len(y_train)
    for position, score in enumerate(resScore):
        scoreTest[position] = score

    # The literal "test" is passed through as in the original call, even though
    # the scores come from the training data.
    ndcg, queries = l2rCodesSerial.getEvaluation(scoreTest, query_id_train, y_train, DATASET, METRIC, "test")

    out_path = './baselines/' + DATASET + '/Fold' + NUM_FOLD + '/' + ALGORITHM + 'train.txt'
    # The context manager guarantees the file is closed even if a write fails.
    with open(out_path, "w+") as f:
        f.writelines(str(value) + '\n' for value in queries)
def getPrecisionAndQueries(individuo, NUM_GENES, X_train, y_train, X_test, y_test, query_id_train, ENSEMBLE,
                           NTREES, SEED, DATASET, METRIC):
    """Fit a model on the columns selected by ``individuo`` and evaluate it on the training data.

    The first ``NUM_GENES`` positions of ``individuo`` are read as a mask:
    every position holding the character ``'1'`` selects the column with the
    same index from ``X_train``. A model obtained from
    ``l2rCodesSerial.getTheModel`` is fitted on those columns and then used to
    predict on the very same rows; the predictions are passed to
    ``l2rCodesSerial.getEvaluation``.

    NOTE: an earlier variant of this function (previously kept here as
    commented-out code) produced the scores with a 3-fold cross-validation
    split over queries. The current version fits and scores on the full
    training set instead.

    Args:
        individuo: Indexable sequence of characters used as the column mask.
        NUM_GENES: Number of mask positions to inspect.
        X_train: Training matrix; must support ``X_train[:, list_of_columns]``.
        y_train: Training targets; its length sizes the score list.
        X_test: Unused.
        y_test: Unused.
        query_id_train: Query ids of the training rows, forwarded to the evaluation.
        ENSEMBLE, NTREES, SEED, DATASET: Forwarded to ``l2rCodesSerial.getTheModel``.
        METRIC: Forwarded to ``l2rCodesSerial.getEvaluation``.

    Returns:
        A tuple ``(ndcg, queries, scoreTrain)``: the two values returned by
        ``l2rCodesSerial.getEvaluation`` followed by the list of predictions.
    """
    mask = list(individuo)
    selected_columns = [gene for gene in range(NUM_GENES) if mask[gene] == '1']
    X_selected = X_train[:, selected_columns]

    scoreTrain = [0] * len(y_train)

    estimator = l2rCodesSerial.getTheModel(ENSEMBLE, NTREES, 0.3, SEED, DATASET)
    estimator.fit(X_selected, y_train)
    predictions = estimator.predict(X_selected)

    # Copy the predictions position by position into the pre-sized list.
    for position, score in enumerate(predictions):
        scoreTrain[position] = score

    evaluation = l2rCodesSerial.getEvaluation(scoreTrain, query_id_train, y_train, DATASET, METRIC, "test")
    ndcg, queries = evaluation
    return ndcg, queries, scoreTrain
# NSGA best IND RF ENSEMBLE = 1 # random forest NTREES = 20 SEED = 1887 NUM_FOLD = '0' METRIC = "NDCG" sparse = True ALGORITHM = 'rf' print('reading and training NSGA bestind') X_train, y_train, query_id_train = l2rCodesSerial.load_L2R_file( './dataset/' + dataset + '/' + NUM_FOLD + '.' + 'train', bestindNSGA, sparse) X_test, y_test, query_id_test = l2rCodesSerial.load_L2R_file( './dataset/' + dataset + '/' + NUM_FOLD + '.' + 'test', bestindNSGA, sparse) # print(len(X_train[0])) model = l2rCodesSerial.getTheModel(ENSEMBLE, NTREES, 0.3, SEED, dataset) model.fit(X_train, y_train) resScore = model.predict(X_test) scoreTest = [0] * len(y_test) c = 0 for i in resScore: scoreTest[c] = i c = c + 1 ndcg, queries = l2rCodesSerial.getEvaluation(scoreTest, query_id_test, y_test, dataset, METRIC, "test") ndcgs.append(queries) diversitys.append(evaluateIndividuoSerial.getDiversity(scoreTest, y_test, query_id_test)) noveltys.append(evaluateIndividuoSerial.getNovelty(scoreTest, y_test, query_id_test)) # SPEA best IND RF