def calculate_best(method, sector, train_begin, train_end, validation_begin, validation_end, doff=True):
    """Train and validate the internal-only and exogenous predictors for one sector.

    :param method: regression method identifier forwarded to ML_predict
    :param sector: sector identifier forwarded to data_processing.prepare
    :param train_begin: start of the training window
    :param train_end: end of the training window
    :param validation_begin: start of the validation window
    :param validation_end: end of the validation window
    :param doff: when True, also run the internal-data-only model before the
        exogenous one (default True)
    :return: (preds, exog_preds, ident, ident2) — internal predictions and
        identifier are None when doff is False
    """
    # preprocessing read/parametrize/embed/align TRAIN
    train_aligned = data_processing.prepare(sector, train_begin, train_end)
    validation_aligned = data_processing.prepare(sector, validation_begin,
                                                 validation_end)
    # prepare() returns (exogenous frame, internal series) — index 0 is the
    # exogenous part (NaNs zero-filled), index 1 the target series.
    train = train_aligned[1]
    xtrain = train_aligned[0].fillna(0)
    validation = validation_aligned[1]
    xvalidation = validation_aligned[0].fillna(0)
    lags = 168  # one week of hourly lags for the internal series
    lagsx = 5   # exogenous lag depth
    # BUG FIX: preds/ident are referenced in the return statement below but
    # were only assigned inside the `if doff:` branch, so calling with
    # doff=False raised NameError. Default them explicitly.
    preds = None
    ident = None
    if doff:
        e, model, res, preds, ident = ML_predict(train, lags, 1, method=method)
        print("MEAN: %f" % e)
        x_validation, y_validation = data_processing.stride_data(
            validation, lags)
        valid_preds = continous_predictor(model, x_validation, y_validation)
        print("VALIDATION: %f" % utils.RMSE(y_validation, valid_preds)
              )  # todo: Investigate validation step, raises high RMSE
        valid_preds = np.array(valid_preds).reshape(-1, 1)
        # res = y - valid_preds
    # Exogenous model: same internal lags plus lagged external data.
    e, model, res, exog_preds, ident2 = ML_predict(train, lags, 1, method,
                                                   xtrain, lagsx)
    print("MEAN: %f" % e)
    x_validation, y_validation = data_processing.exog_stride_data(
        validation, lags, xvalidation, lagsx)
    exog_valid_preds = exog_continous_predictor(model, x_validation,
                                                y_validation, lags)
    print("VALIDATION: %f:" % utils.RMSE(y_validation, exog_valid_preds))
    ident2 += 'g'  # mark the identifier as the exogenous ("g"eolocated) variant
    return preds, exog_preds, ident, ident2
def exercicio2():
    """Exercise 2: k-fold cross-validated GRNN on the concrete dataset.

    For each of 4 folds, tunes the GRNN smoothing parameter sigma on a
    held-out validation slice of the training folds, plots the RMSE-vs-sigma
    curve, and evaluates the best sigma on the test fold. Ends with exit().
    """
    utils.print_header(2)
    np.random.seed(constants.SEED)
    x, y = load_concrete(os.path.join(constants.DATA_DIR,
                                      constants.FILENAME_CONCRETE_DATABASE),
                         standardization=True)
    n_folds = 4
    n_samples = x.shape[0]
    indices = np.arange(n_samples)
    # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement and is equivalent here.
    fold_sizes = (n_samples // n_folds) * np.ones(n_folds, dtype=int)
    # Distribute the remainder: the first (n_samples % n_folds) folds get
    # one extra sample so every sample lands in exactly one fold.
    fold_sizes[:n_samples % n_folds] += 1
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        folds.append({
            'x': x[indices[start:stop]],
            'y': y[indices[start:stop]],
        })
        current = stop
    sigmas = [0.01] + list(np.arange(0.05, 0.55, 0.05))
    val_perc = 0.2  # a percentage of the train data will be used for validation
    for k in range(n_folds):
        # FIX: the original passed a generator of one-element lists to
        # np.hstack (deprecated/invalid sequence input on modern NumPy) and
        # then squeezed the spurious leading axis away. Concatenating the
        # fold arrays along axis 0 produces the same final shapes directly.
        x_train = np.concatenate(
            [folds[(k + 1 + i) % n_folds]['x'] for i in range(n_folds - 1)],
            axis=0)
        y_train = np.concatenate(
            [folds[(k + 1 + i) % n_folds]['y'] for i in range(n_folds - 1)],
            axis=0)
        x_test, y_test = folds[k]['x'], folds[k]['y']
        print('Choosing Sigma...')
        n_val = int(round(x_train.shape[0] * val_perc))
        sigma_scores = {}
        for s in sigmas:
            # Predict each validation sample from the remaining train samples.
            y_pred = [GRNN(train_sample, x_train[n_val:, :], y_train[n_val:], s)
                      for train_sample in x_train[:n_val, :]]
            sigma_scores[s] = utils.RMSE(y_train[:n_val], y_pred)
            print('\tSigma={:.2f} -> RMSE={:.2f}'.format(s, sigma_scores[s]))
        # Index into `sigmas` of the sigma with the lowest validation RMSE.
        best_sigma = np.argmin([sigma_scores[s] for s in sigmas])
        plt.plot(sigmas, [sigma_scores[s] for s in sigmas])
        plt.title(r'Fold {}, Best $\sigma$={}'.format(k + 1, sigmas[best_sigma]))
        plt.ylabel('RMSE')
        plt.xlabel(r'$\sigma$')
        plot_fname = os.path.join(constants.OUTPUT_DIR,
                                  'exercicio2-fold-{}.pdf'.format(k + 1))
        plt.savefig(plot_fname, bbox_inches='tight')
        plt.show()
        # Final evaluation on the held-out fold with the tuned sigma.
        y_pred = [GRNN(test_sample, x_train, y_train, sigmas[best_sigma])
                  for test_sample in x_test]
        print('Test using best sigma={} -> RMSE={:.2f}'.format(
            sigmas[best_sigma], utils.RMSE(y_test, y_pred)))
    exit()
def exercicio5():
    """Exercise 5: linear regression and correlation tests on the runner data.

    Fits a linear model of race time vs. year, plots it with a prediction for
    2016, then tests the null hypothesis of no association with Kendall's
    tau-b and Pearson's r at the 95% and 99% levels. Ends with exit().
    """
    utils.print_header(5)
    years, times = load_runner(
        os.path.join(constants.DATA_DIR, constants.FILENAME_RUNNER_DATABASE))
    N = years.shape[0]  # sample size, closed over by the two rejection tests below
    f, w0_hat, w1_hat = utils.linear_model(years, times)
    y_pred = np.array([f(year) for year in years])
    tau_b = utils.KendallTauB(years, times)
    p = utils.Pearson(years, times)

    # Slide 59, Aula 4
    # Normal-approximation z-test for Kendall's tau: reject when |tau| exceeds
    # z(alpha) times the standard deviation of tau under independence.
    def reject_kendall(tau, alpha):
        return abs(tau) > utils.get_z(alpha) * np.sqrt(
            (2 * (2 * N + 5)) / (9 * N * (N - 1)))

    # Slide 52, Aula 4
    # t-test for Pearson's r with N-2 degrees of freedom (two-sided: alpha/2).
    def reject_pearson(p, alpha):
        return abs((p * np.sqrt(N - 2)) / (np.sqrt(1 - (p**2)))) > utils.t_student(N - 2, alpha / 2)

    print('a)')
    print('\tLinear equation: {:.3f} {} {:.3f}x'.format(
        w0_hat, '+' if w1_hat >= 0 else '-', abs(w1_hat)))
    print('\tRMSE: {:.3f}'.format(utils.RMSE(y_pred, times)))
    plt.scatter(years, times, linewidths=0)
    plt.plot(years, f(years), c='r')
    # Horizontal guide line and marker at the extrapolated 2016 prediction.
    plt.axhline(y=f(2016), color='g', linestyle='--')
    plt.scatter(2016, f(2016), c='g', linewidths=0)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio5-a.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()
    print('b)')
    print('\tPrediction for 2016: {:.3f} seconds'.format(f(2016)))
    print('c)')
    print('\tKendall\'s tau: {:.3f}'.format(tau_b))
    print('\tNull hypothesis rejected:\n\t- 95%: {}\n\t- 99%: {}'.format(
        reject_kendall(tau_b, 0.05), reject_kendall(tau_b, 0.01)))
    print('d)')
    print('\tPearson correlation coefficient: {:.3f}'.format(p))
    # The significance test is only reported when the correlation is strong.
    if abs(p) > 0.85:
        print(
            '\t|p| > 0.85 and null hypothesis rejected:\n\t- 95%: {}\n\t- 99%: {}'
            .format(reject_pearson(p, 0.05), reject_pearson(p, 0.01)))
    exit()
def exercicio6():
    """Exercise 6: depth-2 decision-tree regression on the servo dataset.

    Loads the data, shuffles it reproducibly, fits a shallow regression tree
    on the training split, reports RMSE/MAPE on the test split, and renders
    the fitted tree. Ends with exit().
    """
    utils.print_header(6)
    np.random.seed(constants.SEED)
    db_path = os.path.join(constants.DATA_DIR,
                           constants.FILENAME_SERVO_DATABASE)
    data = load_servo(db_path, to_float=False)
    np.random.shuffle(data)
    train_data, test_data = utils.train_test_split(data)
    # Last column is the regression target; the rest are features.
    clf = utils.DecisionTreeRegressor(max_depth=2, min_samples_split=2)
    clf.fit(train_data[:, :-1], train_data[:, -1])
    y_true = test_data[:, -1]
    y_pred = clf.predict(test_data[:, :-1])
    print('\tRMSE: {:.2f}'.format(utils.RMSE(y_true, y_pred)))
    print('\tMAPE: {:.2f}%'.format(utils.MAPE(y_true, y_pred)))
    clf.show()
    exit()
def ML_external(train_data, lag_size, folds, regressor, exog_data, lagsx):
    """
    Predicts time-series using internal and external geolocated data
    :param train_data: Train/test internal data
    :param lag_size: size of the lag for internal data
    :param folds: Number of folds in CV process
    :param regressor: Defined regressor
    :param exog_data: Train/Test external data
    :param lagsx: size of the lag for external data
    :return: mean error, trained model, resiudals, prediction and (temporary) internal identifier
    """
    error = []
    for k in range(folds):
        # NOTE(review): leaveweek() is called with the same fixed offset (14)
        # on every iteration, so all "folds" use an identical split until the
        # CV todo below is addressed.
        train, test, ident = data_processing.leaveweek(train_data,
                                                       14)  #todo: CV
        exog_train, exog_test, exog_ident = data_processing.leaveweek(
            exog_data, 14)
        # Embed both internal and exogenous series into lagged feature rows.
        x_train, y_train = data_processing.exog_stride_data(
            train, lag_size, exog_train, lagsx)
        y_train = y_train.reshape(-1, 1)
        x_test, y_test = data_processing.exog_stride_data(
            test, lag_size, exog_test, lagsx)
        y_test = y_test.reshape(-1, 1)
        # NOTE(review): the regressor is re-fit in place each iteration; after
        # the loop it holds the last fold's fit.
        regressor.fit(x_train, y_train)
        preds = exog_continous_predictor(regressor, x_test, y_test, lag_size)
        single_error = utils.RMSE(y_test, preds)
        error.append(single_error)
        # Mean of the test series, used to express the RMSE as a percentage.
        x = np.mean(test).values[0]
        print("Relative error: %s:" % (str((single_error / x) * 100)) + "%")
        preds = np.array(preds).reshape(-1, 1)
        res = y_test - preds  # residuals of the last fitted fold
        stacked = np.hstack((y_test, preds))
        # NOTE(review): stacked_df is built but never used or returned.
        stacked_df = pd.DataFrame(data=stacked, columns=['org', 'pred'])
        # Re-attach the datetime index (offset by the lag) to the predictions.
        preds = pd.DataFrame(preds, index=test.index[lag_size:]
                             )  # todo: find another way for datetime embedding
    # res/preds/ident come from the final loop iteration only.
    return np.mean(error), regressor, res, preds, ident
def train(self, ratings, maxiter=50):
    """Fit the factor model on a ratings record array with alternating updates.

    :param ratings: array whose columns are (user, item, rating); column 2
        supplies the ratings, columns 0-1 the (user, item) pairs for RMSE.
    :param maxiter: maximum number of alternating-update epochs (default 50)
    :return: self, so calls can be chained
    """
    self.__meanrating = np.mean(ratings[:, 2])
    # TODO this shall be sparse matrix
    self.__ratings_csr = utils.record2matrix(record=ratings,
                                             nusers=self.nusers,
                                             nitems=self.nitems)
    self.__ratings_csc = self.__ratings_csr.tocsc()
    lastRMSE = None
    for i in range(maxiter):
        self.__updateitemparams()
        self.__updateuserparams()
        self.__updateitemfeatures()
        self.__updateuserfeatures()
        # Compute RMSE
        preds = self.predict(ratings[:, :2])
        newRMSE = utils.RMSE(preds, ratings[:, 2])
        if lastRMSE and self.verbose:
            print('RMSE of {iter}th epoch: {rmse}'.format(iter=i, rmse=newRMSE))
        # BUG FIX: the original code `continue`d right after the verbose
        # print, so the convergence test below never executed when
        # verbose was truthy and training always ran all maxiter epochs.
        # The check now runs unconditionally.
        if lastRMSE and np.abs(newRMSE - lastRMSE) < self.tolerance:
            print('Converge with RMSE: {rmse}'.format(rmse=newRMSE))
            break
        lastRMSE = newRMSE
    else:
        # for/else: reached only when the loop exhausted maxiter without break.
        if self.verbose:
            print('Train stop. {reason}'.format(reason='Maximum Iteration!'))
    # I always think the following code is amazing
    return self
def predict(self,inputmatrix):
    '''
    Predict the recommendation values based on the user-user and item-item similarity

    Runs two independent self-consistent refinement loops — one user-based,
    one item-based — each repeated until the iteration-to-iteration RMSE
    improvement drops below self.tolerance (or once, if selfconsistence is
    off). Returns the pair (user-based prediction, item-based prediction).
    NOTE(review): written for Python 2 (print statements).
    '''
    # Optionally work on a copy so the caller's matrix is not mutated.
    if self.copy:
        data = inputmatrix.copy()
    else:
        data = inputmatrix
    if sparse.issparse(data):
        # TODO The algorithm will support Sparse Matrix in the future
        data = data.toarray()
    # x != x is True exactly where x is NaN; fill missing entries.
    nanpos = np.where(data!=data)
    data[nanpos] = self.nanvalue
    drmse = None      # RMSE improvement between consecutive iterations
    lastrmse = None
    userdata = data
    index = 0
    # --- user-based self-consistent loop ---
    while (drmse is None) or (drmse > self.tolerance):
        # Mean-center each user's row before mixing in neighbours.
        meanusermatrix = np.mean(userdata,axis=1)
        usermatrix = userdata - meanusermatrix[:,np.newaxis]
        # Singular point
        # Weighted average of centered neighbour rows, normalized by the
        # total absolute similarity, then shifted back by the user means.
        userpred = meanusermatrix[:,np.newaxis]\
            + self.usersimilarity.dot(usermatrix)/np.array([np.abs(self.usersimilarity).sum(axis=1)]).T
        #userpred = self.usersimilarity.dot(userdata)/np.array([np.abs(self.usersimilarity).sum(axis=1)]).T
        if not self.selfconsistence:
            break  # single pass only
        if lastrmse is None:
            # First pass: establish the baseline RMSE, then iterate again.
            lastrmse = utils.RMSE(userdata,userpred)
            userdata = userpred
            continue
        index += 1
        currentrmse = utils.RMSE(userdata,userpred)
        drmse = lastrmse - currentrmse  # positive while still improving
        lastrmse = currentrmse
        userdata = userpred
        if self.verbose:
            print '{index}th Iteration -> SCR User-Prediction with RMSE: {rmse}'.format(index=index,rmse=currentrmse)
    # --- item-based self-consistent loop (same scheme, column direction) ---
    drmse = None
    lastrmse = None
    itemdata = data
    index = 0
    while (drmse is None) or (drmse > self.tolerance):
        # NOTE(review): unlike the user loop, no mean-centering here; the
        # normalizer is the row-sum of |itemsimilarity| — confirm intended.
        itempred = itemdata.dot(self.itemsimilarity)/np.array([np.abs(self.itemsimilarity).sum(axis=1)])
        if not self.selfconsistence:
            break
        if lastrmse is None:
            lastrmse = utils.RMSE(itemdata,itempred)
            itemdata = itempred
            continue
        index += 1
        currentrmse = utils.RMSE(itemdata,itempred)
        drmse = lastrmse - currentrmse
        lastrmse = currentrmse
        itemdata = itempred
        if self.verbose:
            print '{index}th Iteration -> SCR Item-Prediction with RMSE: {rmse}'.format(index=index,rmse=currentrmse)
    return userpred,itempred
def exercicio6():
    """Exercise 6: polynomial regression on the polinomio dataset.

    (a) fits and evaluates a plain linear model; (b) tunes the polynomial
    degree on a validation split via R^2, refits on the full training split
    and evaluates; (c) refits the best degree robustly with RANSAC and plots
    the detected outliers. Ends with exit().
    """
    utils.print_header(6)
    np.random.seed(constants.SEED)  # for reproducibility
    data = load_polinomio(
        os.path.join(constants.DATA_DIR, constants.FILENAME_POLINOMIO_DATABASE))
    # Kept before shuffling: x-range used later for smooth curve plotting.
    x_min, x_max = data[:, 0].min(), data[:, 0].max()
    np.random.shuffle(data)
    # 70/30 train/test split.
    train = data[:np.round(data.shape[0] * 0.7).astype(int), :]
    test = data[np.round(data.shape[0] * 0.7).astype(int):, :]
    print('a)')
    f, w0, w1 = utils.linear_model(train[:, 0], train[:, 1])
    print('\tLinear equation: {:.3f} {} {:.3f}x'.format(
        w0, '+' if w1 >= 0 else '-', abs(w1)))
    y_pred_train = f(train[:, 0])
    y_pred_test = f(test[:, 0])
    print('\tTrain -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_train, train[:, 1]),
        utils.MAPE(y_pred_train, train[:, 1])))
    print('\tTest -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_test, test[:, 1]),
        utils.MAPE(y_pred_test, test[:, 1])))
    a = plt.scatter(train[:, 0], train[:, 1], c='g', linewidths=0)
    b = plt.scatter(test[:, 0], test[:, 1], c='b', linewidths=0)
    plt.plot(train[:, 0], f(train[:, 0]), c='k')
    plt.legend((a, b), ('train', 'test'), loc='best', fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-a.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()
    print('b)')
    # Secondary 70/30 split of the training data for degree tuning.
    x_train_train = train[:np.round(train.shape[0] * 0.7).astype(int), :]
    x_train_val = train[np.round(train.shape[0] * 0.7).astype(int):, :]
    scores = {}
    n_start, n_end = 1, 10
    for n in range(n_start, n_end + 1):
        # Ordinary least squares via the normal equations for degree n.
        x_p = utils.x_polynomial(x_train_train[:, 0], n)
        w_hat = np.linalg.inv(x_p.T.dot(x_p)).dot(x_p.T).dot(x_train_train[:,
                                                                           1])
        y_pred = utils.x_polynomial(x_train_val[:, 0], n).dot(w_hat)
        scores[n] = {
            'RMSE': utils.RMSE(y_pred, x_train_val[:, 1]),
            'MAPE': utils.MAPE(y_pred, x_train_val[:, 1]),
            'R_2': utils.R_2(y_pred, x_train_val[:, 1]),
        }
    # Tuning curve: RMSE (left axis) and R^2 (right axis) vs. degree.
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    a = ax1.plot(list(range(n_start, n_end + 1)),
                 [scores[n]['RMSE'] for n in scores.keys()],
                 c='g', label='RMSE')
    b = ax2.plot(list(range(n_start, n_end + 1)),
                 [scores[n]['R_2'] for n in scores.keys()],
                 c='r', label=r'R$^2$')
    lns = a + b
    ax1.legend(lns, [l.get_label() for l in lns], loc='best', fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-b-tuning.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()
    # Pick the degree with the highest validation R^2.
    r_2 = np.array([[n, scores[n]['R_2']] for n in scores.keys()])
    n_best = int(r_2[r_2[:, 1].argsort()[::-1]][0, 0])
    # Refit the chosen degree on the full training split.
    x_train_p = utils.x_polynomial(train[:, 0], n_best)
    x_test_p = utils.x_polynomial(test[:, 0], n_best)
    w_hat = np.linalg.inv(x_train_p.T.dot(x_train_p)).dot(x_train_p.T).dot(
        train[:, 1])
    y_pred_train = x_train_p.dot(w_hat)
    y_pred_test = x_test_p.dot(w_hat)
    print('\tTuning:\n\t\tBest N [{}-{}]: {}\n\t\tR^2: {:.3f}'.format(
        n_start, n_end, n_best, scores[n_best]['R_2']))
    print('\tParams: {}'.format(w_hat))
    print('\tR^2: train({:.3f}), test({:.3f})'.format(
        utils.R_2(y_pred_train, train[:, 1]),
        utils.R_2(y_pred_test, test[:, 1])))
    print('\tTrain -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_train, train[:, 1]),
        utils.MAPE(y_pred_train, train[:, 1])))
    print('\tTest -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_test, test[:, 1]),
        utils.MAPE(y_pred_test, test[:, 1])))
    # plot
    a = plt.scatter(train[:, 0], train[:, 1], c='g', linewidths=0)
    b = plt.scatter(test[:, 0], test[:, 1], c='b', linewidths=0)
    # Smooth fitted curve over the original x-range.
    plt.plot(np.arange(x_min, x_max, 0.1),
             utils.x_polynomial(np.arange(x_min, x_max, 0.1),
                                n_best).dot(w_hat),
             c='k')
    plt.legend((a, b), ('train', 'test'), loc='best', fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-b.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()
    print('c)')
    # Robust refit of the same degree with RANSAC; also returns the outliers.
    w_hat, outliers = utils.RANSAC(train[:, 0], train[:, 1], n=n_best, tau=10,
                                   seed=constants.SEED)
    x_train_p = utils.x_polynomial(train[:, 0], n_best)
    x_test_p = utils.x_polynomial(test[:, 0], n_best)
    y_pred_train = x_train_p.dot(w_hat)
    y_pred_test = x_test_p.dot(w_hat)
    print('\tTrain -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_train, train[:, 1]),
        utils.MAPE(y_pred_train, train[:, 1])))
    print('\tTest -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_test, test[:, 1]),
        utils.MAPE(y_pred_test, test[:, 1])))
    # plot
    plt.plot(np.arange(x_min, x_max, 0.1),
             utils.x_polynomial(np.arange(x_min, x_max, 0.1),
                                n_best).dot(w_hat),
             c='k')
    a = plt.scatter(train[:, 0], train[:, 1], c='g', linewidths=0)
    b = plt.scatter(test[:, 0], test[:, 1], c='b', linewidths=0)
    c = plt.scatter(outliers[0], outliers[1], c='r', linewidths=0)
    plt.legend((a, b, c), ('train', 'test', 'train_outliers'), loc='best',
               fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-c.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()
    exit()
json_path), "No JSON configuration file found at {}".format(json_path) params = utils.Params(json_path) trainloader, testloader, ddv_list = datautils.fetch_noniid_dataloader( params) # ddv_list = [] # sum_list = [0 for _ in range(params.n_users)] # for i in range(params.n_users): # dist = [1 for _ in range(10)] # for _, target in trainloader[i]: # for elem in target: # dist[int(elem)-1] += 1 # total = sum(dist) # for k in range(10): # dist[k] = dist[k] / total # ddv_list.append(np.array(dist)) # # sum_list[i] = sum(dist) # # print(sum_list) # print(ddv_list) # I = np.array([0.1 for _ in range(10)]) print("I: {}".format(I)) for p in ddv_list: # print("D(I||p): ".format(np.sum(I * np.log(I / p))), end=" / ") KLD = utils.KL_divergence(p, I) IID_dist = utils.RMSE(p, I) print("D(p||I): {}".format(KLD)) print("IID proximity: {}".format(IID_dist)) print()