Esempio n. 1
0
def calculate_best(method,
                   sector,
                   train_begin,
                   train_end,
                   validation_begin,
                   validation_end,
                   doff=True):
    """Train and validate predictors for one sector, with and without
    exogenous data.

    :param method: regression method forwarded to ML_predict
    :param sector: sector identifier forwarded to data_processing.prepare
    :param train_begin: start of the training period
    :param train_end: end of the training period
    :param validation_begin: start of the validation period
    :param validation_end: end of the validation period
    :param doff: when True, also fit/validate the endogenous-only model
    :return: (preds, exog_preds, ident, ident2); preds and ident are None
        when doff is False
    """
    # preprocessing read/parametrize/embed/align TRAIN
    train_aligned = data_processing.prepare(sector, train_begin, train_end)
    validation_aligned = data_processing.prepare(sector, validation_begin,
                                                 validation_end)

    train = train_aligned[1]
    xtrain = train_aligned[0].fillna(0)

    validation = validation_aligned[1]
    xvalidation = validation_aligned[0].fillna(0)

    # Lag sizes for the endogenous series and the exogenous regressors.
    lags = 168
    lagsx = 5

    # BUG FIX: preds and ident were only assigned inside the `if doff:`
    # branch but are always referenced in the return statement, raising
    # UnboundLocalError when doff=False; default them up front.
    preds, ident = None, None

    if doff:
        e, model, res, preds, ident = ML_predict(train, lags, 1, method=method)
        print("MEAN: %f" % e)

        x_validation, y_validation = data_processing.stride_data(
            validation, lags)

        valid_preds = continous_predictor(model, x_validation, y_validation)

        print("VALIDATION: %f" % utils.RMSE(y_validation, valid_preds)
              )  #todo: Investigate validation step, raises high RMSE
        valid_preds = np.array(valid_preds).reshape(-1, 1)

    e, model, res, exog_preds, ident2 = ML_predict(train, lags, 1, method,
                                                   xtrain, lagsx)
    print("MEAN: %f" % e)

    x_validation, y_validation = data_processing.exog_stride_data(
        validation, lags, xvalidation, lagsx)

    exog_valid_preds = exog_continous_predictor(model, x_validation,
                                                y_validation, lags)

    print("VALIDATION: %f:" % utils.RMSE(y_validation, exog_valid_preds))

    # 'g' marks the exogenous (geolocated) variant of the identifier.
    ident2 += 'g'
    return preds, exog_preds, ident, ident2
def exercicio2():
    """Exercise 2: k-fold cross-validated GRNN regression on the concrete
    dataset, tuning the kernel bandwidth sigma on a validation split of the
    training folds; saves one RMSE-vs-sigma plot per fold.
    """
    utils.print_header(2)
    np.random.seed(constants.SEED)
    x, y = load_concrete(os.path.join(constants.DATA_DIR, constants.FILENAME_CONCRETE_DATABASE), standardization=True)
    n_folds = 4
    n_samples = x.shape[0]

    # Split the sample indices into n_folds contiguous folds; the first
    # (n_samples % n_folds) folds get one extra sample.
    indices = np.arange(n_samples)
    # BUG FIX: np.int was removed in NumPy 1.24 -- use the builtin int.
    fold_sizes = (n_samples // n_folds) * np.ones(n_folds, dtype=int)
    fold_sizes[:n_samples % n_folds] += 1
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        folds.append({
            'x': x[indices[start:stop]],
            'y': y[indices[start:stop]],
        })
        current = stop

    sigmas = [0.01] + list(np.arange(0.05, 0.55, 0.05))
    val_perc = 0.2  # a percentage of the train data will be used for validation
    for k in range(n_folds):
        # BUG FIX: np.hstack was fed a *generator* of single-element lists,
        # which modern NumPy rejects; concatenate the other folds explicitly
        # (same result as the old hstack-and-squeeze trick).
        x_train = np.concatenate(
            [folds[(k + 1 + i) % n_folds]['x'] for i in range(n_folds - 1)],
            axis=0).squeeze()
        y_train = np.concatenate(
            [folds[(k + 1 + i) % n_folds]['y'] for i in range(n_folds - 1)],
            axis=0).squeeze()
        x_test, y_test = folds[k]['x'], folds[k]['y']

        print('Choosing Sigma...')
        # Hold out the first n_val training rows for sigma selection.
        n_val = int(round(x_train.shape[0] * val_perc))
        sigma_scores = {}
        for s in sigmas:
            y_pred = [GRNN(train_sample, x_train[n_val:, :], y_train[n_val:], s) for train_sample in x_train[:n_val, :]]
            sigma_scores[s] = utils.RMSE(y_train[:n_val], y_pred)
            print('\tSigma={:.2f} -> RMSE={:.2f}'.format(s, sigma_scores[s]))
        # Index (into sigmas) of the bandwidth with the lowest validation RMSE.
        best_sigma = np.argmin([sigma_scores[s] for s in sigmas])
        plt.plot(sigmas, [sigma_scores[s] for s in sigmas])
        plt.title(r'Fold {}, Best $\sigma$={}'.format(k+1, sigmas[best_sigma]))
        plt.ylabel('RMSE')
        plt.xlabel(r'$\sigma$')
        plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio2-fold-{}.pdf'.format(k + 1))
        plt.savefig(plot_fname, bbox_inches='tight')
        plt.show()

        # Evaluate on the held-out fold with the chosen bandwidth.
        y_pred = [GRNN(test_sample, x_train, y_train, sigmas[best_sigma]) for test_sample in x_test]
        print('Test using best sigma={} -> RMSE={:.2f}'.format(sigmas[best_sigma], utils.RMSE(y_test, y_pred)))
    exit()
Esempio n. 3
0
def exercicio5():
    """Exercise 5: linear regression on the runner dataset plus Kendall and
    Pearson correlation significance tests; saves the regression plot.
    """
    utils.print_header(5)

    db_path = os.path.join(constants.DATA_DIR,
                           constants.FILENAME_RUNNER_DATABASE)
    years, times = load_runner(db_path)
    N = years.shape[0]

    # Ordinary least-squares line and its fitted values on the data years.
    f, w0_hat, w1_hat = utils.linear_model(years, times)
    fitted = np.array(list(map(f, years)))

    # Correlation statistics between year and winning time.
    tau = utils.KendallTauB(years, times)
    rho = utils.Pearson(years, times)

    # Slide 59, Aula 4
    def reject_kendall(tau_value, alpha):
        threshold = utils.get_z(alpha) * np.sqrt(
            (2 * (2 * N + 5)) / (9 * N * (N - 1)))
        return abs(tau_value) > threshold

    # Slide 52, Aula 4
    def reject_pearson(rho_value, alpha):
        statistic = abs((rho_value * np.sqrt(N - 2)) /
                        (np.sqrt(1 - (rho_value**2))))
        return statistic > utils.t_student(N - 2, alpha / 2)

    print('a)')
    sign = '+' if w1_hat >= 0 else '-'
    print('\tLinear equation: {:.3f} {} {:.3f}x'.format(w0_hat, sign,
                                                        abs(w1_hat)))
    print('\tRMSE: {:.3f}'.format(utils.RMSE(fitted, times)))
    # Scatter the data, draw the fitted line, and mark the 2016 prediction.
    plt.scatter(years, times, linewidths=0)
    plt.plot(years, f(years), c='r')
    plt.axhline(y=f(2016), color='g', linestyle='--')
    plt.scatter(2016, f(2016), c='g', linewidths=0)
    plt.tight_layout()
    plt.savefig(os.path.join(constants.OUTPUT_DIR, 'exercicio5-a.pdf'),
                bbox_inches='tight')
    plt.show()

    print('b)')
    print('\tPrediction for 2016: {:.3f} seconds'.format(f(2016)))

    print('c)')
    print('\tKendall\'s tau: {:.3f}'.format(tau))
    print('\tNull hypothesis rejected:\n\t- 95%: {}\n\t- 99%: {}'.format(
        reject_kendall(tau, 0.05), reject_kendall(tau, 0.01)))

    print('d)')
    print('\tPearson correlation coefficient: {:.3f}'.format(rho))
    if abs(rho) > 0.85:
        print(
            '\t|p| > 0.85 and null hypothesis rejected:\n\t- 95%: {}\n\t- 99%: {}'
            .format(reject_pearson(rho, 0.05), reject_pearson(rho, 0.01)))

    exit()
def exercicio6():
    """Exercise 6: shallow decision-tree regression on the servo dataset,
    reporting test RMSE and MAPE and printing the fitted tree.
    """
    utils.print_header(6)
    np.random.seed(constants.SEED)

    db_path = os.path.join(constants.DATA_DIR,
                           constants.FILENAME_SERVO_DATABASE)
    data = load_servo(db_path, to_float=False)
    np.random.shuffle(data)
    train_data, test_data = utils.train_test_split(data)

    # Every column but the last is a feature; the last column is the target.
    tree = utils.DecisionTreeRegressor(max_depth=2, min_samples_split=2)
    tree.fit(train_data[:, :-1], train_data[:, -1])
    predictions = tree.predict(test_data[:, :-1])

    print('\tRMSE: {:.2f}'.format(utils.RMSE(test_data[:, -1], predictions)))
    print('\tMAPE: {:.2f}%'.format(utils.MAPE(test_data[:, -1], predictions)))
    tree.show()
    exit()
Esempio n. 5
0
def ML_external(train_data, lag_size, folds, regressor, exog_data, lagsx):
    """
    Predicts time-series using internal and external geolocated data

    :param train_data: Train/test internal data
    :param lag_size: size of the lag for internal data
    :param folds: Number of folds in CV process
    :param regressor: Defined regressor
    :param exog_data: Train/Test external data
    :param lagsx: size of the lag for external data
    :return: mean error, trained model, residuals, prediction and (temporary)
        internal identifier. Residuals, predictions and identifier come from
        the last fold only; the error is averaged over all folds.
    """
    error = []
    for _ in range(folds):
        # NOTE(review): leaveweek() is called with a fixed offset, so every
        # "fold" currently evaluates the same split.  #todo: CV
        train, test, ident = data_processing.leaveweek(train_data, 14)
        exog_train, exog_test, _ = data_processing.leaveweek(exog_data, 14)

        x_train, y_train = data_processing.exog_stride_data(
            train, lag_size, exog_train, lagsx)
        y_train = y_train.reshape(-1, 1)

        x_test, y_test = data_processing.exog_stride_data(
            test, lag_size, exog_test, lagsx)
        y_test = y_test.reshape(-1, 1)

        regressor.fit(x_train, y_train)

        # Iterated one-step-ahead prediction over the test horizon.
        preds = exog_continous_predictor(regressor, x_test, y_test, lag_size)

        single_error = utils.RMSE(y_test, preds)
        error.append(single_error)
        # RMSE relative to the mean level of the test series, in percent.
        test_mean = np.mean(test).values[0]
        print("Relative error: %s:" % (str((single_error / test_mean) * 100)) + "%")
        preds = np.array(preds).reshape(-1, 1)
        res = y_test - preds

    # Re-attach the datetime index of the last fold's test window.
    preds = pd.DataFrame(preds, index=test.index[lag_size:]
                         )  # todo: find another way for datetime embedding
    return np.mean(error), regressor, res, preds, ident
Esempio n. 6
0
    def train(self,ratings,maxiter=50):
        self.__meanrating = np.mean(ratings[:,2])

        # TODO this shall be sparse matrix
        self.__ratings_csr = utils.record2matrix(record=ratings,nusers=self.nusers,nitems=self.nitems)
        self.__ratings_csc = self.__ratings_csr.tocsc()
        
        lastRMSE = None
        for i in range(maxiter):
            self.__updateitemparams()
            self.__updateuserparams()

            self.__updateitemfeatures()
            self.__updateuserfeatures()

            # Compute RMSE
            preds = self.predict(ratings[:,:2])
            newRMSE = utils.RMSE(preds,ratings[:,2])

            if lastRMSE and self.verbose:
                print 'RMSE of {iter}th epoch: {rmse}'.format(iter=i,rmse=newRMSE)
                lastRMSE = newRMSE
                continue

            if lastRMSE and np.abs(newRMSE - lastRMSE) < self.tolerance:
                print 'Converge with RMSE: {rmse}'.format(rmse=newRMSE)
                break

            lastRMSE = newRMSE
        
        else:
            if self.verbose:
                print 'Train stop. {reason}'.format(reason='Maximum Iteration!')

        # I always think the following code is amazing
        return self
Esempio n. 7
0
    def predict(self,inputmatrix):
        '''
        Predict the recommendation values based on the user-user and item-item similarity

        :param inputmatrix: user-by-item rating matrix (dense array or scipy
            sparse -- sparse input is densified below); NaN entries mark
            missing ratings
        :return: tuple (userpred, itempred) -- the user-based and the
            item-based prediction matrices
        '''
        # Work on a copy when requested so the caller's matrix is untouched.
        if self.copy:
            data = inputmatrix.copy()
        else:
            data = inputmatrix

        if sparse.issparse(data):               # TODO The algorithm will support Sparse Matrix in the future
            data = data.toarray()

        # NaN != NaN, so this locates missing entries and fills them with the
        # configured placeholder value.
        nanpos = np.where(data!=data)
        data[nanpos] = self.nanvalue

        # ---- User-based pass ------------------------------------------------
        # Self-consistent refinement: feed each prediction back in as input
        # until the RMSE change between rounds drops to self.tolerance.
        drmse = None
        lastrmse = None
        userdata = data
        index = 0

        while (drmse is None) or (drmse > self.tolerance):
            # Mean-center each user's row, mix in neighbours' centered ratings
            # weighted by user-user similarity, then add the row mean back.
            meanusermatrix = np.mean(userdata,axis=1)
            usermatrix = userdata - meanusermatrix[:,np.newaxis]
            # Singular point
            userpred = meanusermatrix[:,np.newaxis]\
                + self.usersimilarity.dot(usermatrix)/np.array([np.abs(self.usersimilarity).sum(axis=1)]).T

            #userpred = self.usersimilarity.dot(userdata)/np.array([np.abs(self.usersimilarity).sum(axis=1)]).T

            # Single-shot mode: stop after the first projection.
            if not self.selfconsistence:
                break

            # First round only establishes the baseline RMSE.
            if lastrmse is None:
                lastrmse = utils.RMSE(userdata,userpred)
                userdata = userpred
                continue

            index += 1
            currentrmse = utils.RMSE(userdata,userpred)
            drmse = lastrmse - currentrmse
            lastrmse = currentrmse
            userdata = userpred

            if self.verbose:
                print '{index}th Iteration -> SCR User-Prediction with RMSE: {rmse}'.format(index=index,rmse=currentrmse)

        # ---- Item-based pass ------------------------------------------------
        # Same self-consistent scheme, weighted by item-item similarity.
        # NOTE(review): unlike the user pass, this one does not mean-center
        # the data -- presumably intentional, but worth confirming.
        drmse = None
        lastrmse = None
        itemdata = data
        index = 0
        while (drmse is None) or (drmse > self.tolerance):

            itempred = itemdata.dot(self.itemsimilarity)/np.array([np.abs(self.itemsimilarity).sum(axis=1)])

            if not self.selfconsistence:
                break

            # First round only establishes the baseline RMSE.
            if lastrmse is None:
                lastrmse = utils.RMSE(itemdata,itempred)
                itemdata = itempred
                continue

            index += 1
            currentrmse =  utils.RMSE(itemdata,itempred)
            drmse = lastrmse - currentrmse
            lastrmse = currentrmse
            itemdata = itempred


            if self.verbose:
                print '{index}th Iteration -> SCR Item-Prediction with RMSE: {rmse}'.format(index=index,rmse=currentrmse)

        return userpred,itempred
Esempio n. 8
0
def exercicio6():
    """Exercise 6: polynomial regression on the 'polinomio' dataset.

    a) fits a straight line and reports train/test RMSE and MAPE;
    b) tunes the polynomial degree N on a validation split of the training
       data (choosing the N with the best R^2), then refits on the full
       training set;
    c) refits the degree-N polynomial with RANSAC to reject outliers.
    Each step saves a plot under constants.OUTPUT_DIR and terminates the
    process via exit().
    """
    utils.print_header(6)
    np.random.seed(constants.SEED)  # for reproducibility
    data = load_polinomio(
        os.path.join(constants.DATA_DIR,
                     constants.FILENAME_POLINOMIO_DATABASE))
    x_min, x_max = data[:, 0].min(), data[:, 0].max()
    # Shuffle, then take a 70/30 train/test split.
    np.random.shuffle(data)
    train = data[:np.round(data.shape[0] * 0.7).astype(int), :]
    test = data[np.round(data.shape[0] * 0.7).astype(int):, :]

    print('a)')
    f, w0, w1 = utils.linear_model(train[:, 0], train[:, 1])
    print('\tLinear equation: {:.3f} {} {:.3f}x'.format(
        w0, '+' if w1 >= 0 else '-', abs(w1)))
    y_pred_train = f(train[:, 0])
    y_pred_test = f(test[:, 0])
    print('\tTrain -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_train, train[:, 1]),
        utils.MAPE(y_pred_train, train[:, 1])))
    print('\tTest -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_test, test[:, 1]),
        utils.MAPE(y_pred_test, test[:, 1])))
    # Scatter train (green) and test (blue) points with the fitted line.
    a = plt.scatter(train[:, 0], train[:, 1], c='g', linewidths=0)
    b = plt.scatter(test[:, 0], test[:, 1], c='b', linewidths=0)
    plt.plot(train[:, 0], f(train[:, 0]), c='k')
    plt.legend((a, b), ('train', 'test'), loc='best', fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-a.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()

    print('b)')
    # Further split the training set 70/30 into fit/validation portions
    # for degree selection.
    x_train_train = train[:np.round(train.shape[0] * 0.7).astype(int), :]
    x_train_val = train[np.round(train.shape[0] * 0.7).astype(int):, :]

    scores = {}
    n_start, n_end = 1, 10
    for n in range(n_start, n_end + 1):
        x_p = utils.x_polynomial(x_train_train[:, 0], n)
        # Normal equations: w = (X^T X)^-1 X^T y.
        # NOTE(review): explicit inversion is numerically fragile for high
        # degrees; np.linalg.solve/lstsq would be safer.
        w_hat = np.linalg.inv(x_p.T.dot(x_p)).dot(x_p.T).dot(x_train_train[:,
                                                                           1])
        y_pred = utils.x_polynomial(x_train_val[:, 0], n).dot(w_hat)

        scores[n] = {
            'RMSE': utils.RMSE(y_pred, x_train_val[:, 1]),
            'MAPE': utils.MAPE(y_pred, x_train_val[:, 1]),
            'R_2': utils.R_2(y_pred, x_train_val[:, 1]),
        }

    # Plot validation RMSE (left axis) and R^2 (right axis) vs. degree.
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    a = ax1.plot(list(range(n_start, n_end + 1)),
                 [scores[n]['RMSE'] for n in scores.keys()],
                 c='g',
                 label='RMSE')
    b = ax2.plot(list(range(n_start, n_end + 1)),
                 [scores[n]['R_2'] for n in scores.keys()],
                 c='r',
                 label=r'R$^2$')
    lns = a + b
    ax1.legend(lns, [l.get_label() for l in lns], loc='best', fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-b-tuning.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()

    # Pick the degree with the highest validation R^2.
    r_2 = np.array([[n, scores[n]['R_2']] for n in scores.keys()])
    n_best = int(r_2[r_2[:, 1].argsort()[::-1]][0, 0])

    # Refit the chosen degree on the whole training set (normal equations).
    x_train_p = utils.x_polynomial(train[:, 0], n_best)
    x_test_p = utils.x_polynomial(test[:, 0], n_best)
    w_hat = np.linalg.inv(x_train_p.T.dot(x_train_p)).dot(x_train_p.T).dot(
        train[:, 1])
    y_pred_train = x_train_p.dot(w_hat)
    y_pred_test = x_test_p.dot(w_hat)

    print('\tTuning:\n\t\tBest N [{}-{}]: {}\n\t\tR^2: {:.3f}'.format(
        n_start, n_end, n_best, scores[n_best]['R_2']))
    print('\tParams: {}'.format(w_hat))
    print('\tR^2: train({:.3f}), test({:.3f})'.format(
        utils.R_2(y_pred_train, train[:, 1]),
        utils.R_2(y_pred_test, test[:, 1])))
    print('\tTrain -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_train, train[:, 1]),
        utils.MAPE(y_pred_train, train[:, 1])))
    print('\tTest -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_test, test[:, 1]),
        utils.MAPE(y_pred_test, test[:, 1])))
    # plot
    a = plt.scatter(train[:, 0], train[:, 1], c='g', linewidths=0)
    b = plt.scatter(test[:, 0], test[:, 1], c='b', linewidths=0)
    # Draw the fitted polynomial over the full observed x range.
    plt.plot(np.arange(x_min, x_max, 0.1),
             utils.x_polynomial(np.arange(x_min, x_max, 0.1),
                                n_best).dot(w_hat),
             c='k')
    plt.legend((a, b), ('train', 'test'), loc='best', fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-b.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()

    print('c)')
    # Robust refit of the same degree: RANSAC discards points farther than
    # tau from the fitted model and returns them as outliers.
    w_hat, outliers = utils.RANSAC(train[:, 0],
                                   train[:, 1],
                                   n=n_best,
                                   tau=10,
                                   seed=constants.SEED)
    x_train_p = utils.x_polynomial(train[:, 0], n_best)
    x_test_p = utils.x_polynomial(test[:, 0], n_best)
    y_pred_train = x_train_p.dot(w_hat)
    y_pred_test = x_test_p.dot(w_hat)
    print('\tTrain -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_train, train[:, 1]),
        utils.MAPE(y_pred_train, train[:, 1])))
    print('\tTest -> RMSE: {:.3f}, MAPE: {:.3f}'.format(
        utils.RMSE(y_pred_test, test[:, 1]),
        utils.MAPE(y_pred_test, test[:, 1])))
    # plot
    plt.plot(np.arange(x_min, x_max, 0.1),
             utils.x_polynomial(np.arange(x_min, x_max, 0.1),
                                n_best).dot(w_hat),
             c='k')
    a = plt.scatter(train[:, 0], train[:, 1], c='g', linewidths=0)
    b = plt.scatter(test[:, 0], test[:, 1], c='b', linewidths=0)
    c = plt.scatter(outliers[0], outliers[1], c='r', linewidths=0)
    plt.legend((a, b, c), ('train', 'test', 'train_outliers'),
               loc='best',
               fontsize=10)
    plt.tight_layout()
    plot_fname = os.path.join(constants.OUTPUT_DIR, 'exercicio6-c.pdf')
    plt.savefig(plot_fname, bbox_inches='tight')
    plt.show()
    exit()
Esempio n. 9
0
        json_path), "No JSON configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    trainloader, testloader, ddv_list = datautils.fetch_noniid_dataloader(
        params)

    # ddv_list = []
    # sum_list = [0 for _ in range(params.n_users)]
    # for i in range(params.n_users):
    #     dist = [1 for _ in range(10)]
    #     for _, target in trainloader[i]:
    #         for elem in target:
    #             dist[int(elem)-1] += 1
    #     total = sum(dist)
    #     for k in range(10):
    #         dist[k] = dist[k] / total
    #     ddv_list.append(np.array(dist))
    #     # sum_list[i] = sum(dist)
    # # print(sum_list)
    # print(ddv_list)
    #
    I = np.array([0.1 for _ in range(10)])
    print("I: {}".format(I))

    for p in ddv_list:
        # print("D(I||p): ".format(np.sum(I * np.log(I / p))), end=" / ")
        KLD = utils.KL_divergence(p, I)
        IID_dist = utils.RMSE(p, I)
        print("D(p||I): {}".format(KLD))
        print("IID proximity: {}".format(IID_dist))
        print()