def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__), '../../data/length-of-stay/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='los_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32,
                                                                   'y_true': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32,
                                                                      'y_true': np.float32})

    df = test_df.merge(pred_df, on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    # Some stays may lack predictions; report how many y_true values disagree,
    # then drop the rows without a prediction instead of asserting an exact match.
    print((df['y_true_l'] != df['y_true_r']).sum())
    df = df.dropna(subset=['y_true_r'])

    metrics = [('Kappa', 'kappa'),
               ('MAD', 'mad'),
               ('MSE', 'mse'),
               ('MAPE', 'mape')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_regression(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_regression(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results)
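
For reference, a minimal self-contained sketch of the same percentile-bootstrap procedure on synthetic data (the names y_true and y_pred and the toy distributions are illustrative, not part of the benchmark; the column layout, prediction in column 0 and target in column 1, mirrors the script above):

import numpy as np
import sklearn.utils as sk_utils

rng = np.random.RandomState(0)
y_true = rng.exponential(scale=3.0, size=500)        # toy length-of-stay targets
y_pred = y_true + rng.normal(scale=1.0, size=500)    # toy noisy predictions
data = np.stack([y_pred, y_true], axis=1)

mse_runs = []
for _ in range(1000):
    cur = sk_utils.resample(data, n_samples=len(data))   # rows sampled with replacement
    mse_runs.append(np.mean((cur[:, 0] - cur[:, 1]) ** 2))

print("MSE 95% bootstrap CI:",
      np.percentile(mse_runs, 2.5), np.percentile(mse_runs, 97.5))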
Example n. 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str, default='../data/length-of-stay/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='los_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32,
                                                                   'y_true': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32,
                                                                      'y_true': np.float32})

    df = test_df.merge(pred_df, on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('Kappa', 'kappa'),
               ('MAD', 'mad'),
               ('MSE', 'mse'),
               ('MAPE', 'mape')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_regression(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_regression(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print "Saving the results in {} ...".format(args.save_file)
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print results
Example n. 3
    def calc_metrics(self, data_gen, history, dataset, logs):
        y_true = []
        predictions = []
        # The original per-batch loop (data_gen.getitem + model.predict per step)
        # was replaced by a single predict call over the whole generator below.
        pred = self.model.predict(data_gen,
                                  batch_size=self.batch_size,
                                  verbose=self.verbose,
                                  steps=data_gen.steps,
                                  workers=self.workers,
                                  use_multiprocessing=True)
        # The deep-supervision and shape-dependent flattening branches of the
        # per-batch variant (see Example n. 4) are not needed here; the
        # generator yields aligned predictions and targets directly.
        y = data_gen.get_y(len(pred))
        y_true += list(y)
        predictions += list(pred)

        print('\n')
        if self.partition == 'log':
            predictions = [
                metrics.get_estimate_log(x, 10) for x in predictions
            ]
            ret = metrics.print_metrics_log_bins(y_true, predictions)
        if self.partition == 'custom':
            predictions = [
                metrics.get_estimate_custom(x, 10) for x in predictions
            ]
            ret = metrics.print_metrics_custom_bins(y_true, predictions)
        if self.partition == 'none':
            ret = metrics.print_metrics_regression(y_true, predictions)
        for k, v in ret.items():
            logs[dataset + '_' + k] = v
        history.append(ret)
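
The get_estimate_log and get_estimate_custom calls above convert a predicted distribution over 10 length-of-stay bins back into a scalar estimate. A sketch of the idea, with hypothetical per-bin representative values (the real bin boundaries and means live in the benchmark's metrics module, not here):

import numpy as np

# Hypothetical representative LOS values per bin (days); placeholders only.
BIN_MEANS = np.array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 10.0, 20.0])

def get_estimate_sketch(probs, nbins=10):
    # Return the representative value of the most probable bin.
    probs = np.asarray(probs)[:nbins]
    return float(BIN_MEANS[int(np.argmax(probs))])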
Example n. 4
 def calc_metrics(self, data_gen, history, dataset, logs):
     y_true = []
     predictions = []
     for i in range(data_gen.steps):
         if self.verbose == 1:
             print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
         if self.use_time:
             ([x, t], y_processed, y) = data_gen.next(return_y_true=True)
             pred = self.model.predict([x, t], batch_size=self.batch_size)
         else:
             (x, y_processed, y) = data_gen.next(return_y_true=True)
             pred = self.model.predict(x, batch_size=self.batch_size)
         if isinstance(x, list) and len(x) == 2:  # deep supervision
             if pred.shape[-1] == 1:  # regression
                 pred_flatten = pred.flatten()
             else:  # classification
                 pred_flatten = pred.reshape((-1, 10))
             for m, t, p in zip(x[1].flatten(), y.flatten(), pred_flatten):
                 if np.equal(m, 1):
                     y_true.append(t)
                     predictions.append(p)
         else:
             if pred.shape[-1] == 1:
                 y_true += list(y.flatten())
                 predictions += list(pred.flatten())
             else:
                 y_true += list(y)
                 predictions += list(pred)
     print('\n')
     if self.partition == 'log':
         predictions = [
             metrics.get_estimate_log(x, 10) for x in predictions
         ]
         ret = metrics.print_metrics_log_bins(y_true, predictions)
     if self.partition == 'custom':
         predictions = [
             metrics.get_estimate_custom(x, 10) for x in predictions
         ]
         ret = metrics.print_metrics_custom_bins(y_true, predictions)
     if self.partition == 'none':
         ret = metrics.print_metrics_regression(y_true, predictions)
     for k, v in ret.items():
         logs[dataset + '_' + k] = v
     history.append(ret)
Example n. 5
            names += list(cur_names)
            ts += list(cur_ts)

    if stochastic:
        aleatoric = [np.mean(x * (1. - x), axis=0) for x in predictions]
        epistemic = [np.var(x, axis=0) for x in predictions]
        predictions = [np.mean(x, axis=0) for x in predictions]

    if args.partition == 'log':
        predictions = [metrics.get_estimate_log(x, 10) for x in predictions]
        metrics.print_metrics_log_bins(labels, predictions)
    if args.partition == 'custom':
        predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
        metrics.print_metrics_custom_bins(labels, predictions)
    if args.partition == 'none':
        metrics.print_metrics_regression(labels, predictions)
        predictions = [x[0] for x in predictions]

    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state)) + ".csv"

    if stochastic:
        ee = np.mean(np.array(epistemic))
        aa = np.mean(np.array(aleatoric))
        print("Epistemic uncertainty =", ee)
        print("Aleatoric uncertainty =", aa)
        print("Uncertainty =", ee + aa)
        utils.save_results(names,
                           ts,
                           predictions,
Example n. 6
        print "\n ================ decompensation ================"
        decomp_pred = np.array(decomp_pred)
        decomp_pred = np.stack([1-decomp_pred, decomp_pred], axis=1)
        decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)

    # los
    if args.los_C > 0:
        print("\n ================ length of stay ================")
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    # pheno
    if args.pheno_C > 0:
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    # TODO: save activations if needed

elif args.mode == 'test_single':
    # ensure that the code uses test_reader
    del train_reader
    del val_reader
    del train_data_gen
    del val_data_gen
Example n. 7
if not os.path.exists("activations"):
    os.mkdir("activations")

if not os.path.exists("results"):
    os.mkdir("results")

with open(os.path.join("results", "log_" + file_name + ".txt"),
          "w") as resfile:

    resfile.write("mad, mse, mape, kappa\n")

    print "Scores on train set"
    pred = linreg.predict(train_X)
    pred[pred > 8] = 8
    ret = metrics.print_metrics_regression(train_y, np.exp(pred) - 1)
    resfile.write("%.6f,%.6f,%.6f,%.6f\n" %
                  (ret['mad'], ret['mse'], ret['mape'], ret['kappa']))

    print "Scores on validation set"
    pred = linreg.predict(val_X)
    pred[pred > 8] = 8
    ret = metrics.print_metrics_regression(val_y, np.exp(pred) - 1)
    resfile.write("%.6f,%.6f,%.6f,%.6f\n" %
                  (ret['mad'], ret['mse'], ret['mape'], ret['kappa']))

############################### TESTING #############################
# predict on test
del train_reader
del val_reader
del train_X
Example n. 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of length-of-stay task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/length-of-stay/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'))

    val_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'))

    test_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names,
     train_ts) = read_and_extract_features(train_reader, n_train, args.period,
                                           args.features)

    (val_X, val_y, val_names,
     val_ts) = read_and_extract_features(val_reader, n_val, args.period,
                                         args.features)

    (test_X, test_y, test_names,
     test_ts) = read_and_extract_features(test_reader,
                                          test_reader.get_number_of_examples(),
                                          args.period, args.features)

    print('Imputing missing values ...')
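    # Note: sklearn.preprocessing.Imputer is a legacy API; recent scikit-learn
    # releases provide sklearn.impute.SimpleImputer instead (no axis argument).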
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              "w") as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, test_ts, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
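
The ret = {k: float(v) for k, v in ret.items()} casts above are not cosmetic: json.dump cannot serialize numpy scalars, so the metric values must be converted to plain Python floats first. A quick demonstration:

import json
import numpy as np

try:
    json.dumps({'mse': np.float32(1.5)})
except TypeError as e:
    print(e)                                        # numpy float32 is not JSON serializable
print(json.dumps({'mse': float(np.float32(1.5))}))  # fine after the cast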
Example n. 9
    def calc_metrics(self, data_gen, history, dataset, logs):
        ihm_y_true = []
        decomp_y_true = []
        los_y_true = []
        pheno_y_true = []

        ihm_pred = []
        decomp_pred = []
        los_pred = []
        pheno_pred = []

        for i in range(data_gen.steps):
            if self.verbose == 1:
                print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
            (X, y, los_y_reg) = data_gen.next(return_y_true=True)
            outputs = self.model.predict(X, batch_size=self.batch_size)

            ihm_M = X[1]
            decomp_M = X[2]
            los_M = X[3]

            if not data_gen.target_repl:  # no target replication
                (ihm_p, decomp_p, los_p, pheno_p) = outputs
                (ihm_t, decomp_t, los_t, pheno_t) = y
            else:  # target replication
                (ihm_p, _, decomp_p, los_p, pheno_p, _) = outputs
                (ihm_t, _, decomp_t, los_t, pheno_t, _) = y

            los_t = los_y_reg  # real value not the label

            # ihm
            for (m, t, p) in zip(ihm_M.flatten(), ihm_t.flatten(),
                                 ihm_p.flatten()):
                if np.equal(m, 1):
                    ihm_y_true.append(t)
                    ihm_pred.append(p)

            # decomp
            for (m, t, p) in zip(decomp_M.flatten(), decomp_t.flatten(),
                                 decomp_p.flatten()):
                if np.equal(m, 1):
                    decomp_y_true.append(t)
                    decomp_pred.append(p)

            # los
            if los_p.shape[-1] == 1:  # regression
                for (m, t, p) in zip(los_M.flatten(), los_t.flatten(),
                                     los_p.flatten()):
                    if np.equal(m, 1):
                        los_y_true.append(t)
                        los_pred.append(p)
            else:  # classification
                for (m, t, p) in zip(los_M.flatten(), los_t.flatten(),
                                     los_p.reshape((-1, 10))):
                    if np.equal(m, 1):
                        los_y_true.append(t)
                        los_pred.append(p)

            # pheno
            for (t, p) in zip(pheno_t.reshape((-1, 25)),
                              pheno_p.reshape((-1, 25))):
                pheno_y_true.append(t)
                pheno_pred.append(p)
        print('\n')

        # ihm
        print("\n ================= 48h mortality ================")
        ihm_pred = np.array(ihm_pred)
        ihm_pred = np.stack([1 - ihm_pred, ihm_pred], axis=1)
        ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)
        for k, v in ret.items():
            logs[dataset + '_ihm_' + k] = v

        # decomp
        print("\n ================ decompensation ================")
        decomp_pred = np.array(decomp_pred)
        decomp_pred = np.stack([1 - decomp_pred, decomp_pred], axis=1)
        ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)
        for k, v in ret.items():
            logs[dataset + '_decomp_' + k] = v

        # los
        print("\n ================ length of stay ================")
        if self.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if self.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if self.partition == 'none':
            ret = metrics.print_metrics_regression(los_y_true, los_pred)
        for k, v in ret.items():
            logs[dataset + '_los_' + k] = v

        # pheno
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)
        for k, v in ret.items():
            logs[dataset + '_pheno_' + k] = v

        history.append(logs)
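
The mask-filtering loops above (keep a timestep only where the mask equals 1) can also be written with vectorized boolean indexing; a minimal numpy sketch of the equivalent selection, not the benchmark's own code:

import numpy as np

mask = np.array([[1, 0, 1],
                 [0, 1, 1]])                 # 1 marks a valid timestep
targets = np.arange(6, dtype=float).reshape(2, 3)
preds = targets + 0.5

keep = mask.flatten() == 1
y_true = targets.flatten()[keep]             # same elements the zip/np.equal loops keep
y_pred = preds.flatten()[keep]
print(y_true, y_pred)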
Example n. 10
     else:
         if pred.shape[-1] == 1:
             y_true += list(y.flatten())
             predictions += list(pred.flatten())
         else:
             y_true += list(y)
             predictions += list(pred)
 print('\n')
 if args.partition == 'log':
     predictions = [metrics.get_estimate_log(x, 10) for x in predictions]
     ret = metrics.print_metrics_log_bins(y_true, predictions)
 if args.partition == 'custom':
     predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
     ret = metrics.print_metrics_custom_bins(y_true, predictions)
 if args.partition == 'none':
     ret = metrics.print_metrics_regression(y_true, predictions)
 cur_val = ret['mse']
 
 scheduler.step(cur_val)
 current_lr = optimizer.param_groups[0]['lr']
 if current_lr < 1e-5:
     with open(os.path.join(save_path, 'log.txt'), 'a') as fout:
         print('Early stop at step {}'.format(step), file=fout)
     exit()
 
 with open(os.path.join(save_path, 'log.txt'), 'a') as fout:
     print(ret, file=fout)
 is_best = cur_val < best_val
 if is_best:
     best_val = cur_val
     ### save model
Example n. 11
    if args.decomp_C > 0:
        print "\n ================ decompensation ================"
        decomp_pred = np.array(decomp_pred)
        decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)

    # los
    if args.los_C > 0:
        print "\n ================ length of stay ================"
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    # pheno
    if args.pheno_C > 0:
        print "\n =================== phenotype =================="
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    print "Saving the predictions in test_predictions/task directories ..."

    # ihm
    ihm_path = os.path.join("test_predictions/ihm", os.path.basename(args.load_state)) + ".csv"
    ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path)

    # decomp
    decomp_path = os.path.join("test_predictions/decomp", os.path.basename(args.load_state)) + ".csv"
Example n. 12
linreg = LinearRegression()
linreg.fit(train_X, train_y)

if not os.path.exists("activations"):
    os.mkdir("activations")

if not os.path.exists("results"):
    os.mkdir("results")

with open(os.path.join("results", file_name + ".txt"), "w") as resfile:
    
    resfile.write("mad, mse, mape, kappa\n")
    
    print "Scores on train set"
    ret = metrics.print_metrics_regression(train_y, linreg.predict(train_X))
    resfile.write("%.6f,%.6f,%.6f,%.6f\n" % (
        ret['mad'],
        ret['mse'],
        ret['mape'],
        ret['kappa']))
    
    print "Scores on validation set"
    ret = metrics.print_metrics_regression(val_y, linreg.predict(val_X))
    resfile.write("%.6f,%.6f,%.6f,%.6f\n" % (
        ret['mad'],
        ret['mse'],
        ret['mape'],
        ret['kappa']))

############################### TESTING #############################
Example n. 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                      listfile='../../../data/length-of-stay/train_listfile.csv')

    val_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                    listfile='../../../data/length-of-stay/val_listfile.csv')

    test_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/test/',
                                     listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
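    # Note: sklearn.preprocessing.Imputer is a legacy API; recent scikit-learn
    # releases provide sklearn.impute.SimpleImputer instead (no axis argument).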
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join("results", 'train_{}.json'.format(file_name)), "w") as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, test_ts, prediction, test_y, os.path.join('predictions', file_name + '.csv'))
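
save_results is the benchmark's own helper and its implementation is not shown here; judging from the first example, which reads the prediction file back with stay, period_length, prediction and y_true columns, a plausible sketch of the CSV it writes looks like this (an assumption, not the actual code):

import csv
import os

def save_results_sketch(names, ts, predictions, y_true, path):
    # One row per (stay, period) pair, in the column layout the bootstrap
    # script in the first example expects.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['stay', 'period_length', 'prediction', 'y_true'])
        for name, t, pred, y in zip(names, ts, predictions, y_true):
            writer.writerow([name, t, pred, y])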
Example n. 14
def process_one_chunk(mode, chunk_index):
    assert (mode == "train" or mode == "test")

    if (mode == "train"):
        reader = train_reader
    if (mode == "test"):
        reader = val_reader

    (data, ts, ys, header) = utils.read_chunk(reader, chunk_size)
    data = utils.preprocess_chunk(data, ts, discretizer, normalizer)

    if (mode == "train"):
        network.set_datasets((data, ys), None)
    if (mode == "test"):
        network.set_datasets(None, (data, ys))

    network.shuffle_train_set()

    y_true = []
    predictions = []
    avg_loss = 0.0
    sum_loss = 0.0
    prev_time = time.time()
    n_batches = network.get_batches_per_epoch(mode)

    for i in range(0, n_batches):
        step_data = network.step(mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        current_loss_mse = step_data["loss_mse"]
        current_loss_reg = step_data["loss_reg"]
        log = step_data["log"]

        avg_loss += current_loss
        sum_loss += current_loss

        for x in answers:
            y_true.append(x)

        for x in prediction:
            predictions.append(x)

        if ((i + 1) % args.log_every == 0):
            cur_time = time.time()
            print ("  %sing: %d.%d / %d \t loss: %.3f = %.3f + %.3f \t avg_loss: %.3f \t"\
                   "%s \t time: %.2fs" % (mode, chunk_index, i * args.batch_size,
                        n_batches * args.batch_size,
                        current_loss, current_loss_mse, current_loss_reg,
                        avg_loss / args.log_every, log, cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= n_batches
    print "\n  %s loss = %.5f" % (mode, sum_loss)

    if args.network in ['lstm', 'lstm_log']:
        metrics.print_metrics_regression(y_true, predictions)
    if args.network == 'lstm_cf_log':
        metrics.print_metrics_log_bins(y_true, predictions)
    if args.network == 'lstm_cf_custom':
        metrics.print_metrics_custom_bins(y_true, predictions)

    return sum_loss
Example n. 15
            predictions.append(x)

        if ((i + 1) % args.log_every == 0):
            cur_time = time.time()
            print ("  testing: %d / %d \t loss: %.3f \t avg_loss: %.3f \t"\
                   " time: %.2fs" % ((i+1) * args.batch_size,
                        n_batches * args.batch_size, current_loss,
                        avg_loss / args.log_every, cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= n_batches
    print "\n  test loss = %.5f" % sum_loss

    if args.network in ['lstm', 'lstm_log']:
        metrics.print_metrics_regression(y_true, predictions)
    if args.network == 'lstm_cf_log':
        metrics.print_metrics_log_bins(y_true, predictions)
    if args.network == 'lstm_cf_custom':
        metrics.print_metrics_custom_bins(y_true, predictions)

    with open("activations.txt", "w") as fout:
        fout.write("prediction, y_true")
        for (x, y) in zip(predictions, y_true):
            fout.write("%.6f, %.6f\n" % (x, y))

else:
    raise Exception("unknown mode")
Example n. 16
        for i in range(test_data_gen.steps):
            print "\rpredicting {} / {}".format(i, test_data_gen.steps),

            ret = test_data_gen.next(return_y_true=True)
            (x, y_processed, y) = ret["data"]
            cur_names = ret["names"]
            cur_ts = ret["ts"]

            x = np.array(x)
            pred = model.predict_on_batch(x)
            predictions += list(pred)
            labels += list(y)
            names += list(cur_names)
            ts += list(cur_ts)

    if args.partition == 'log':
        predictions = [metrics.get_estimate_log(x, 10) for x in predictions]
        metrics.print_metrics_log_bins(labels, predictions)
    if args.partition == 'custom':
        predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
        metrics.print_metrics_custom_bins(labels, predictions)
    if args.partition == 'none':
        metrics.print_metrics_regression(labels, predictions)
        predictions = [x[0] for x in predictions]

    path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, ts, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")