コード例 #1
0
ファイル: create_missing.py プロジェクト: bvanbreugel/ReGAIN
    nsample = args.n
    logger = utilmlab.init_logger(odir if len(odir) else ".")
    islabel = args.istarget
    fn_missing_csv = args.o
    fn_csv = args.oref
    fn_json = args.properties
    uniform_miss = args.uniform_miss
    miss_corr = args.miss_corr
    
    is_normalize_0_1 = args.normalize01

    dataset = args.dataset

    rval, dset = data_loader_mlab.get_dataset(dataset, nsample)
    assert rval == 0
    data_loader_mlab.dataset_log_properties(logger, dset)

    df = dset['df']
    features = dset['features']
    labels = dset['targets']
    features_drop = []
    for el in features:
        # drop columns with missing data as we cannot then calculate the rmse
        if sum(dset['df'][el].isnull()):
            features_drop.append(el)
    if len(features_drop):
        logger.info('dropping features {}'.format(features_drop))
        time.sleep(2)

    features = [el for el in features if el not in features_drop]
コード例 #2
0
def load_create_data(data_type,
                     data_out,
                     is_logging_enabled=True,
                     fn_csv=None,
                     label_nm=None,
                     seed=None,
                     train_ratio=0.8):
    """Build train/test arrays from a stored dataset or from generate_data.

    Two paths exist:

    * Dataset path -- taken when ``fn_csv`` is given, or when ``data_type``
      is one of ``data_loader_mlab.get_available_datasets()`` or ``'show'``.
      The dataset is loaded, its rows are randomly permuted and split into
      a train and a test part, feature columns reported by
      ``utilmlab.col_with_nan`` are dropped, and the targets are passed
      through ``one_hot_encoder``.
    * Generated path -- otherwise, train and test sets are each produced by
      ``generate_data`` using the module-level ``train_N``/``train_seed``
      and ``test_N``/``test_seed``.

    The function relies on the module-level ``logger``.

    Args:
        data_type: Dataset name (dataset path) or the ``data_type`` handed
            to ``generate_data`` (generated path).
        data_out: Forwarded to ``generate_data`` as ``out``; unused on the
            dataset path.
        is_logging_enabled: When false, suppresses the informational log
            lines and the dropped-column warning emitted here.
        fn_csv: Optional CSV file; when not None it takes precedence over
            ``data_type`` and is loaded with
            ``data_loader_mlab.load_dataset_from_csv``.
        label_nm: Forwarded to ``load_dataset_from_csv`` together with
            ``fn_csv``.
        seed: Optional seed for the train/test permutation on the dataset
            path. ``None`` (default) keeps the historical behaviour of
            drawing from NumPy's global random state.
        train_ratio: Fraction of rows assigned to the training split on
            the dataset path. Defaults to the historical ``0.8``.

    Returns:
        Tuple ``(x_train, y_train, g_train, x_test, y_test, g_test,
        df_train, df_test, dset, features)``. On the dataset path
        ``g_train`` and ``g_test`` are None; on the generated path
        ``df_train``, ``df_test``, ``dset`` and ``features`` are None.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
        AssertionError: If the dataset loader reports a non-zero status.
    """
    # Fail fast on a nonsensical split before any data is loaded.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            'train_ratio must be within [0, 1], got {}'.format(train_ratio))

    df_train, df_test, dset = None, None, None
    features = None
    if data_type in data_loader_mlab.get_available_datasets() + ['show'] \
       or fn_csv is not None:
        if fn_csv is not None:
            rval, dset = data_loader_mlab.load_dataset_from_csv(
                logger, fn_csv, label_nm)
        else:
            rval, dset = data_loader_mlab.get_dataset(data_type)
        # The loaders return a status code alongside the dataset; anything
        # other than 0 is treated as fatal.
        assert rval == 0
        data_loader_mlab.dataset_log_properties(logger, dset)

        df = dset['df']
        features = dset['features']
        labels = dset['targets']
        nsample = len(df)

        if seed is None:
            # Historical behaviour: unseeded draw from the global RNG.
            if is_logging_enabled:
                logger.info('warning no seed')
            idx = np.random.permutation(nsample)
        else:
            # A private generator gives a reproducible split without
            # disturbing the global random state.
            idx = np.random.RandomState(seed).permutation(nsample)

        ntrain = int(nsample * train_ratio)
        df_train = df.iloc[idx[:ntrain]]
        df_test = df.iloc[idx[ntrain:]]

        # NaN detection runs on the full frame, so a column is dropped from
        # both splits even if only one of them contains the missing value.
        col_drop = utilmlab.col_with_nan(df)
        if is_logging_enabled and len(col_drop):
            print('warning: dropping features {}'
                  ', contains nan'.format(col_drop))
            # Presumably a pause so the warning is noticed on the console.
            time.sleep(2)

        features = [el for el in features if el not in col_drop]

        x_train = df_train[features].values
        y_train = df_train[labels].values
        x_test = df_test[features].values
        y_test = df_test[labels].values

        # No g_* values exist for stored datasets.
        g_train, g_test = None, None

        y_train = one_hot_encoder(np.ravel(y_train))
        y_test = one_hot_encoder(np.ravel(y_test))
        if is_logging_enabled:
            logger.info('y: train:{} test:{}'.format(set(np.ravel(y_train)),
                                                     set(np.ravel(y_test))))
    else:
        x_train, y_train, g_train = generate_data(n=train_N,
                                                  data_type=data_type,
                                                  seed=train_seed,
                                                  out=data_out)
        x_test, y_test, g_test = generate_data(n=test_N,
                                               data_type=data_type,
                                               seed=test_seed,
                                               out=data_out)
    if is_logging_enabled:
        logger.info('{} {} {} {}'.format(x_train.shape, y_train.shape,
                                         x_test.shape, y_test.shape))
    return x_train, y_train, g_train, x_test, y_test, \
        g_test, df_train, df_test, dset, features