Esempio n. 1
0
def load_create_data(data_type,
                     data_out,
                     is_logging_enabled=True,
                     fn_csv=None,
                     label_nm=None):
    """Load a stored dataset, or generate a synthetic one, as train/test arrays.

    Two paths exist:

    * Dataset path -- taken when ``fn_csv`` is given or ``data_type`` is one
      of ``data_loader_mlab.get_available_datasets()`` (or the literal
      ``'show'``).  The data is fetched through ``data_loader_mlab``, shuffled
      with an unseeded ``np.random.permutation`` and split 80/20 into train
      and test.  Feature columns reported by ``utilmlab.col_with_nan`` are
      removed from the feature list and the targets are passed through
      ``one_hot_encoder``.
    * Synthetic path -- otherwise both splits come from ``generate_data``,
      driven by the module-level ``train_N``/``train_seed`` and
      ``test_N``/``test_seed`` settings.

    Args:
        data_type: Dataset name (dataset path) or the ``data_type`` forwarded
            to ``generate_data`` (synthetic path).
        data_out: Forwarded as ``out`` to ``generate_data``; unused on the
            dataset path.
        is_logging_enabled: Gates the ``logger.info`` messages and the
            dropped-feature warning emitted by this function.  The call to
            ``data_loader_mlab.dataset_log_properties`` is not gated.
        fn_csv: Optional CSV file name; when not ``None`` the dataset is
            loaded with ``data_loader_mlab.load_dataset_from_csv``.
        label_nm: Label name forwarded to ``load_dataset_from_csv``.

    Returns:
        A 10-tuple ``(x_train, y_train, g_train, x_test, y_test, g_test,
        df_train, df_test, dset, features)``.  On the dataset path
        ``g_train``/``g_test`` are ``None``; on the synthetic path
        ``df_train``, ``df_test``, ``dset`` and ``features`` are ``None``.

    Raises:
        AssertionError: If the data loader reports a non-zero status.
    """
    df_train, df_test, dset = None, None, None
    features = None
    if data_type in data_loader_mlab.get_available_datasets() + ['show'] \
       or fn_csv is not None:
        if fn_csv is not None:
            rval, dset = data_loader_mlab.load_dataset_from_csv(
                logger, fn_csv, label_nm)
        else:
            rval, dset = data_loader_mlab.get_dataset(data_type)
        # Explicit check instead of a bare ``assert`` so the validation is not
        # stripped under ``python -O``; AssertionError is kept so existing
        # callers observe the same exception type.
        if rval != 0:
            raise AssertionError(
                'loading dataset {} failed (rval={})'.format(
                    fn_csv if fn_csv is not None else data_type, rval))
        data_loader_mlab.dataset_log_properties(logger, dset)
        if is_logging_enabled:
            # The shuffle below is unseeded, so the split is not reproducible.
            logger.info('warning no seed')
        df = dset['df']
        features = dset['features']
        labels = dset['targets']

        # Random 80/20 train/test split over the row positions.
        nsample = len(df)
        train_ratio = 0.8
        idx = np.random.permutation(nsample)
        ntrain = int(nsample * train_ratio)
        df_train = df.iloc[idx[:ntrain]]
        df_test = df.iloc[idx[ntrain:]]

        # Columns flagged by the helper are excluded from the feature list
        # (the data frames themselves are returned unmodified).
        col_drop = utilmlab.col_with_nan(df)
        if is_logging_enabled and len(col_drop):
            print('warning: dropping features {}'
                  ', contains nan'.format(col_drop))
            # Pause briefly so the warning is noticeable on the console.
            time.sleep(2)

        features = [el for el in features if el not in col_drop]

        x_train = df_train[features].values
        y_train = df_train[labels].values
        x_test = df_test[features].values
        y_test = df_test[labels].values

        # No ground-truth array is available for stored datasets.
        g_train, g_test = None, None

        y_train = one_hot_encoder(np.ravel(y_train))
        y_test = one_hot_encoder(np.ravel(y_test))
        if is_logging_enabled:
            # Logs the distinct values present in the encoded targets.
            logger.info('y: train:{} test:{}'.format(set(np.ravel(y_train)),
                                                     set(np.ravel(y_test))))
    else:
        # Synthetic data: sizes and seeds come from module-level settings.
        x_train, y_train, g_train = generate_data(n=train_N,
                                                  data_type=data_type,
                                                  seed=train_seed,
                                                  out=data_out)
        x_test, y_test, g_test = generate_data(n=test_N,
                                               data_type=data_type,
                                               seed=test_seed,
                                               out=data_out)
    if is_logging_enabled:
        logger.info('{} {} {} {}'.format(x_train.shape, y_train.shape,
                                         x_test.shape, y_test.shape))
    return x_train, y_train, g_train, x_test, y_test, \
        g_test, df_train, df_test, dset, features
Esempio n. 2
0
    p_miss = args.pmiss
    odir = os.path.dirname(args.o)
    nsample = args.n
    logger = utilmlab.init_logger(odir if len(odir) else ".")
    islabel = args.istarget
    fn_missing_csv = args.o
    fn_csv = args.oref
    fn_json = args.properties
    uniform_miss = args.uniform_miss
    miss_corr = args.miss_corr
    
    is_normalize_0_1 = args.normalize01

    dataset = args.dataset

    rval, dset = data_loader_mlab.get_dataset(dataset, nsample)
    assert rval == 0
    data_loader_mlab.dataset_log_properties(logger, dset)

    df = dset['df']
    features = dset['features']
    labels = dset['targets']
    features_drop = []
    for el in features:
        # drop columns with missing data as we cannot then calculate the rmse
        if sum(dset['df'][el].isnull()):
            features_drop.append(el)
    if len(features_drop):
        logger.info('dropping features {}'.format(features_drop))
        time.sleep(2)
Esempio n. 3
0
            df_tmp, prop_df_one_hot = utilmlab.df_cat_to_one_hot(
                df[features],
                is_verbose=is_verbose,
                is_cat_one_hot=is_cat_one_hot)
            Data = df_tmp.values
        else:
            Data = df[features].values
        Missing = np.where(np.isnan(Data), 0.0, 1.0)
        Data = np.where(Missing, Data, 0)
        if fn_ref_csv is not None:
            df_ref = pd.read_csv(fn_ref_csv)
        logger.info('features: #{} {} label:{}'.format(len(features), features,
                                                       label))
    else:
        logger.info('loading {} using dataloader'.format(dataset))
        rval, dset = data_loader_mlab.get_dataset(dataset)
        assert rval == 0
        data_loader_mlab.dataset_log_properties(logger, dset)
        features = dset['features']
        Data = dset['df'][dset['features']].values.astype(np.float)

    # Parameters
    No = len(Data)
    Dim = len(Data[0, :])

    # Hidden state dimensions
    H_Dim1 = Dim
    H_Dim2 = Dim

    if True:
        if fn_icsv is not None: