def load_data(data):
    """Load a data file and convert it into per-feature TensorFlow inputs.

    The file is read through ``LearningDataAdapter`` (in learning mode),
    run through a ``PreProcess`` instance that is fitted on this same data,
    resampled with ``binary_upsampling`` and finally split into one
    dictionary entry per feature.

    Args:
        data: Passed unchanged to ``LearningDataAdapter.adapt_file``.

    Returns:
        A tuple ``(X_dict, Y, feature_columns)``:

        * ``X_dict`` maps the stringified index ``'0'``, ``'1'``, ... of each
          row of ``X.T`` (i.e. each column of the processed matrix ``X``)
          to that row.
        * ``Y`` is the label array returned by ``binary_upsampling``.
        * ``feature_columns`` holds one
          ``tf.feature_column.numeric_column`` per key of ``X_dict``.

    Raises:
        AssertionError: If the resampled features and labels disagree on
            the number of samples.
    """
    # Adapter
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file(data)
    X_num, X_cat = adapter.X_num, adapter.X_cat
    y = adapter.y

    # Preprocessor
    # NOTE(review): the path dict is handed to ``fit``; presumably this is
    # where the fitted imputer/scaler/encoder get pickled -- confirm in
    # PreProcess.
    processor = PreProcess()
    processor.fit(
        X_num, X_cat, {
            'imputer': '../preprocess/imputer.pkl',
            'scaler': '../preprocess/scaler.pkl',
            'encoder': '../preprocess/encoder.pkl'
        })
    X = processor.transform(X_num, X_cat)

    # Resample to balance labels
    X, Y = binary_upsampling(X, y)
    # Sanity check: resampling must keep features and labels aligned.
    assert X.shape[0] == Y.shape[0]

    # Create a dictionary of features, keyed by stringified column index.
    X_dict = {str(i): feature for i, feature in enumerate(X.T)}
    feature_columns = [
        tf.feature_column.numeric_column(key) for key in X_dict
    ]

    return X_dict, Y, feature_columns
# Example #2
# 0
def get_data(data, fit=True):
    """Load features and labels, upsampling them when ``fit`` is true.

    Relies on a ``load_data`` that accepts ``(data, process_path, fit=...)``
    and returns ``(X, y, w)``.

    Args:
        data: Forwarded as the first argument to ``load_data``.
        fit: Forwarded to ``load_data`` as ``fit``. When true, the loaded
            samples are additionally passed through ``binary_upsampling``;
            otherwise the labels are used exactly as loaded.

    Returns:
        A tuple ``(X, Y)`` where ``Y`` has been reshaped to a column of
        shape ``(len(Y), 1)``.

    Raises:
        AssertionError: If features and labels disagree on the number of
            samples.
    """
    # Locations of the pickled preprocessing steps, handed to load_data.
    process_path = {
        'imputer': '../preprocess/imputer.pkl',
        'scaler': '../preprocess/scaler.pkl',
        'encoder': '../preprocess/encoder.pkl',
    }
    X, y, w = load_data(data, process_path, fit=fit)
    if fit:
        # Resample to balance labels; the resampled weights are not needed.
        X, Y, _ = binary_upsampling(X, y, w)
    else:
        Y = y
    # Sanity check: features and labels must stay aligned.
    assert X.shape[0] == Y.shape[0]
    # Return labels as a column vector of shape (len(Y), 1).
    return X, Y.reshape((len(Y), 1))
def load_data(data, for_learning=True):
    """Load a data file and return preprocessed, resampled features/labels.

    The file is read through ``LearningDataAdapter``, run through a
    ``PreProcess`` instance that is fitted on this same data, and resampled
    with ``binary_upsampling``.

    Args:
        data: Passed unchanged to ``LearningDataAdapter.adapt_file``.
        for_learning: Passed as the sole constructor argument to
            ``LearningDataAdapter``.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the resampled feature matrix and
        ``Y`` the resampled labels reshaped to a column of shape
        ``(len(Y), 1)``.

    Raises:
        AssertionError: If the resampled features and labels disagree on
            the number of samples.
    """
    # Adapter
    adapter = LearningDataAdapter(for_learning)
    adapter.adapt_file(data)
    X_num, X_cat = adapter.X_num, adapter.X_cat
    y = adapter.y

    # Preprocessor
    # NOTE(review): the preprocessor is re-fitted and the data upsampled on
    # every call, even when ``for_learning`` is false -- confirm that this
    # is intended for non-training data.
    processor = PreProcess()
    processor.fit(
        X_num, X_cat, {
            'imputer': '../preprocess/imputer.pkl',
            'scaler': '../preprocess/scaler.pkl',
            'encoder': '../preprocess/encoder.pkl'
        })
    X = processor.transform(X_num, X_cat)

    # Resample to balance labels
    X, Y = binary_upsampling(X, y)
    # Sanity check: resampling must keep features and labels aligned.
    assert X.shape[0] == Y.shape[0]
    # Return labels as a column vector of shape (len(Y), 1).
    return X, Y.reshape((len(Y), 1))
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    print 'Loading data and preprocessing ......'
    start_time = time.time()
    process_path = {
        'imputer': '../preprocess/imputer.pkl',
        'scaler': '../preprocess/scaler.pkl',
        'encoder': '../preprocess/encoder.pkl'
    }
    X, y, w = load_data('../data/train.csv', process_path, fit=False)
    X_validate, y_validate, w_validate = load_data('../data/validate.csv',
                                                   process_path,
                                                   fit=False)
    # resample
    X_train, y_train, w_train = binary_upsampling(X, y, w)
    Y_train = np.array([y_train, -(y_train - 1)]).T
    Y_validate = np.array([y_validate, -(y_validate - 1)]).T
    end = time.time()
    print 'Input Shapes : '
    print 'X_train: {0}    Y_train: {1}    w_train: {2}'.format(
        X_train.shape, Y_train.shape, w_train.shape)
    print 'X_validate: {0}    Y_validate: {1}    w_validate: {2}'.format(
        X_validate.shape, Y_validate.shape, w_validate.shape)
    print 'Done. Took {} seconds.'.format(end - start_time)
    print

    print 'Training ......'
    start = time.time()
    # train
    config = {