def load_data(data):
    """Load a data file and prepare it as named numeric feature columns.

    The file is adapted with ``LearningDataAdapter`` in learning mode, run
    through ``PreProcess`` (fit, then transform) and resampled with
    ``binary_upsampling``.

    Args:
        data: Passed unchanged to ``LearningDataAdapter.adapt_file``
            (presumably a path to the input file -- confirm with the adapter).

    Returns:
        A tuple ``(X_dict, Y, feature_columns)``:
        ``X_dict`` maps each column index, as a string (``'0'``, ``'1'``,
        ...), to that column of the resampled feature matrix; ``Y`` is the
        resampled label array; ``feature_columns`` holds one
        ``tf.feature_column.numeric_column`` per key of ``X_dict``, listed in
        column-index order.

    Raises:
        AssertionError: If the resampled features and labels do not have the
            same number of rows.
    """
    # Adapter
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file(data)
    X_num, X_cat = adapter.X_num, adapter.X_cat
    y = adapter.y

    # Preprocessor
    # NOTE(review): the dict presumably names the pickle files used for the
    # fitted imputer/scaler/encoder -- confirm against PreProcess.fit.
    processor = PreProcess()
    processor.fit(
        X_num, X_cat,
        {
            'imputer': '../preprocess/imputer.pkl',
            'scaler': '../preprocess/scaler.pkl',
            'encoder': '../preprocess/encoder.pkl',
        })
    X = processor.transform(X_num, X_cat)

    # Resample to balance labels
    X, Y = binary_upsampling(X, y)
    assert X.shape[0] == Y.shape[0], 'features and labels differ in row count'

    # Name every column by its index. Keeping the (name, column) pairs in a
    # list lets the feature columns follow the matrix column order instead of
    # whatever order the dict happens to iterate in.
    named_columns = [(str(i), feature) for i, feature in enumerate(X.T)]
    X_dict = dict(named_columns)
    feature_columns = [
        tf.feature_column.numeric_column(name) for name, _ in named_columns
    ]
    return X_dict, Y, feature_columns
def get_data(data, fit=True):
    """Load features and labels, upsampling them only when ``fit`` is true.

    Args:
        data: Passed unchanged to ``load_data`` as its first argument.
        fit: Forwarded to ``load_data`` as ``fit=``. When true, the loaded
            samples are additionally passed through ``binary_upsampling``;
            when false, the labels returned by ``load_data`` are used as is.

    Returns:
        A tuple ``(X, Y)`` where ``Y`` is reshaped to a column of shape
        ``(len(Y), 1)``.

    Raises:
        AssertionError: If ``X`` and ``Y`` do not have the same number of
            rows.
    """
    # Locations handed to load_data for the preprocessing artefacts.
    process_path = {
        'imputer': '../preprocess/imputer.pkl',
        'scaler': '../preprocess/scaler.pkl',
        'encoder': '../preprocess/encoder.pkl',
    }
    X, y, w = load_data(data, process_path, fit=fit)

    if fit:
        # Resample to balance labels; the resampled weights are not needed.
        X, Y, _ = binary_upsampling(X, y, w)
    else:
        Y = y

    # Return labels as a single-column 2D array
    assert X.shape[0] == Y.shape[0], 'features and labels differ in row count'
    return X, Y.reshape((len(Y), 1))
def load_data(data, for_learning=True):
    """Load a data file and return preprocessed, resampled features/labels.

    The file is adapted with ``LearningDataAdapter``, run through
    ``PreProcess`` (fit, then transform) and resampled with
    ``binary_upsampling``.

    Args:
        data: Passed unchanged to ``LearningDataAdapter.adapt_file``
            (presumably a path to the input file -- confirm with the adapter).
        for_learning: Passed as the only argument to the
            ``LearningDataAdapter`` constructor.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the resampled feature matrix and
        ``Y`` the resampled labels reshaped to a column of shape
        ``(len(Y), 1)``.

    Raises:
        AssertionError: If the resampled features and labels do not have the
            same number of rows.
    """
    # Adapter
    adapter = LearningDataAdapter(for_learning)
    adapter.adapt_file(data)
    X_num, X_cat = adapter.X_num, adapter.X_cat
    y = adapter.y

    # Preprocessor
    # NOTE(review): the preprocessor is fitted and the labels are upsampled
    # regardless of ``for_learning`` -- confirm that is intended when
    # for_learning is False.
    processor = PreProcess()
    processor.fit(
        X_num, X_cat,
        {
            'imputer': '../preprocess/imputer.pkl',
            'scaler': '../preprocess/scaler.pkl',
            'encoder': '../preprocess/encoder.pkl',
        })
    X = processor.transform(X_num, X_cat)

    # Resample to balance labels
    X, Y = binary_upsampling(X, y)

    # Return labels as a single-column 2D array
    assert X.shape[0] == Y.shape[0], 'features and labels differ in row count'
    return X, Y.reshape((len(Y), 1))
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['CUDA_VISIBLE_DEVICES'] = '1' print 'Loading data and preprocessing ......' start_time = time.time() process_path = { 'imputer': '../preprocess/imputer.pkl', 'scaler': '../preprocess/scaler.pkl', 'encoder': '../preprocess/encoder.pkl' } X, y, w = load_data('../data/train.csv', process_path, fit=False) X_validate, y_validate, w_validate = load_data('../data/validate.csv', process_path, fit=False) # resample X_train, y_train, w_train = binary_upsampling(X, y, w) Y_train = np.array([y_train, -(y_train - 1)]).T Y_validate = np.array([y_validate, -(y_validate - 1)]).T end = time.time() print 'Input Shapes : ' print 'X_train: {0} Y_train: {1} w_train: {2}'.format( X_train.shape, Y_train.shape, w_train.shape) print 'X_validate: {0} Y_validate: {1} w_validate: {2}'.format( X_validate.shape, Y_validate.shape, w_validate.shape) print 'Done. Took {} seconds.'.format(end - start_time) print print 'Training ......' start = time.time() # train config = {