def test_encode_data_method():
    """Passing 'random' as the second argument of encode_data raises AttributeError.

    The second argument is presumably the encoding method name (going by the
    test name); confirm against encode_data's signature.
    """
    n_rows = 10
    # Only the first few rows of the tab-separated training file are needed.
    sample = pd.read_csv('data/train_data.csv', sep='\t', nrows=n_rows)
    bogus_method = 'random'

    with pytest.raises(AttributeError):
        encode_data(sample, bogus_method)
def test_encode_data():
    """encode_data returns a (rows, 280) feature array and one label per row."""
    expected_rows = 10
    # Read just the head of the tab-separated training file to keep the test fast.
    frame = pd.read_csv('data/train_data.csv', sep='\t', nrows=expected_rows)

    features, labels = encode_data(frame)

    assert features.shape == (expected_rows, 280)
    assert len(labels) == expected_rows
def evaluate_model():
    """
    Use this function to validate DeepMiRNA model

    The one-hot encoded training set is loaded from disk when a previous run
    cached it (both the HDF5 feature file and the labels text file must be
    present). Otherwise the raw training set is read, encoded and written to
    disk for the next run. The MLP is then built, passed to ``train_eval``,
    and the resulting history is plotted and summarised in the log.

    :return: keras history object with the results
    """
    # get training parameters
    model_name = gv.TRAIN_FINAL_MODEL_NAME
    train_set_fp = os.path.join(gv.ROOT_DIR, gv.TRAIN_SET_LOCATION)
    true_labels_fp = os.path.join(gv.ROOT_DIR, gv.TRUE_LABELS)
    ohe_duplexes_fp = os.path.join(gv.ROOT_DIR, gv.ONE_HOT_ENCODED_DUPLEXES)
    batch_size = gv.BATCH_SIZE
    n_epochs = gv.N_EPOCHS
    keep_prob = gv.KEEP_PROB

    # The cache is usable only when BOTH artefacts exist: the features live in
    # the HDF5 file and the labels in a separate text file. Checking a single
    # file would crash in np.loadtxt when the labels file has gone missing.
    if Path(ohe_duplexes_fp).exists() and Path(true_labels_fp).exists():
        # load data
        _logger.info(' One-hot encoded training set found. Loading data ...')
        ytrain = np.loadtxt(true_labels_fp)
        with h5py.File(ohe_duplexes_fp, 'r') as hf:
            xtrain = hf['encoded_training_set'][:]
    else:
        _logger.info(' Encoding the training set. This might take some time')
        # The raw training set is only needed on this path, so read it here
        # instead of paying for the CSV parse on every cached run.
        training_df = pd.read_csv(train_set_fp, sep='\t', usecols=gv.TRAIN_SET_COLUMNS)
        xtrain, ytrain = encode_data(training_df)
        # save for next computation
        _logger.info(' Saving encoded data to disk.')
        np.savetxt(true_labels_fp, ytrain)
        with h5py.File(ohe_duplexes_fp, 'w') as hf:
            hf.create_dataset('encoded_training_set', data=xtrain)

    _logger.info(' Building and compiling the model')
    # The input width of the network is the number of encoded feature columns.
    model = _create_mlp_model(xtrain.shape[1], keep_prob)

    _logger.info(' Training started')
    history = train_eval(model, model_name, xtrain, ytrain, batch_size, n_epochs)
    _plot_model_history(history)
    _logger.info(' model {} saved.'.format(model_name))
    # NOTE(review): the message says "on the validation set" but the accuracy
    # is read from the 'acc' key, while the loss comes from 'val_loss'.
    # Confirm whether 'val_acc' was intended before changing it.
    _logger.info(' Best model achieved {:.2f} accuracy and {:.2f} validation loss on the validation set.'
                 .format(max(history.history['acc']), min(history.history['val_loss'])))

    return history
def train_model():
    """
    Train DeepMiRNA model over the whole training set and obtain the final model

    The one-hot encoded training set is loaded from disk when a previous run
    cached it (both the HDF5 feature file and the labels text file must be
    present). Otherwise the raw training set is read, encoded and written to
    disk for the next run. The MLP is then built and passed to ``train``.

    :return: keras history object
    """
    # get training parameters
    model_name = gv.TRAIN_FINAL_MODEL_NAME
    train_set_fp = os.path.join(gv.ROOT_DIR, gv.TRAIN_SET_LOCATION)
    true_labels_fp = os.path.join(gv.ROOT_DIR, gv.TRUE_LABELS)
    ohe_duplexes_fp = os.path.join(gv.ROOT_DIR, gv.ONE_HOT_ENCODED_DUPLEXES)
    batch_size = gv.BATCH_SIZE
    n_epochs = gv.N_EPOCHS
    keep_prob = gv.KEEP_PROB

    # The cache is usable only when BOTH artefacts exist: the features live in
    # the HDF5 file and the labels in a separate text file. Checking a single
    # file would crash in np.loadtxt when the labels file has gone missing.
    if Path(ohe_duplexes_fp).exists() and Path(true_labels_fp).exists():
        # load data
        _logger.info(' One-hot encoded training set found. Loading data ...')
        ytrain = np.loadtxt(true_labels_fp)
        with h5py.File(ohe_duplexes_fp, 'r') as hf:
            xtrain = hf['encoded_training_set'][:]
    else:
        _logger.info(' Encoding the training set. This might take some time')
        # The raw training set is only needed on this path, so read it here
        # instead of paying for the CSV parse on every cached run.
        training_df = pd.read_csv(train_set_fp, sep='\t', usecols=gv.TRAIN_SET_COLUMNS)
        xtrain, ytrain = encode_data(training_df)
        # save for next computation
        _logger.info(' Saving encoded data to disk.')
        np.savetxt(true_labels_fp, ytrain)
        with h5py.File(ohe_duplexes_fp, 'w') as hf:
            hf.create_dataset('encoded_training_set', data=xtrain)

    _logger.info(' Building and compiling the model')
    # The input width of the network is the number of encoded feature columns.
    model = _create_mlp_model(xtrain.shape[1], keep_prob)

    _logger.info(' Training started')
    history = train(model, model_name, xtrain, ytrain, batch_size, n_epochs)

    return history
def test_encode_data_header():
    """encode_data exits (SystemExit) when given a frame with columns 'a', 'b', 'c'."""
    # A one-row frame whose column names are just 'a', 'b' and 'c'.
    frame = pd.DataFrame({'a': [0], 'b': [1], 'c': [2]})

    with pytest.raises(SystemExit):
        encode_data(frame)