# Shared third-party imports for the training/evaluation scripts below.
# eval_funcs, nn_arch, feature_loader, OrdinalLayer, weighted_categorical_xentropy,
# create_data_iterator, load_features, read_features, feature_transform,
# FeatureTransform, and the shared random state rng are project-local and
# assumed to be in scope.
import pprint

import numpy as np
import pandas as pd
import sklearn.metrics
from numpy import interp
from sklearn.ensemble import RandomForestClassifier
from termcolor import colored

import tensorflow as tf
import keras
from keras import layers
from keras import backend as K
from keras.utils import np_utils
import tensorflowjs as tfjs  # only needed when cfg['save_tjs'] is set


def evaluate_slant(dataset_path, preds_path):
    test_set_df = pd.read_csv(dataset_path, sep='\t')
    result_df = pd.read_csv(preds_path, sep=' ')
    assert test_set_df.shape[0] == result_df.shape[0]

    # flip labels and scores so that the positive class (column 1) is 'X0'
    sl_preds = np.array(result_df[['X0', 'X1']]).astype(float)
    sl_preds = 1 - sl_preds
    y_true = np.array(test_set_df['sl'] == 'X1').astype(int)
    y_true = 1 - y_true

    hardpreds = np.argmax(sl_preds, axis=1)  # hard class predictions (unused below)

    eval_results, cm = eval_funcs.eval_classifier(y_true, sl_preds)

    # interpolate the ROC curve onto a common FPR grid so curves can be
    # compared and averaged across runs
    BASE_FPR = np.linspace(0, 1, 101)
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, sl_preds[:, 1])
    tpr = interp(BASE_FPR, fpr, tpr)
    tpr[0] = 0.0

    eval_results['fpr'] = BASE_FPR
    eval_results['roc_curve'] = tpr

    return eval_results, cm
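
# Because evaluate_slant() interpolates every fold's TPR onto the common
# BASE_FPR grid, per-fold ROC curves can be averaged elementwise. A minimal
# sketch of that use (the list-of-results input is a hypothetical convention,
# not part of the scripts above):
def average_roc_curves(eval_results_list):
    # stack the interpolated TPR curves (one row per fold) and average
    tprs = np.stack([r['roc_curve'] for r in eval_results_list], axis=0)
    mean_tpr = tprs.mean(axis=0)
    std_tpr = tprs.std(axis=0)
    return eval_results_list[0]['fpr'], mean_tpr, std_tpr
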
def main(cfg, rep, fold, output_path):
    X = np.load(cfg['features_path'])
    y = np.load(cfg['targets_path'])['y']

    # load train/test split
    data = np.load(cfg['splits_path'])
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing sets (validation rows are folded into training)
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]
    train_ix = train_ix + valid_ix

    Xtrain = X[train_ix, :]
    ytrain = y[train_ix]

    clf = RandomForestClassifier(n_estimators=cfg['n_estimators'],
                                 n_jobs=cfg['n_jobs'],
                                 verbose=cfg['verbose'],
                                 class_weight='balanced')
    clf.fit(Xtrain, ytrain)

    Xtest = X[test_ix, :]
    ytest = y[test_ix]

    # expand hard 0/1 predictions into a two-column score matrix; column k
    # holds the score for class k, matching the convention eval_classifier
    # and roc_curve rely on elsewhere in this codebase
    yhat = clf.predict(Xtest)
    preds = np.zeros((yhat.shape[0], 2))
    preds[:, 1] = yhat
    preds[:, 0] = 1 - yhat

    print(ytest.shape)
    print(preds.shape)

    r, cm = eval_funcs.eval_classifier(ytest, preds)
    pprint.pprint(r)
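
# The random-forest script above scores hard 0/1 predictions, which collapses
# the ROC curve to a single operating point. If probability scores are wanted
# instead, sklearn's predict_proba returns one column per class and can be fed
# to eval_classifier directly. A sketch under that assumption:
def evaluate_rf_soft(clf, Xtest, ytest):
    # predict_proba columns follow clf.classes_: column k is P(class k)
    proba = clf.predict_proba(Xtest)
    return eval_funcs.eval_classifier(ytest, proba)
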
def main(cfg, rep, fold, output_path, print_results=True, return_model=False):
    K.clear_session()

    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    print("Dataset size: %d, Total: %d" % (df.shape[0], np.sum(train_ix + valid_ix + test_ix)))

    if not cfg.get("early_stopping", True):
        train_ix = train_ix + valid_ix

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = np.ones_like(train_ix)

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = np.ones_like(test_ix)

    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    print("Train size: %d, valid: %d, test: %d" % (train_df.shape[0], valid_df.shape[0], test_df.shape[0]))

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    if cfg.get("bootstrap_training", False):
        print(colored("******** BOOTSTRAPPING TRAINING ***********", "blue"))
        rix = rng.choice(train_df.shape[0], train_df.shape[0], replace=True)
        train_df = train_df.iloc[rix]
        train_Y = train_Y[rix, :]

    fsets, feature_labels = load_features(cfg)

    #
    # NN definition
    #
    input_node = layers.Input(shape=(len(feature_labels),), name='input_features')

    if cfg['type'] == 'orm':
        linear_layer = layers.Dense(1, activation='linear')
        latent_variable = linear_layer(input_node)
        ordinal_layer = OrdinalLayer(Y.shape[1])
        output_node = ordinal_layer(latent_variable)
    else:
        output_layer = layers.Dense(Y.shape[1], activation='softmax')
        output_node = output_layer(input_node)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=input_node, outputs=output_node)
    model.compile(cfg['optimizer'], loss)

    if cfg.get("trained_model_path", None) is not None:
        if cfg.get("add_repfold_to_trained_model_path", True):
            cfg["trained_model_path"] = "%s_%d_%d" % (cfg["trained_model_path"], rep, fold)

    # train
    if cfg.get("train_model", True):
        train_iterator = create_data_iterator(train_df, train_Y, fsets, cfg)
        valid_iterator = create_data_iterator(valid_df, valid_Y, fsets, cfg)

        if cfg.get("early_stopping", True):
            callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=cfg['patience'],
                                                       restore_best_weights=True)]
        else:
            callbacks = []

        if cfg['epochs'] > 0:
            model.fit_generator(train_iterator(),
                                steps_per_epoch=np.ceil(train_df.shape[0] / cfg['batch_size']),
                                epochs=cfg['epochs'],
                                verbose=cfg['verbose'],
                                validation_data=valid_iterator(),
                                validation_steps=np.ceil(valid_df.shape[0] / cfg['batch_size']),
                                callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model to %s" % cfg['trained_model_path'])
            model.save_weights(cfg["trained_model_path"])

        if cfg.get("save_tjs", False):
            tfjs.converters.save_keras_model(model, cfg["tjs_path"])
    else:
        print("Loading from %s" % cfg['trained_model_path'])
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    if return_model:
        return model, fsets

    test_iterator = create_data_iterator(test_df, test_Y, fsets, cfg, False)
    preds = model.predict(test_iterator(),
                          steps=np.ceil(test_df.shape[0] / cfg['batch_size']))

    y_target = np.argmax(test_Y, axis=1)

    ix = np.sum(np.isnan(preds), axis=1) > 0
    print("Nan: %d" % np.sum(ix))
    print(test_df[ix])

    r, cm = eval_funcs.eval_classifier(y_target, preds)

    if print_results:
        eval_funcs.print_eval_classifier(r)

    if cfg['type'] == 'orm':
        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 biases=linear_layer.get_weights()[1],
                 weights=linear_layer.get_weights()[0],
                 thresholds=ordinal_layer.get_thresholds(),
                 labels=feature_labels,
                 fold=fold)
    else:
        weights_list = output_layer.get_weights()
        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 weights=weights_list[0],
                 biases=weights_list[1],
                 labels=feature_labels,
                 fold=fold)

    return None
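
# fit_generator() and predict(..., steps=...) consume an endless stream of
# (inputs, targets) batches, so create_data_iterator (project-local, not shown
# here) is presumably a factory returning such a generator. A minimal sketch of
# that contract; make_batch_generator is hypothetical and assumes row-indexable
# feature sets keyed by df['id']:
def make_batch_generator(df, Y, fsets, batch_size, shuffle=True):
    def gen():
        n = df.shape[0]
        while True:  # Keras generators must loop forever
            order = np.random.permutation(n) if shuffle else np.arange(n)
            for start in range(0, n, batch_size):
                ix = order[start:start + batch_size]
                rows = df['id'].values[ix]
                yield [f[rows, :] for f in fsets], Y[ix, :]
    return gen
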
def main(cfg, rep, fold, output_path, print_results=True):
    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])

    # read features
    all_features, labels = read_features(cfg)

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if not cfg.get("early_stopping", True):
        train_ix = train_ix + valid_ix

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix

    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    F = all_features[df['id'], :]
    train_F = F[train_ix, :]
    valid_F = F[valid_ix, :]
    test_F = F[test_ix, :]

    if cfg.get("bootstrap_training", False):
        print(colored("******** BOOTSTRAPPING TRAINING ***********", "blue"))
        rix = rng.choice(train_df.shape[0], train_df.shape[0], replace=True)
        train_df = train_df.iloc[rix]
        train_Y = train_Y[rix, :]
        train_F = train_F[rix, :]

    # ordinal model
    input_node = layers.Input(shape=(F.shape[1],))
    if cfg['type'] == 'orm':
        linear_layer = layers.Dense(1, activation='linear')
        latent_variable = linear_layer(input_node)
        ordinal_layer = OrdinalLayer(train_Y.shape[1])
        output_node = ordinal_layer(latent_variable)
    else:
        output_layer = layers.Dense(Y.shape[1], activation='softmax')
        output_node = output_layer(input_node)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=input_node, outputs=output_node)
    model.compile(cfg['optimizer'], loss=loss)

    if cfg.get("train_model", True):
        # setup early stopping
        if cfg.get("early_stopping", True):
            callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=cfg['patience'],
                                                       restore_best_weights=True)]
        else:
            callbacks = []

        # train
        model.fit(x=train_F,
                  y=train_Y,
                  batch_size=int(cfg['batch_size_p'] * train_Y.shape[0]),
                  epochs=cfg['epochs'],
                  verbose=cfg['verbose'],
                  validation_data=(valid_F, valid_Y),
                  callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])
    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    preds = model.predict(test_F)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    if print_results:
        eval_funcs.print_eval_classifier(r)

    if cfg['type'] == 'orm':
        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 biases=linear_layer.get_weights()[1],
                 weights=linear_layer.get_weights()[0],
                 thresholds=ordinal_layer.get_thresholds(),
                 labels=labels,
                 fold=fold)
    else:
        weights_list = output_layer.get_weights()
        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 weights=weights_list[0],
                 biases=weights_list[1],
                 labels=labels,
                 fold=fold)
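
# OrdinalLayer (project-local, definition not shown) maps the scalar latent
# variable to K ordered class probabilities. A common way to do this is a
# cumulative-link ("proportional odds") layer with K-1 learned thresholds:
# P(y <= k) = sigmoid(t_k - z), with class probabilities the differences of
# consecutive cumulative probabilities. A sketch of that idea, which may
# differ from the actual OrdinalLayer implementation:
class OrdinalLayerSketch(tf.keras.layers.Layer):
    def __init__(self, n_classes, **kwargs):
        super().__init__(**kwargs)
        self.n_classes = n_classes

    def build(self, input_shape):
        # K-1 thresholds; left unconstrained here, real implementations often
        # parameterize them to stay sorted
        self.thresholds = self.add_weight(name='thresholds',
                                          shape=(self.n_classes - 1,),
                                          initializer='zeros')

    def call(self, z):
        # z: (batch, 1) latent variable
        cum = tf.sigmoid(self.thresholds[None, :] - z)            # P(y <= k)
        cum = tf.concat([tf.zeros_like(z), cum, tf.ones_like(z)], axis=1)
        return cum[:, 1:] - cum[:, :-1]                           # P(y == k)

    def get_thresholds(self):
        return self.thresholds.numpy()
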
def main(cfg, rep, fold, output_path, print_results=True, return_model=False):
    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # load input features
    single_gene_spec = [s for s in cfg['spec'] if not s['pairwise']]
    pairwise_gene_spec = [s for s in cfg['spec'] if s['pairwise']]
    single_fsets, single_fsets_shapes = feature_loader.load_feature_sets(single_gene_spec, False)
    pairwise_fsets, pairwise_fsets_shapes = feature_loader.load_feature_sets(pairwise_gene_spec, False)

    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix

    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    #
    # NN definition
    #

    # single gene features
    inputs_a = nn_arch.create_input_nodes(single_gene_spec, single_fsets_shapes, base_name="a")
    inputs_b = nn_arch.create_input_nodes(single_gene_spec, single_fsets_shapes, base_name="b")
    single_gene_arch = nn_arch.create_input_architecture(output_size=cfg['embedding_size'],
                                                         output_activation=cfg['embedding_activation'],
                                                         name='single_input',
                                                         spec=single_gene_spec)
    output_a = single_gene_arch(inputs_a, name="input_a")
    output_b = single_gene_arch(inputs_b, name="input_b")

    # pairwise features
    inputs_ab = nn_arch.create_input_nodes(pairwise_gene_spec, pairwise_fsets_shapes, base_name="ab")
    pairwise_gene_arch = nn_arch.create_input_architecture(output_size=cfg['embedding_size'],
                                                           output_activation=cfg['embedding_activation'],
                                                           name='pairwise_input',
                                                           spec=pairwise_gene_spec)
    output_ab = pairwise_gene_arch(inputs_ab, name="input_ab")

    merged = nn_arch.concatenate([output_a + output_b, output_ab], name="preoutput")
    output_node = layers.Dense(Y.shape[1], activation='softmax')(merged)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=inputs_a + inputs_b + inputs_ab, outputs=output_node)
    opt = tf.keras.optimizers.Nadam(learning_rate=cfg.get('learning_rate', 0.001))
    model.compile(opt, loss)
    model.outputs[0]._uses_learning_phase = True

    # train
    if cfg.get("train_model", True):
        # create data iterators (necessary because some feature sets are too large to put in ram)
        train_iterator = create_data_iterator(train_df, train_Y, single_fsets, pairwise_fsets, cfg, cfg['scramble'])
        valid_iterator = create_data_iterator(valid_df, valid_Y, single_fsets, pairwise_fsets, cfg, False)

        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=cfg['patience'],
                                                   restore_best_weights=True)]

        model.fit_generator(train_iterator(),
                            steps_per_epoch=np.ceil(train_df.shape[0] / cfg['batch_size']),
                            epochs=cfg['epochs'],
                            verbose=cfg['verbose'],
                            validation_data=valid_iterator(),
                            validation_steps=np.ceil(valid_df.shape[0] / cfg['batch_size']),
                            callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])
    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    if return_model:
        return model, [FeatureTransform(single_fsets, pairwise_fsets)]

    test_F = feature_transform(test_df, single_fsets, pairwise_fsets)
    preds = model.predict(test_F)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    if print_results:
        eval_funcs.print_eval_classifier(r)

    np.savez(output_path, preds=preds, y_target=y_target, cfg=cfg, r=r, cm=cm, rep=rep, fold=fold)
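
# All of these scripts switch to weighted_categorical_xentropy when
# cfg['balanced_loss'] is set; its definition is not shown here. A common
# formulation weights each sample's cross-entropy inversely to its class
# frequency within the batch. A sketch of that idea (an assumption, not the
# project's actual definition):
def weighted_categorical_xentropy_sketch(y_true, y_pred):
    eps = tf.keras.backend.epsilon()
    y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
    # inverse class frequency within the batch, normalized so weights average 1
    class_counts = tf.reduce_sum(y_true, axis=0) + 1.0
    n_classes = tf.cast(tf.shape(y_true)[1], tf.float32)
    class_weights = tf.reduce_sum(class_counts) / (n_classes * class_counts)
    # each one-hot row picks out the weight of its class
    sample_weights = tf.reduce_sum(y_true * class_weights[None, :], axis=1)
    xent = -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1)
    return tf.reduce_mean(sample_weights * xent)
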
def main(cfg, rep, fold, output_path, print_results=True):
    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # create output
    Y = keras.utils.to_categorical(df[cfg['target_col']])
    print(Y.shape)

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix

    train_df = df.iloc[train_ix]
    train_genes = set(train_df['a_id']) | set(train_df['b_id'])

    valid_df = df.iloc[valid_ix]
    valid_genes = set(valid_df['a_id']) | set(valid_df['b_id'])
    print("In valid but not train: %d" % len(valid_genes - train_genes))

    test_df = df.iloc[test_ix]

    train_X = [np.array(train_df['a_id']), np.array(train_df['b_id'])]
    valid_X = [np.array(valid_df['a_id']), np.array(valid_df['b_id'])]
    test_X = [np.array(test_df['a_id']), np.array(test_df['b_id'])]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    print("Validation sums:")
    print(np.sum(valid_Y, axis=0))
    test_Y = Y[test_ix, :]

    #
    # NN definition
    #

    # single gene features
    n_genes = np.maximum(np.max(df['a_id']), np.max(df['b_id'])) + 1
    emb = tf.keras.layers.Embedding(n_genes, cfg['embedding_size'])

    input_a = tf.keras.layers.Input(shape=(1,))
    input_b = tf.keras.layers.Input(shape=(1,))
    embd_a = emb(input_a)
    embd_b = emb(input_b)

    # summing the two gene embeddings makes the model symmetric in (a, b)
    merged = embd_a + embd_b
    merged = tf.squeeze(merged, axis=1)
    output_node = layers.Dense(Y.shape[1], activation='softmax')(merged)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=[input_a, input_b], outputs=output_node)
    model.compile(cfg['optimizer'], loss)
    model.summary()
    model.outputs[0]._uses_learning_phase = True

    # train
    if cfg.get("train_model", True):
        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=cfg['patience'],
                                                   restore_best_weights=True)]

        print("Batch size: %d" % int(cfg['batch_size_p'] * train_Y.shape[0]))
        model.fit(train_X,
                  train_Y,
                  epochs=cfg['epochs'],
                  verbose=cfg['verbose'],
                  validation_data=(valid_X, valid_Y),
                  batch_size=int(cfg['batch_size_p'] * train_Y.shape[0]),
                  callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])
    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    preds = model.predict(test_X)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    if print_results:
        eval_funcs.print_eval_classifier(r)

    np.savez(output_path, preds=preds, y_target=y_target, cfg=cfg, r=r, cm=cm, rep=rep, fold=fold)
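
# Because the pair model above sums the two gene embeddings before the softmax,
# its predictions are invariant to the order of (a, b). A quick sanity check
# for a trained model (helper name is hypothetical):
def check_pair_symmetry(model, a_ids, b_ids):
    p_ab = model.predict([a_ids, b_ids])
    p_ba = model.predict([b_ids, a_ids])
    return np.allclose(p_ab, p_ba, atol=1e-6)
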
def main(cfg, rep, fold, output_path, print_results=True):
    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])

    # load input features
    single_gene_spec = [s for s in cfg['spec'] if not s['pairwise']]
    single_fsets, single_fsets_shapes = feature_loader.load_feature_sets(single_gene_spec, scramble=False)

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix

    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    # setup feature sets
    train_fsets = [single_fsets[i][train_df['id'], :] for i in range(len(single_fsets))]

    if cfg['scramble']:
        rix = rng.permutation(train_df.shape[0])
        train_fsets = [f[rix, :] for f in train_fsets]

    valid_fsets = [single_fsets[i][valid_df['id'], :] for i in range(len(single_fsets))]
    test_fsets = [single_fsets[i][test_df['id'], :] for i in range(len(single_fsets))]

    #
    # NN definition
    #

    # single gene features
    inputs_a = nn_arch.create_input_nodes(single_gene_spec, single_fsets_shapes, base_name="a")
    single_gene_arch = nn_arch.create_input_architecture(output_size=cfg['embedding_size'],
                                                         output_activation=cfg['embedding_activation'],
                                                         name='single_input',
                                                         spec=single_gene_spec)
    output_a = single_gene_arch(inputs_a, name="input_a")
    output_node = layers.Dense(Y.shape[1], activation='softmax')(output_a)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=inputs_a, outputs=output_node)
    opt = tf.keras.optimizers.Nadam(learning_rate=cfg.get('learning_rate', 0.001))
    model.compile(opt, loss=loss)

    if cfg.get("train_model", True):
        # setup early stopping
        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=cfg['patience'],
                                                   restore_best_weights=True)]

        # train
        model.fit(x=train_fsets,
                  y=train_Y,
                  batch_size=int(cfg['batch_size_p'] * train_Y.shape[0]),
                  epochs=cfg['epochs'],
                  verbose=cfg['verbose'],
                  validation_data=(valid_fsets, valid_Y),
                  callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])
    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    preds = model.predict(test_fsets)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    if print_results:
        eval_funcs.print_eval_classifier(r)

    np.savez(output_path, preds=preds, y_target=y_target, cfg=cfg, r=r, cm=cm, rep=rep, fold=fold)
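
# A hypothetical cfg for the single-gene script above, listing only keys the
# function actually reads; all paths and values are illustrative placeholders,
# not real project files:
EXAMPLE_CFG = {
    'task_path': 'datasets/task.csv',
    'splits_path': 'splits/task_splits.npz',
    'target_col': 'bin',
    'spec': [],                    # feature-set spec list (project-specific)
    'scramble': False,
    'embedding_size': 32,
    'embedding_activation': 'tanh',
    'balanced_loss': True,
    'learning_rate': 0.001,
    'batch_size_p': 0.1,           # batch size as a fraction of the training set
    'epochs': 100,
    'verbose': True,
    'patience': 10,
    'train_model': True,
}
# usage: main(EXAMPLE_CFG, rep=0, fold=0, output_path='results/run_r0_f0')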