def evaluate_slant(dataset_path, preds_path):

    test_set_df = pd.read_csv(dataset_path, sep='\t')
    result_df = pd.read_csv(preds_path, sep=' ')

    assert test_set_df.shape[0] == result_df.shape[0]

    sl_preds = np.array(result_df[['X0', 'X1']]).astype(float)
    sl_preds = 1 - sl_preds

    y_true = np.array(test_set_df['sl'] == 'X1').astype(int)
    y_true = 1 - y_true

    eval_results, cm = eval_funcs.eval_classifier(y_true, sl_preds)

    BASE_FPR = np.linspace(0, 1, 101)
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, sl_preds[:, 1])
    tpr = np.interp(BASE_FPR, fpr, tpr)
    tpr[0] = 0.0

    eval_results['fpr'] = BASE_FPR
    eval_results['roc_curve'] = tpr

    return eval_results, cm
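# Why evaluate_slant interpolates each fold's TPR onto the fixed BASE_FPR
# grid: ROC curves from different folds are sampled at different FPR values,
# so they need a common x-axis before they can be averaged. A minimal sketch
# of that downstream averaging (fold_results is a hypothetical list of
# eval_results dicts returned by evaluate_slant):
def average_roc_curves(fold_results):
    import numpy as np
    mean_tpr = np.mean([r['roc_curve'] for r in fold_results], axis=0)
    return fold_results[0]['fpr'], mean_tpr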
def main(cfg, rep, fold, output_path):

    X = np.load(cfg['features_path'])
    y = np.load(cfg['targets_path'])['y']

    # load train/test split
    data = np.load(cfg['splits_path'])
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    train_ix = train_ix + valid_ix

    Xtrain = X[train_ix, :]
    ytrain = y[train_ix]

    clf = RandomForestClassifier(n_estimators=cfg['n_estimators'],
                                 n_jobs=cfg['n_jobs'],
                                 verbose=cfg['verbose'],
                                 class_weight='balanced')
    clf = clf.fit(Xtrain, ytrain)

    Xtest = X[test_ix, :]
    ytest = y[test_ix]

    # column k must hold the score for class k: the original assignment put
    # the class-1 indicator in column 0, which inverted the evaluation
    yhat = clf.predict(Xtest)
    preds = np.zeros((yhat.shape[0], 2))
    preds[:, 0] = 1 - yhat
    preds[:, 1] = yhat

    print(ytest.shape)
    print(preds.shape)

    r, cm = eval_funcs.eval_classifier(ytest, preds)
    pprint.pprint(r)
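# Hypothetical sketch of the eval_funcs.eval_classifier contract assumed
# throughout this file: integer targets plus an (n_samples, n_classes) score
# matrix in, a metrics dict and a confusion matrix out. The real helper may
# compute more metrics; this only illustrates the interface.
def _eval_classifier_sketch(y_true, preds):
    import numpy as np
    import sklearn.metrics
    yhat = np.argmax(preds, axis=1)
    r = {
        'acc': sklearn.metrics.accuracy_score(y_true, yhat),
        'bacc': sklearn.metrics.balanced_accuracy_score(y_true, yhat),
    }
    cm = sklearn.metrics.confusion_matrix(y_true, yhat)
    return r, cm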
def main(cfg, rep, fold, output_path, print_results=True, return_model=False):
    K.clear_session()

    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    print("Dataset size: %d, Total: %d" %
          (df.shape[0], np.sum(train_ix + valid_ix + test_ix)))

    if not cfg.get("early_stopping", True):
        train_ix = train_ix + valid_ix

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = np.ones_like(train_ix)

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = np.ones_like(test_ix)

    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    print("Train size: %d, valid: %d, test: %d" %
          (train_df.shape[0], valid_df.shape[0], test_df.shape[0]))

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    if cfg.get("bootstrap_training", False):
        print(colored("******** BOOTSTRAPPING TRAINING ***********", "blue"))
        rix = rng.choice(train_df.shape[0], train_df.shape[0], replace=True)
        train_df = train_df.iloc[rix]
        train_Y = train_Y[rix, :]

    fsets, feature_labels = load_features(cfg)

    #
    # NN definition
    #
    input_node = layers.Input(shape=(len(feature_labels), ),
                              name='input_features')
    if cfg['type'] == 'orm':
        linear_layer = layers.Dense(1, activation='linear')
        latent_variable = linear_layer(input_node)
        ordinal_layer = OrdinalLayer(Y.shape[1])
        output_node = ordinal_layer(latent_variable)
    else:
        output_layer = layers.Dense(Y.shape[1], activation='softmax')
        output_node = output_layer(input_node)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=input_node, outputs=output_node)
    model.compile(cfg['optimizer'], loss)

    if cfg.get("trained_model_path", None) is not None:
        if cfg.get("add_repfold_to_trained_model_path", True):
            cfg["trained_model_path"] = "%s_%d_%d" % (
                cfg["trained_model_path"], rep, fold)

    # train
    if cfg.get("train_model", True):
        train_iterator = create_data_iterator(train_df, train_Y, fsets, cfg)
        valid_iterator = create_data_iterator(valid_df, valid_Y, fsets, cfg)

        if cfg.get("early_stopping", True):
            callbacks = [
                keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=cfg['patience'],
                                              restore_best_weights=True)
            ]
        else:
            callbacks = []

        if cfg['epochs'] > 0:
            model.fit_generator(train_iterator(),
                                steps_per_epoch=np.ceil(train_df.shape[0] /
                                                        cfg['batch_size']),
                                epochs=cfg['epochs'],
                                verbose=cfg['verbose'],
                                validation_data=valid_iterator(),
                                validation_steps=np.ceil(valid_df.shape[0] /
                                                         cfg['batch_size']),
                                callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model to %s" % cfg['trained_model_path'])
            model.save_weights(cfg["trained_model_path"])

        if cfg.get("save_tjs", False):
            tfjs.converters.save_keras_model(model, cfg["tjs_path"])

    else:
        print("Loading from %s" % cfg['trained_model_path'])
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    if return_model:
        return model, fsets

    test_iterator = create_data_iterator(test_df, test_Y, fsets, cfg, False)

    preds = model.predict(test_iterator(),
                          steps=np.ceil(test_df.shape[0] / cfg['batch_size']))
    y_target = np.argmax(test_Y, axis=1)

    ix = np.sum(np.isnan(preds), axis=1) > 0
    print("Nan: %d" % np.sum(ix))

    print(test_df[ix])

    r, cm = eval_funcs.eval_classifier(y_target, preds)

    if print_results:
        eval_funcs.print_eval_classifier(r)

    if cfg['type'] == 'orm':
        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 biases=linear_layer.get_weights()[1],
                 weights=linear_layer.get_weights()[0],
                 thresholds=ordinal_layer.get_thresholds(),
                 labels=feature_labels,
                 fold=fold)
    else:

        weights_list = output_layer.get_weights()

        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 weights=weights_list[0],
                 biases=weights_list[1],
                 labels=feature_labels,
                 fold=fold)

    return None
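# Hypothetical sketch of the OrdinalLayer used above, assuming a standard
# cumulative-link (proportional-odds) formulation: a scalar latent variable z
# is cut by K-1 ordered thresholds, P(y <= k) = sigmoid(theta_k - z), and
# class probabilities are consecutive differences of those CDF values. The
# real layer's parameterization may differ; this only illustrates the idea.
import tensorflow as tf

class OrdinalLayerSketch(tf.keras.layers.Layer):
    def __init__(self, n_classes, **kwargs):
        super().__init__(**kwargs)
        self.n_classes = n_classes

    def build(self, input_shape):
        # first cut point plus squared (hence nonnegative) gaps keeps the
        # K-1 thresholds in increasing order
        self.theta0 = self.add_weight(name='theta0', shape=(1,),
                                      initializer='zeros')
        self.gaps = self.add_weight(name='gaps',
                                    shape=(self.n_classes - 2,),
                                    initializer='ones')

    def get_thresholds(self):
        return tf.concat(
            [self.theta0, self.theta0 + tf.cumsum(tf.square(self.gaps))],
            axis=0)

    def call(self, z):
        cdf = tf.sigmoid(self.get_thresholds() - z)    # (batch, K-1)
        lower = tf.concat([tf.zeros_like(cdf[:, :1]), cdf], axis=1)
        upper = tf.concat([cdf, tf.ones_like(cdf[:, :1])], axis=1)
        return upper - lower                           # (batch, K) probabilities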
def main(cfg, rep, fold, output_path, print_results=True):
    
    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']
    
    # load dataset
    df = pd.read_csv(dataset_path)
    
    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])
    
    # read features
    all_features, labels = read_features(cfg)
    
    # load train/test split 
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if not cfg.get("early_stopping", True):
        train_ix = train_ix + valid_ix
        
    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix
    
    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix
        
    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    F = all_features[df['id'], :]
    train_F = F[train_ix, :]
    valid_F = F[valid_ix, :]
    test_F = F[test_ix, :]
    
    if cfg.get("bootstrap_training", False):
        print(colored("******** BOOTSTRAPPING TRAINING ***********", "blue"))
        rix = rng.choice(train_df.shape[0], train_df.shape[0], replace=True)
        train_df = train_df.iloc[rix]
        train_Y = train_Y[rix, :]
        train_F = train_F[rix, :]

    # ordinal model
    input_node = layers.Input(shape=(F.shape[1],))
    if cfg['type'] == 'orm':
        linear_layer = layers.Dense(1, activation='linear')
        latent_variable = linear_layer(input_node)
        ordinal_layer = OrdinalLayer(train_Y.shape[1])
        output_node = ordinal_layer(latent_variable)
    else:
        output_layer = layers.Dense(Y.shape[1], activation='softmax')
        output_node = output_layer(input_node)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'
    
    model = keras.models.Model(inputs=input_node, outputs=output_node)
    
    model.compile(cfg['optimizer'], loss=loss)

    if cfg.get("train_model", True):

        # setup early stopping
        if cfg.get("early_stopping", True):
            callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', 
                patience=cfg['patience'], restore_best_weights=True)]
        else:
            callbacks = []
        
        # train
        model.fit(
            x=train_F,
            y=train_Y,
            batch_size=int(cfg['batch_size_p'] * train_Y.shape[0]),
            epochs=cfg['epochs'],
            verbose=cfg['verbose'],
            validation_data=(valid_F, valid_Y),
            callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])
    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()
    
    preds = model.predict(test_F)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    
    if print_results:
        eval_funcs.print_eval_classifier(r)

    if cfg['type'] == 'orm':
        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 biases=linear_layer.get_weights()[1],
                 weights=linear_layer.get_weights()[0],
                 thresholds=ordinal_layer.get_thresholds(),
                 labels=labels,
                 fold=fold)
    else:
        weights_list = output_layer.get_weights()

        np.savez(output_path,
                 preds=preds,
                 y_target=y_target,
                 cfg=cfg,
                 r=r,
                 cm=cm,
                 rep=rep,
                 weights=weights_list[0],
                 biases=weights_list[1],
                 labels=labels,
                 fold=fold)
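# Hypothetical sketch of the weighted_categorical_xentropy balanced loss
# referenced above: standard categorical cross-entropy reweighted per example
# by the inverse frequency of its class within the batch, so rare classes
# contribute as much to the gradient as common ones. The real loss may
# estimate class weights differently; this only illustrates the idea.
import tensorflow as tf

def weighted_categorical_xentropy_sketch(y_true, y_pred):
    # per-class counts within the batch (epsilon avoids division by zero)
    freq = tf.reduce_sum(y_true, axis=0, keepdims=True) + 1e-7
    # each example's weight is 1 / frequency of its own class
    weights = tf.reduce_sum(y_true / freq, axis=1)
    xent = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    return xent * weights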
def main(cfg, rep, fold, output_path, print_results=True, return_model=False):

    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)
    
    # load input features
    single_gene_spec = [s for s in cfg['spec'] if not s['pairwise']]
    pairwise_gene_spec = [s for s in cfg['spec'] if s['pairwise']]
    single_fsets, single_fsets_shapes = feature_loader.load_feature_sets(
        single_gene_spec, False)
    pairwise_fsets, pairwise_fsets_shapes = feature_loader.load_feature_sets(
        pairwise_gene_spec, False)
    
    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])
    
    # load train/test split 
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix
    
    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix
        
    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    #
    # NN definition
    #

    # single gene features
    inputs_a = nn_arch.create_input_nodes(single_gene_spec, single_fsets_shapes, base_name="a")
    inputs_b = nn_arch.create_input_nodes(single_gene_spec, single_fsets_shapes, base_name="b")
    single_gene_arch = nn_arch.create_input_architecture(output_size=cfg['embedding_size'], 
        output_activation=cfg['embedding_activation'], 
        name='single_input', spec=single_gene_spec)
    output_a = single_gene_arch(inputs_a, name="input_a")
    output_b = single_gene_arch(inputs_b, name="input_b")

    # pairwise features
    inputs_ab = nn_arch.create_input_nodes(pairwise_gene_spec, pairwise_fsets_shapes, base_name="ab")
    pairwise_gene_arch = nn_arch.create_input_architecture(output_size=cfg['embedding_size'], 
        output_activation=cfg['embedding_activation'], 
        name='pairwise_input', spec=pairwise_gene_spec)
    output_ab = pairwise_gene_arch(inputs_ab, name="input_ab")


    merged = nn_arch.concatenate([output_a + output_b, output_ab], name="preoutput")
    output_node = layers.Dense(Y.shape[1], activation='softmax')(merged)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'
    
    model = keras.models.Model(inputs=inputs_a + inputs_b + inputs_ab, outputs=output_node)


    opt = tf.keras.optimizers.Nadam(learning_rate=cfg.get('learning_rate', 0.001))

    model.compile(opt, loss)
    # legacy private-API workaround to keep learning-phase behavior consistent
    model.outputs[0]._uses_learning_phase = True
    
    # train
    if cfg.get("train_model", True):
        
        # create data iterators (necessary because some feature sets are too large to put in ram)
        train_iterator = create_data_iterator(train_df, train_Y, single_fsets, pairwise_fsets, cfg, cfg['scramble'])
        valid_iterator = create_data_iterator(valid_df, valid_Y, single_fsets, pairwise_fsets, cfg, False)

        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=cfg['patience'], restore_best_weights=True)]

        model.fit_generator(train_iterator(),
            steps_per_epoch=np.ceil(train_df.shape[0] / cfg['batch_size']),
            epochs=cfg['epochs'],
            verbose=cfg['verbose'],
            validation_data=valid_iterator(),
            validation_steps=np.ceil(valid_df.shape[0] / cfg['batch_size']),
            callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])

    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()
    
    if return_model:
        return model, [FeatureTransform(single_fsets, pairwise_fsets)]
    test_F = feature_transform(test_df, single_fsets, pairwise_fsets)
    preds = model.predict(test_F)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    
    if print_results:
        eval_funcs.print_eval_classifier(r)
    
    np.savez(output_path,
             preds=preds,
             y_target=y_target,
             cfg=cfg,
             r=r,
             cm=cm,
             rep=rep,
             fold=fold)
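# Design note on the pairwise architecture above: the same single_gene_arch
# is applied to genes a and b and the two embeddings are summed, so the
# prediction is invariant to swapping the two genes. A minimal standalone
# sketch of that symmetry with a shared Dense layer (hypothetical sizes):
def _symmetry_sketch():
    import numpy as np
    import tensorflow as tf

    shared = tf.keras.layers.Dense(8, activation='relu')
    in_a = tf.keras.layers.Input(shape=(4,))
    in_b = tf.keras.layers.Input(shape=(4,))
    out = tf.keras.layers.Dense(2, activation='softmax')(
        shared(in_a) + shared(in_b))
    m = tf.keras.models.Model([in_a, in_b], out)

    xa, xb = np.random.rand(1, 4), np.random.rand(1, 4)
    # swapping the inputs leaves the prediction unchanged
    assert np.allclose(m.predict([xa, xb]), m.predict([xb, xa]))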
def main(cfg, rep, fold, output_path, print_results=True):

    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)
    
    # create output
    Y = keras.utils.to_categorical(df[cfg['target_col']])
    print(Y.shape)
    # load train/test split 
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix
    
    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix
        
    train_df = df.iloc[train_ix]
    train_genes = set(train_df['a_id']) | set(train_df['b_id'])

    valid_df = df.iloc[valid_ix]
    valid_genes = set(valid_df['a_id']) | set(valid_df['b_id'])
    print("in Valid but not train: %d" % len(valid_genes - train_genes))

    test_df = df.iloc[test_ix]

    train_X = [np.array(train_df['a_id']), np.array(train_df['b_id'])]
    valid_X = [np.array(valid_df['a_id']), np.array(valid_df['b_id'])]
    test_X = [np.array(test_df['a_id']), np.array(test_df['b_id'])]
    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    print("Validation sums:")
    print(np.sum(valid_Y, axis=0))
    test_Y = Y[test_ix, :]

    #
    # NN definition
    #

    # single gene features
    n_genes = np.maximum(np.max(df['a_id']), np.max(df['b_id'])) + 1
   
    emb = tf.keras.layers.Embedding(n_genes, cfg['embedding_size'])

    input_a = tf.keras.layers.Input(shape=(1,))
    input_b = tf.keras.layers.Input(shape=(1,))

    embd_a = emb(input_a)
    embd_b = emb(input_b)
    
    merged = embd_a + embd_b
    merged = tf.squeeze(merged, axis=1)
    output_node = layers.Dense(Y.shape[1], activation='softmax')(merged)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'
    
    model = keras.models.Model(inputs=[input_a, input_b], outputs=output_node)
    model.compile(cfg['optimizer'], loss)
    model.summary()
    # legacy private-API workaround to keep learning-phase behavior consistent
    model.outputs[0]._uses_learning_phase = True
    
    # train
    if cfg.get("train_model", True):
        
        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=cfg['patience'], restore_best_weights=True)]

        print("Batch size: %d" % int(cfg['batch_size_p'] * train_Y.shape[0]))
        model.fit(
            train_X,
            train_Y,
            epochs=cfg['epochs'],
            verbose=cfg['verbose'],
            validation_data=(valid_X, valid_Y),
            batch_size=int(cfg['batch_size_p'] * train_Y.shape[0]),
            callbacks=callbacks
        )
       
        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])

    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()
    
    preds = model.predict(test_X)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)
    
    if print_results:
        eval_funcs.print_eval_classifier(r)
    
    np.savez(output_path,
             preds=preds,
             y_target=y_target,
             cfg=cfg,
             r=r,
             cm=cm,
             rep=rep,
             fold=fold)
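# Shape note for the embedding model above: with Input(shape=(1,)), the
# shared Embedding returns (batch, 1, embedding_size), so the singleton
# sequence axis must be squeezed before the Dense softmax. A quick
# standalone check (hypothetical sizes):
def _embedding_shape_sketch():
    import numpy as np
    import tensorflow as tf

    emb = tf.keras.layers.Embedding(10, 4)
    out = emb(np.array([[3], [7]]))                  # shape (2, 1, 4)
    print(out.shape, tf.squeeze(out, axis=1).shape)  # -> (2, 1, 4) (2, 4)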
def main(cfg, rep, fold, output_path, print_results=True):

    dataset_path = cfg['task_path']
    train_test_path = cfg['splits_path']

    # load dataset
    df = pd.read_csv(dataset_path)

    # create output
    Y = np_utils.to_categorical(df[cfg['target_col']])

    # load input features
    single_gene_spec = [s for s in cfg['spec'] if not s['pairwise']]
    single_fsets, single_fsets_shapes = feature_loader.load_feature_sets(
        single_gene_spec, scramble=False)

    # load train/test split
    data = np.load(train_test_path)
    train_sets = data['train_sets']
    valid_sets = data['valid_sets']
    test_sets = data['test_sets']

    # create training and testing data frames
    train_ix = train_sets[rep, fold, :]
    valid_ix = valid_sets[rep, fold, :]
    test_ix = test_sets[rep, fold, :]

    if cfg.get("train_on_full_dataset", False):
        print(colored("******** TRAINING ON FULL DATASET ***********", "red"))
        train_ix = train_ix + test_ix

    if cfg.get("test_on_full_dataset", False):
        print(colored("******** TESTING ON FULL DATASET ***********", "green"))
        test_ix = train_ix + test_ix + valid_ix

    train_df = df.iloc[train_ix]
    valid_df = df.iloc[valid_ix]
    test_df = df.iloc[test_ix]

    train_Y = Y[train_ix, :]
    valid_Y = Y[valid_ix, :]
    test_Y = Y[test_ix, :]

    # setup feature sets
    train_fsets = [
        single_fsets[i][train_df['id'], :] for i in range(len(single_fsets))
    ]
    if cfg['scramble']:
        rix = rng.permutation(train_df.shape[0])
        train_fsets = [f[rix, :] for f in train_fsets]
    valid_fsets = [
        single_fsets[i][valid_df['id'], :] for i in range(len(single_fsets))
    ]
    test_fsets = [
        single_fsets[i][test_df['id'], :] for i in range(len(single_fsets))
    ]

    #
    # NN definition
    #

    # single gene features
    inputs_a = nn_arch.create_input_nodes(single_gene_spec,
                                          single_fsets_shapes,
                                          base_name="a")
    single_gene_arch = nn_arch.create_input_architecture(
        output_size=cfg['embedding_size'],
        output_activation=cfg['embedding_activation'],
        name='single_input',
        spec=single_gene_spec)
    output_a = single_gene_arch(inputs_a, name="input_a")
    output_node = layers.Dense(Y.shape[1], activation='softmax')(output_a)

    if cfg['balanced_loss']:
        loss = weighted_categorical_xentropy
    else:
        loss = 'categorical_crossentropy'

    model = keras.models.Model(inputs=inputs_a, outputs=output_node)

    opt = tf.keras.optimizers.Nadam(
        learning_rate=cfg.get('learning_rate', 0.001))
    model.compile(opt, loss=loss)

    if cfg.get("train_model", True):
        # setup early stopping
        callbacks = [
            keras.callbacks.EarlyStopping(monitor='val_loss',
                                          patience=cfg['patience'],
                                          restore_best_weights=True)
        ]
        # train
        model.fit(x=train_fsets,
                  y=train_Y,
                  batch_size=int(cfg['batch_size_p'] * train_Y.shape[0]),
                  epochs=cfg['epochs'],
                  verbose=cfg['verbose'],
                  validation_data=(valid_fsets, valid_Y),
                  callbacks=callbacks)

        if cfg.get("trained_model_path", None) is not None:
            print("Saving model")
            model.save_weights(cfg["trained_model_path"])

    else:
        model.load_weights(cfg["trained_model_path"]).expect_partial()

    preds = model.predict(test_fsets)
    y_target = np.argmax(test_Y, axis=1)

    r, cm = eval_funcs.eval_classifier(y_target, preds)

    if print_results:
        eval_funcs.print_eval_classifier(r)

    np.savez(output_path,
             preds=preds,
             y_target=y_target,
             cfg=cfg,
             r=r,
             cm=cm,
             rep=rep,
             fold=fold)
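# Design note on the 'scramble' option above: permuting only the training
# feature rows (labels stay fixed) severs the feature-label association, so
# the scrambled model serves as a null baseline that an informative feature
# set should beat. A minimal sketch of the permutation logic (hypothetical
# data):
def _scramble_sketch():
    import numpy as np

    rng = np.random.default_rng(0)
    F = np.arange(12).reshape(6, 2)    # features, one row per example
    y = np.arange(6)                   # labels are left untouched
    rix = rng.permutation(F.shape[0])
    F_scrambled = F[rix, :]            # rows shuffled independently of y
    return F_scrambled, y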