Example #1
from os.path import basename, isfile
from typing import Any, Dict, List, Union

from comet_ml import Experiment


class CometSession(Session):  # `Session` is assumed to come from the surrounding project
    com_ex: Experiment

    def __init__(self, source_paths: Union[List[str], str], **kwargs) -> None:
        self.com_ex = Experiment(**kwargs)
        if isinstance(source_paths, str):
            source_paths = [source_paths]
        for path in source_paths:
            if isfile(path):
                # log_code expects the source as a string, not a file object
                with open(path, mode="r") as fs:
                    self.com_ex.log_code(code_name=basename(path), code=fs.read())
            else:
                print(f"CometSession: Warning, no such file - {path}")

    def log_parameters(self, params: Dict[str, Any]) -> None:
        self.com_ex.log_parameters(params)

    def log_metric(self, val_name: str, value: Any) -> None:
        self.com_ex.log_metric(val_name, value)
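
A minimal usage sketch (not part of the original snippet): it assumes the Comet API key is supplied via the COMET_API_KEY environment variable, and the file names and values below are placeholders.

# Hypothetical usage of CometSession
session = CometSession(["train.py", "model.py"], project_name="my-project")
session.log_parameters({"lr": 1e-3, "batch_size": 32})
session.log_metric("val_loss", 0.42)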
Example #2
# -*- coding: utf-8 -*-
"""
Created on Thu May 30 16:05:37 2019

@author: gdrei
"""
from comet_ml import Experiment
# pass log_code=True to the constructor; assigning to experiment.log_code
# would overwrite the log_code() method instead of enabling it
experiment = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                        project_name="general_test", workspace="gdreiman1",
                        log_code=True)
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sklearn
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
import matplotlib.pyplot as plt
import pickle
#%%
def comet_SVM(save_path):
    #load the pickled activity table
    with open(save_path, 'rb') as pickle_off:
        activity_table = pickle.load(pickle_off)
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])
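
The snippet above imports ConvertToNumpyArray but is cut off before it is used. As a hedged sketch of the usual conversion (the molecule and fingerprint radius below are placeholders, not values from the original):

mol = Chem.MolFromSmiles('CCO')  # placeholder molecule
fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=fp_length)  # hypothetical radius/size
arr = np.zeros((fp_length,))
ConvertToNumpyArray(fp, arr)  # fills arr in place with the fingerprint bits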
Example #3
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq,
          customize):
    """Train a model defined by config"""

    try:
        from comet_ml import Experiment
        experiment = Experiment(
            project_name="particleflow-tf",
            auto_metric_logging=True,
            auto_param_logging=True,
            auto_histogram_weight_logging=True,
            auto_histogram_gradient_logging=False,
            auto_histogram_activation_logging=False,
        )
    except Exception as e:
        print("Failed to initialize comet-ml dashboard: {}".format(e))
        experiment = None
    config_file_path = config
    config, config_file_stem = parse_config(config,
                                            nepochs=nepochs,
                                            weights=weights)

    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        config = customization_functions[customize](config)

    if recreate or (weights is None):
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_",
                                       suffix=platform.node())
    else:
        outdir = str(Path(weights).parent)

    # Decide tf.distribute.strategy depending on number of available GPUs
    strategy, num_gpus = get_strategy()
    #if "CPU" not in strategy.extended.worker_devices[0]:
    #    nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir)
    #    p = subprocess.Popen(shlex.split(nvidia_smi_call))

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"],
                                             config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"],
                                           config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_dataset"], config, num_gpus, "test",
        config["setup"]["num_events_validation"])
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if experiment:
        experiment.set_name(outdir)
        experiment.log_code("mlpf/tfmodel/model.py")
        experiment.log_code("mlpf/tfmodel/utils.py")
        experiment.log_code(config_file_path)

    # Copy the config file to the train dir for later reference
    shutil.copy(config_file_path, outdir + "/config.yaml")

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(config,
                                                       steps=total_steps)
        opt = get_optimizer(config, lr_schedule)

        if config["setup"]["dtype"] == "float16":
            model_dtype = tf.dtypes.float16
            policy = mixed_precision.Policy("mixed_float16")
            mixed_precision.set_global_policy(policy)
            opt = mixed_precision.LossScaleOptimizer(opt)
        else:
            model_dtype = tf.dtypes.float32

        model = make_model(config, model_dtype)

        # Build the layers after the element and feature dimensions are specified
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        initial_epoch = 0
        if weights:
            # We need to load the weights in the same trainable configuration as the model was set up
            configure_model_weights(
                model, config["setup"].get("weights_config", "all"))
            model.load_weights(weights, by_name=True)
            initial_epoch = int(weights.split("/")[-1].split("-")[1])
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        config = set_config_loss(config, config["setup"]["trainable"])
        configure_model_weights(model, config["setup"]["trainable"])
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        print("model weights")
        tw_names = [m.name for m in model.trainable_weights]
        for w in model.weights:
            print("layer={} trainable={} shape={} num_weights={}".format(
                w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ] + [
                    SingleClassRecall(
                        icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                    for icls in range(config["dataset"]["num_output_classes"])
                ]
            },
        )
        model.summary()

    callbacks = prepare_callbacks(config["callbacks"],
                                  outdir,
                                  ds_val,
                                  ds_info,
                                  comet_experiment=experiment)
    callbacks.append(optim_callbacks)

    fit_result = model.fit(
        ds_train.repeat(),
        validation_data=ds_test.repeat(),
        epochs=initial_epoch + config["setup"]["num_epochs"],
        callbacks=callbacks,
        steps_per_epoch=num_train_steps,
        validation_steps=num_test_steps,
        initial_epoch=initial_epoch,
    )

    history_path = Path(outdir) / "history"
    history_path = str(history_path)
    with open("{}/history.json".format(history_path), "w") as fi:
        json.dump(fit_result.history, fi)

    weights = get_best_checkpoint(outdir)
    print("Loading best weights that could be found from {}".format(weights))
    model.load_weights(weights, by_name=True)

    model.save(outdir + "/model_full", save_format="tf")

    print("Training done.")
Example #4
def comet_Fold(save_path, embedding_type, model_type, bin_labels):
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="80_10_baseline",
                     workspace="gdreiman1",
                     disabled=False,
                     log_code=True)
    #turn down comet's file logging chatter
    import os
    os.environ['COMET_LOGGING_FILE_LEVEL'] = 'WARNING'
    import warnings
    warnings.filterwarnings('ignore')
    import pickle
    import pandas as pd
    import numpy as np
    import sklearn as sklearn
    from sklearn.metrics import roc_curve
    from sklearn.metrics import precision_recall_fscore_support as prf
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from thundersvm import SVC
    import matplotlib.pyplot as plt
    import seaborn as sns
    import ntpath
    from pathlib import Path
    from imblearn.over_sampling import RandomOverSampler
    #choosing a 3:1 Inactive to Active ratio (sampling_strategy = actives/inactives = 0.33)
    ros = RandomOverSampler(sampling_strategy=0.33)
    from imblearn.under_sampling import RandomUnderSampler
    rus = RandomUnderSampler(sampling_strategy=0.33)
    import tensorflow as tf
    tf.logging.set_verbosity(tf.logging.ERROR)

    #        tf.enable_eager_execution()
    #        from keras import backend as K
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, GaussianNoise
    from tensorflow.keras.layers import Lambda
    from tensorflow.keras.utils import to_categorical
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    '''Comet Saving Zone'''
    def comet_addtional_info(exp, save_path, metrics_dict, X_test, y_test,
                             embedding_type, model_type):
        #get base file name
        folder, base = ntpath.split(save_path)
        #split the file name at the last '_'; assumes files are saved as AID_xxx_endinfo.pkl
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        #save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        #save some informative tags:
        tags = [AID, end_info, model_type]
        exp.add_tags(tags)
        exp.add_tag(embedding_type)
        #save metrics_dict in data_folder keyed to the comet experiment number
        exp_num = exp.get_key()
        model_save = Path(folder + '/' + model_type + '_' + embedding_type +
                          '_' + exp_num + 'metrics_dict.pkl')
        pickle_on = open(model_save, 'wb')
        pickle.dump(metrics_dict, pickle_on)
        pickle_on.close()
        #log trained model location
        exp.log_other('Metrics Dict Path', model_save)
        #tell comet that the experiment is over
        exp.end()

    def get_Scaled_Data(train_ind, test_ind, X_mfp, activity_table, labels,
                        bin_labels):
        #get start and end index for molchars
        MC_start = activity_table.columns.get_loc('Chi0')
        #need to add 1 bc exclusive indexing
        MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
        # standardize data
        scaler = StandardScaler(copy=False)
        #return requested datatype
        if embedding_type == 'MFPMolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind,
                                             MC_start:MC_end]).astype(float))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind,
                                             MC_start:MC_end]).astype(float))
            X_train = np.concatenate(
                (X_mfp[train_ind, :], X_train_molchars_std), axis=1)
            X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std),
                                    axis=1)
        elif embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        elif embedding_type == 'MolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind,
                                             MC_start:MC_end]).astype(float))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind,
                                             MC_start:MC_end]).astype(float))
            X_train = X_train_molchars_std
            X_test = X_test_molchars_std
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #remapping active to 1 and everything else to zero
        bin_y_train, bin_y_test = np.array([
            1 if x == 0 else 0 for x in y_train
        ]), np.array([1 if x == 0 else 0 for x in y_test])
        if bin_labels:
            y_test = bin_y_test
            y_train = bin_y_train
        return X_train, X_test, y_train, y_test

    def train_SVM(X_train, X_test, y_train, y_test, split_ID):
        sgd_linear_SVM = SGDClassifier(loss='hinge',
                                       penalty='l2',
                                       alpha=0.0001,
                                       l1_ratio=0.15,
                                       fit_intercept=True,
                                       max_iter=500000,
                                       tol=0.001,
                                       shuffle=True,
                                       verbose=0,
                                       epsilon=0.1,
                                       n_jobs=-1,
                                       random_state=None,
                                       learning_rate='optimal',
                                       eta0=0.0,
                                       power_t=0.5,
                                       early_stopping=False,
                                       validation_fraction=0.1,
                                       n_iter_no_change=5,
                                       class_weight='balanced',
                                       warm_start=False,
                                       average=False)
        sgd_linear_SVM_model = sgd_linear_SVM.fit(X_train, y_train)

        sgd_lSVM_preds = sgd_linear_SVM_model.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, sgd_lSVM_preds, average=None)
        class_rep = sklearn.metrics.classification_report(
            y_test, sgd_lSVM_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, sgd_lSVM_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(sgd_linear_SVM_model.get_params())
        return prec, rec, f_1, supp, mcc

    def train_kSVM(X_train, X_test, y_train, y_test, split_ID):
        kSVM = SVC(kernel='rbf',
                   degree=3,
                   gamma='auto',
                   coef0=0.0,
                   C=1.0,
                   tol=0.001,
                   probability=False,
                   class_weight='balanced',
                   shrinking=False,
                   cache_size=None,
                   verbose=False,
                   max_iter=-1,
                   n_jobs=-1,
                   max_mem_size=-1,
                   random_state=None,
                   decision_function_shape='ovo')
        kSVM_model = kSVM.fit(X_train, y_train)

        kSVM_preds = kSVM_model.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, kSVM_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, kSVM_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, kSVM_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(kSVM_model.get_params())
        return prec, rec, f_1, supp, mcc

    def train_RF(X_train, X_test, y_train, y_test, split_ID):

        rf = RandomForestClassifier(n_estimators=100,
                                    random_state=2562,
                                    class_weight="balanced_subsample",
                                    n_jobs=-1)
        rand_for = rf.fit(X_train, y_train)
        rf_preds = rand_for.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, rf_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, rf_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, rf_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(rand_for.get_params())
        return prec, rec, f_1, supp, mcc

    def train_LGBM(X_train, X_test, y_train, y_test, split_ID):
        import lightgbm as lgb
        #make model class
        lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                        num_leaves=31,
                                        max_depth=-1,
                                        learning_rate=0.1,
                                        n_estimators=500,
                                        subsample_for_bin=200000,
                                        objective='binary',
                                        is_unbalance=True,
                                        min_split_gain=0.0,
                                        min_child_weight=0.001,
                                        min_child_samples=20,
                                        subsample=1.0,
                                        subsample_freq=0,
                                        colsample_bytree=1.0,
                                        reg_alpha=0.0,
                                        reg_lambda=0.0,
                                        random_state=None,
                                        n_jobs=-1,
                                        silent=True,
                                        importance_type='split')
        #train model
        lgbm = lgbm_model.fit(X_train, y_train)
        lgbm_preds = lgbm.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, lgbm_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, lgbm_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, lgbm_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(lgbm.get_params())
        return prec, rec, f_1, supp, mcc

    def train_DNN(X_train, X_test, y_train, y_test, split_ID):
        #        import tensorflow as tf
        ##        tf.enable_eager_execution()
        ##        from keras import backend as K
        #        from tensorflow.keras.models import Sequential
        #        from tensorflow.keras.layers import Dense, Dropout, GaussianNoise
        #        from tensorflow.keras.layers import Lambda
        #        from tensorflow.keras.utils import to_categorical
        #        def focal_loss(y_true, y_pred):
        #            gamma = 2.0
        #            alpha = 0.25
        #            pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        #            pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        #    #        pt_1 = K.clip(pt_1, 1e-3, .999)
        #    #        pt_0 = K.clip(pt_0, 1e-3, .999)
        #
        #            return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log( pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0 ))

        #bias for predictions
        fl_pi = 0.01
        final_bias = -np.log((1 - fl_pi) / fl_pi)
        num_labels = len(set(y_test))
        from sklearn.utils import class_weight
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(y_train), y=y_train)
        tf.keras.backend.clear_session()
        tf.logging.set_verbosity(tf.logging.ERROR)

        fast_NN = Sequential(name='quick')

        fast_NN.add(Dense(512, activation='sigmoid', name='input'))
        #        fast_NN.add(GaussianNoise(.5))
        #fast_NN.add(Dropout(0.5))
        fast_NN.add(
            Dense(128,
                  activation='relu',
                  name='first',
                  bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        #        fast_NN.add(Dropout(0.5))
        fast_NN.add(
            Dense(64,
                  activation='relu',
                  name='second',
                  bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        #fast_NN.add(Dropout(0.5))
        fast_NN.add(
            Dense(16,
                  activation='relu',
                  name='third',
                  bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        #fast_NN.add(Dropout(0.25))
        fast_NN.add(
            Dense(num_labels,
                  activation='softmax',
                  name='predict',
                  bias_initializer=tf.keras.initializers.Constant(
                      value=final_bias)))
        fast_NN.compile(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['categorical_accuracy'])
        fast_NN_model = fast_NN.fit(X_train,
                                    to_categorical(y_train),
                                    validation_data=(X_test,
                                                     to_categorical(y_test)),
                                    epochs=5,
                                    batch_size=500,
                                    class_weight=dict(enumerate(class_weights)),
                                    shuffle=True,
                                    verbose=0)
        NN_test_preds = fast_NN.predict(X_test)
        prec, rec, f_1, supp = prf(y_test,
                                   np.argmax(NN_test_preds, axis=1),
                                   average=None)
        class_rep = sklearn.metrics.classification_report(
            y_test, np.argmax(NN_test_preds, axis=1))
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(
            y_test, np.argmax(NN_test_preds, axis=1))
        tf.reset_default_graph()
        #if first iteration, report model parameters to comet
        if split_ID == '0':
            #log the network configuration (a copy-paste slip here logged lgbm instead)
            exp.log_parameters(fast_NN.get_config())
        return prec, rec, f_1, supp, mcc

    #from https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys

    def flatten(d, parent_key='', sep='_'):
        import collections
        items = []
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, collections.abc.MutableMapping):
                items.extend(flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)
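    # Illustrative example (not in the original):
    #   flatten({'a': {'b': 1}, 'c': 2}) -> {'a_b': 1, 'c': 2}
    # handy when logging nested configs as flat Comet parameters.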

    def calc_and_save_metrics(X_train, X_test, y_train, y_test, split_index,
                              model_type, embedding_type, AID, metric_names,
                              metric_dict_list, split_info, split_num,
                              little_split_num):
        '''Takes in test and train data + labels, computes metrics and saves them
        as a dict inside of the provided list. Returns this list.'''
        prec, rec, f_1, supp, mcc = classifier_train(X_train, X_test, y_train,
                                                     y_test, split_info)
        results_array = np.concatenate((prec, rec, f_1, supp)).tolist() + [mcc]
        if little_split_num == 'NaN':
            split_size = '80%'
        else:
            split_size = '10%'
        metric_dict_list.append(
            dict(
                zip(metric_names, [
                    model_type, embedding_type, AID, split_num,
                    little_split_num, split_size, split_index, split_info
                ] + results_array)))
        return metric_dict_list

    '''Begin the actual experiment'''
    #get data cleaned
    pickle_off = open(save_path, 'rb')
    activity_table = pickle.load(pickle_off)
    pickle_off.close()
    #get AID
    folder, base = ntpath.split(save_path)
    #split the file name at the last '_'; assumes files are saved as AID_xxx_endinfo.pkl
    AID, _, end_info = base.rpartition('_')
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])
    #reshape mfp
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data:
    from sklearn.model_selection import StratifiedShuffleSplit
    #outer splitter: five stratified 80/20 shuffle splits
    big_splitter = StratifiedShuffleSplit(n_splits=5,
                                          test_size=0.2,
                                          random_state=2562)
    #inner splitter: replicates training with 10% of the data (12.5% of the initial 80% split)
    little_splitter = StratifiedShuffleSplit(n_splits=8,
                                             test_size=0.2,
                                             train_size=0.125,
                                             random_state=2562)
    #this holds all the metrics values that will be stored in comet
    metric_names = [
        'Classifier', 'Embedding', 'AID', '80% Split Number',
        '10% Split Number', 'Train Split Size', 'ID', 'Split Info',
        'prec_Inactive', 'prec_Active', 'rec_Inactive', 'rec_Active',
        'f_1_Inactive', 'f_1_Active', 'supp_Inactive', 'supp_Active', 'mcc'
    ]

    #determine model type
    classifier_dict = {
        'SVM': train_SVM,
        'RF': train_RF,
        'LGBM': train_LGBM,
        'DNN': train_DNN,
        'kSVM': train_kSVM
    }
    #set dummy variable to func that trains specified model
    classifier_train = classifier_dict[model_type]
    metric_dict_list = []
    #using labels as a dummy for X
    for split_num, [train_ind,
                    test_ind] in enumerate(big_splitter.split(labels, labels)):
        #index records which split the data comes from, as big.little
        split_index = str(split_num)
        little_split_num = 'NaN'
        '''Regular Sample'''
        split_info = 'Split' + split_index + ' 80% train' + 'BaseRatio'
        #get test/train index
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        #train on the unresampled data and get back classwise metrics
        metric_dict_list = calc_and_save_metrics(
            X_train, X_test, y_train, y_test, split_index,
            model_type, embedding_type, AID, metric_names, metric_dict_list,
            split_info, split_num, little_split_num)
        '''Over Sample'''
        split_info = 'Split' + split_index + ' 80% train' + 'OverSample'
        #get test/train index
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        #train model and get back classwise metrics
        over_X_train, over_y_train = ros.fit_resample(X_train, y_train)
        metric_dict_list = calc_and_save_metrics(
            over_X_train, X_test, over_y_train, y_test, split_index,
            model_type, embedding_type, AID, metric_names, metric_dict_list,
            split_info, split_num, little_split_num)
        '''Under Sample'''
        split_info = 'Split' + split_index + ' 80% train' + 'UnderSample'
        #get test/train index
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        #train model and get back classwise metrics
        under_X_train, under_y_train = rus.fit_resample(X_train, y_train)
        #print('active ratio is:',sum(under_y_train)/len(under_y_train))
        metric_dict_list = calc_and_save_metrics(
            under_X_train, X_test, under_y_train, y_test, split_index,
            model_type, embedding_type, AID, metric_names, metric_dict_list,
            split_info, split_num, little_split_num)

        for little_split_num, [little_train_ind, little_test_ind] in enumerate(
                little_splitter.split(labels[train_ind], labels[train_ind])):
            split_index = str(split_num) + '.' + str(little_split_num)
            '''Regular Sample'''
            split_info = 'Split' + split_index + ' 10% train' + 'BaseRatio'
            #get test/train index
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                little_train_ind, test_ind, X_mfp, activity_table, labels,
                bin_labels)
            #train on the unresampled data and get back classwise metrics
            #check if train_split contains both positive and negative labels
            if len(set(y_train)) == 2:
                metric_dict_list = calc_and_save_metrics(
                    X_train, X_test, y_train, y_test, split_index,
                    model_type, embedding_type, AID, metric_names,
                    metric_dict_list, split_info, split_num, little_split_num)
            else:
                print('Skipped ' + split_info)
            '''Over Sample'''
            #get test/train index
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                little_train_ind, test_ind, X_mfp, activity_table, labels,
                bin_labels)
            over_X_train, over_y_train = ros.fit_resample(X_train, y_train)
            split_info = 'Split' + split_index + ' 10% train' + 'OverSample'
            #train model and get back classwise metrics
            #check if train_split contains both positive and negative labels
            if len(set(y_train)) == 2:
                metric_dict_list = calc_and_save_metrics(
                    over_X_train, X_test, over_y_train, y_test, split_index,
                    model_type, embedding_type, AID, metric_names,
                    metric_dict_list, split_info, split_num, little_split_num)
            else:
                print('Skipped ' + split_info)
            '''UnderSample'''
            #get test/train index
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                little_train_ind, test_ind, X_mfp, activity_table, labels,
                bin_labels)
            under_X_train, under_y_train = rus.fit_resample(X_train, y_train)
            split_info = 'Split' + split_index + ' 10% train' + 'UnderSample'
            #train model and get back classwise metrics
            #check if train_split contains both positive and negative labels
            if len(set(y_train)) == 2:
                metric_dict_list = calc_and_save_metrics(
                    under_X_train, X_test, under_y_train, y_test, split_index,
                    model_type, embedding_type, AID, metric_names,
                    metric_dict_list, split_info, split_num, little_split_num)
            else:
                print('Skipped ' + split_info)
    # now convert metric_dict_list to df:
    metrics_df = pd.DataFrame(metric_dict_list)
    #set Split_ID to index
    #now plot all the columns
    #first make a new df column to ID things as either split
    #    cols_to_plot = ['prec_Inactive','prec_Active','rec_Inactive','rec_Active','f_1_Inactive','f_1_Active','supp_Inactive','supp_Active','mcc']
    #    #turn off plotting
    #    plt.ioff()
    #    for metric in cols_to_plot:
    #        #make sns boxplot
    #        ax = sns.boxplot(x='Split Info', y=metric,  data=metrics_df)
    #        ax.set_xticklabels(ax.get_xticklabels(),rotation=30)
    #        plt.tight_layout()
    #        #log the plot
    #        exp.log_figure()
    #        plt.clf()
    #    ''' now we're going to go through and calculate means and stds for 3 diff groups
    #        1) the 5 80% train runs
    #        2) the 5 sets of 8 10% runs
    #        3) the 40 total 10% runs
    #        we save each in a list as a pd Series with a name explaining the contents'''
    #
    #    #now add list of dicts of averages to metrics df
    #    #convert metrics_df to metric dict and log it
    #
    #    #save metric_df to current folder
    comet_addtional_info(exp, save_path, metrics_df, X_test, y_test,
                         embedding_type, model_type)
    return metrics_df
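
A toy illustration (not from the original code) of the inner splitter above: with train_size=0.125 it trains on 12.5% of whatever it is given, which is 10% of the full data when fed the 80% training portion.

from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

y = np.array([0] * 90 + [1] * 10)  # toy labels, 10% active
little = StratifiedShuffleSplit(n_splits=2, test_size=0.2, train_size=0.125,
                                random_state=2562)
for train_idx, test_idx in little.split(y, y):
    print(len(train_idx), len(test_idx))  # ~12 train / 20 test per split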
Example #5
def main(repetition_number):
    '''import'''
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="iter_baseline",
                     workspace="gdreiman1",
                     disabled=False,
                     log_code=True)
    exp.log_other('Hypothesis','''Taking 5% batches, 80% of batch is the top ranked compounds, remaining 20% is diverse selection for first 5
                  iterations, then reverts to random sampling''')
    import pickle, sys, os
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
    import numpy as np
    from Iterative_help_funcs import get_Scaled_Data,train_SVM,train_DNN,train_RF,train_LGBM,calc_and_save_metrics,train_PyTorchGCNN
    from imblearn.over_sampling import RandomOverSampler
    #choosing a 3:1 Inactive to Active ratio
    ros = RandomOverSampler(sampling_strategy= 0.33)
    import pandas as pd
    from joblib import Parallel, delayed
    from joblib.externals.loky import set_loky_pickler
    from joblib import parallel_backend
    import tensorflow as tf
    tf.logging.set_verbosity(tf.logging.ERROR)
    
    #%% 
    '''Load data'''
    
    AID_list =['AID_1345083','AID_624255','AID_449739','AID_995','AID_938','AID_628','AID_596','AID_893','AID_894']
    #AID_list =['AID_893','AID_894']
    
    #This will hold a series of dicts of metrics that we then build into a dataframe
    metric_dict_list = []
    multi_dump_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/',
                                   'ranked_diverse_run' + str(repetition_number) + '.pkl')
    exp.log_other('Metrics Dict Path',multi_dump_path)
    #using the longest, most informative embeddings
          
    classifier_dict = {'SVM': train_SVM, 'RF': train_RF, 'LGBM':train_LGBM,'DNN':train_DNN,'GCNN_pytorch':train_PyTorchGCNN}
    
    model_list = ['GCNN_pytorch','SVM','RF','LGBM','DNN']
    model_list = ['RF']  #overrides the full list above; only RF runs here

    num_models = len(model_list)
    mmp = MaxMinPicker()
    #define how we select after the initial training run
    selection_type = 'Diverse'
    #size of each later iteration, relative to the initial 10% train set
    iterRel2Start = 0.5
    end_iter = 10
    for AID in AID_list:
            if 'win' in sys.platform:
                AID_path = os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID) 
            else:
                AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', AID) 
            save_path = AID_path+ '/' + AID +'graph_processed.pkl'
            pickle_off = open(save_path,'rb')
            activity_table=pickle.load(pickle_off)
            pickle_off.close()
            
            '''Pick diverse starting point of 10% of library'''
            fplist = [x for x in activity_table['bit_MFP']]
            '''start_indexs holds the indexes of molecules already scanned at the
            start of each iteration. So for the first iter it hold the diversity selection. 
            For the second, it holds both the diversity selection and the molecules 
            screened based on the results of the first training iteration etc'''
            start_index_meta_dict = {}
            for i in range(repetition_number):

                start_indexs = np.array(mmp.LazyBitVectorPick(fplist, len(fplist), int(len(fplist) / 10)))
                '''store in a list that will vary as each model makes its predictions'''
                start_ind_list = [start_indexs for i in range(num_models)]
                index_name = "starting_sample_" + str(i)
                start_index_meta_dict[index_name] = start_ind_list
            diverse_size_list = [0 for i in range(num_models)]
            fp_metalist = [fplist for i in range(num_models)]
            library_size = len(fplist)
            iter_num = 0 
            while iter_num < end_iter:
                print("Beginning Iteration ",iter_num)
                if iter_num < 5:
                    selection_type = 'Diverse'
                else:
                    selection_type = 'Random'
                        
                '''run thru models and get their preds for this iter'''
                for list_idx,[model_type,start_indexs] in enumerate(zip(model_list,start_ind_list)):
                    '''Get data for the starting molecules, it will be graphs for GCNN, else the MFP_MolChars'''
                    test_index = list(set(activity_table.index)-set(start_indexs))
                    #check that we haven't exceeded 50% of library
                    if len(test_index) > int(0.5 *library_size):
                        if model_type == 'GCNN_pytorch':
                            embedding_type = 'Graph'
                            X_train,X_test,y_train,y_test = get_Scaled_Data(start_indexs,test_index,activity_table,True,embedding_type)
                        else:
                            embedding_type = 'MFPMolChars'                  
                            X_train,X_test,y_train,y_test = get_Scaled_Data(start_indexs,test_index,activity_table,True,embedding_type)
                        #oversample to 2:1 Inactive to Active
                        '''ros errors out if the class ratio is already better
                        than the target, so only oversample when needed'''
                        #need to ros iff the active fraction is below 25% (worse than 3:1)
                        if (sum(y_train) / len(y_train)) < 0.25:
                            if model_type == 'GCNN_pytorch':
                                '''ros doesn't like the data apparently'''
                                over_X_train,over_y_train = ros.fit_resample(np.arange(len(X_train)).reshape((-1,1)),y_train)
                                over_X_train = over_X_train.reshape(-1)
                                over_X_train = [X_train[i] for i in over_X_train]
                            else:
                                over_X_train,over_y_train = ros.fit_resample(X_train,y_train)
                        else:
                            #just use the current, already enriched sample
                            over_X_train, over_y_train = X_train, y_train
                        '''Inital train run'''
                        train_and_predict_model = classifier_dict[model_type]
                        #have this split here so that I can deal w fact that DNNs
                        #are now returning the history
                        if model_type =='DNN' or model_type=='GCNN_pytorch':
                            train_predicted_probs,test_predicted_probs,base_test_predicted_probs,hist = train_and_predict_model(over_X_train,X_test,over_y_train,y_test,X_train)
                        else:
                            train_predicted_probs,test_predicted_probs,base_test_predicted_probs = train_and_predict_model(over_X_train,X_test,over_y_train,y_test,X_train)                    
                            hist = None
                        metric_dict_list = calc_and_save_metrics(y_test,test_predicted_probs,model_type,
                                              embedding_type,AID,metric_dict_list,iter_num,'test',hist)
                        metric_dict_list = calc_and_save_metrics(over_y_train,train_predicted_probs,model_type,
                                              embedding_type,AID,metric_dict_list,iter_num,'train')
                        metric_dict_list = calc_and_save_metrics(y_train,base_test_predicted_probs,model_type,
                                              embedding_type,AID,metric_dict_list,iter_num,'base_train')
                        '''Now select next 5% section'''
                        '''Put labels and preds in df, sort them. take top 80% of tier size of the top predictions
                        then do a diverse selection or random selection for remaining 20% more'''
                        preds_df = pd.DataFrame(
                            {'activity_table_index': np.array(test_index),
                             'prob_active': np.array(test_predicted_probs)},
                            columns=['activity_table_index', 'prob_active'])
                        preds_df.sort_values('prob_active', ascending=False,
                                             inplace=True, axis=0)
                        next_inds=[]
                        top_to_select = int(len(activity_table)*0.04)
                        explore_select = int(len(activity_table)*0.01)
                        next_inds=next_inds+preds_df.head(top_to_select)['activity_table_index'].tolist()
                        firstPicksList = next_inds+(start_indexs.tolist())
                        start_ind_list[list_idx] = firstPicksList
                        diverse_size_list[list_idx] = explore_select
    
                def getRandomIterInds(firstPicksList,fplist,bottom_to_select):
                    full_list_index = np.arange(len(fplist))
                    unselected_inds = list(set(full_list_index) - set(firstPicksList))
                    random_selection = np.random.choice(unselected_inds,bottom_to_select,replace=False)
                    start_indexs = np.concatenate((firstPicksList,random_selection),axis=0)
                    return start_indexs
                def getNextIterInds(firstPicksList,fplist,bottom_to_select):
                    diverse_picks = mmp.LazyBitVectorPick(fplist,len(fplist),len(firstPicksList)+bottom_to_select,firstPicksList)
                    start_indexs = np.array(diverse_picks)
                    return start_indexs
                with parallel_backend('multiprocessing'):
                    if selection_type == 'Diverse':
                        start_ind_list = Parallel(n_jobs=5)(delayed(getNextIterInds)(firstPicksList=i, fplist=j,bottom_to_select=k) for i,j,k in zip(start_ind_list,fp_metalist,diverse_size_list))
                    elif selection_type == 'Random':
                        start_ind_list = Parallel(n_jobs=5)(delayed(getRandomIterInds)(firstPicksList=i, fplist=j,bottom_to_select=k) for i,j,k in zip(start_ind_list,fp_metalist,diverse_size_list))
    
                metrics_df = pd.DataFrame(metric_dict_list)
                metrics_df.to_pickle(multi_dump_path)
                iter_num +=1
    exp.end()
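
For reference, a standalone sketch of the diversity selection used above; the SMILES, fingerprint radius, and bit count are placeholders rather than values from the original run.

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

smiles = ['CCO', 'CCN', 'CCC', 'c1ccccc1', 'CC(=O)O', 'CCCl']  # placeholder molecules
fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, nBits=1024)
       for s in smiles]
picker = MaxMinPicker()
picks = list(picker.LazyBitVectorPick(fps, len(fps), 3))  # indexes of 3 diverse picks
print(picks)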
Example #6
def comet_Fold(save_path, embedding_type, model_type, bin_labels):
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="80_10_baseline",
                     workspace="gdreiman1",
                     log_code=True)
    import pickle
    import numpy as np
    import sklearn as sklearn
    from sklearn.metrics import precision_recall_fscore_support as prf
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    '''Comet Saving Zone'''
    def comet_addtional_info(exp, save_path, metrics_dict, X_test, y_test,
                             embedding_type, model_type):
        #get AID number
        import ntpath
        #get base file name
        folder, base = ntpath.split(save_path)
        #split the file name at the last '_'; assumes files are saved as AID_xxx_endinfo.pkl
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        #save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        #save some informative tags:
        tags = [AID, end_info, model_type]
        exp.add_tags(tags)
        exp.add_tag(embedding_type)
        #save metrics_dict in data_folder keyed to the comet experiment number
        exp_num = exp.get_key()
        model_save = folder + '/' + model_type + '_' + exp_num + 'metrics_dict.pkl'
        pickle_on = open(model_save, 'wb')
        pickle.dump(metrics_dict, pickle_on)
        pickle_on.close()
        #log trained model location
        exp.log_other('Metrics Dict Path', model_save)
        #tell comet that the experiment is over
        exp.end()

    def get_Scaled_Data(train_ind, test_ind, X_mfp, activity_table, labels,
                        bin_labels):
        #get start and end index for molchars
        MC_start = activity_table.columns.get_loc('Chi0')
        #need to add 1 bc exclusive indexing
        MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
        # standardize data
        scaler = StandardScaler(copy=False)
        #return requested datatype
        if embedding_type == 'MFPMolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind, MC_start:MC_end]))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind, MC_start:MC_end]))
            X_train = np.concatenate(
                (X_mfp[train_ind, :], X_train_molchars_std), axis=1)
            X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std),
                                    axis=1)
        elif embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        elif embedding_type == 'MolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind, MC_start:MC_end]))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind, MC_start:MC_end]))
            X_train = X_train_molchars_std
            X_test = X_test_molchars_std
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #remapping active to 1 and everything else to zero
        bin_y_train, bin_y_test = np.array([
            1 if x == 0 else 0 for x in y_train
        ]), np.array([1 if x == 0 else 0 for x in y_test])
        if bin_labels:
            y_test = bin_y_test
            y_train = bin_y_train
        return X_train, X_test, y_train, y_test

    def train_SVM(X_train, X_test, y_train, y_test, split_ID):
        sgd_linear_SVM = SGDClassifier(loss='hinge',
                                       penalty='l2',
                                       alpha=0.0001,
                                       l1_ratio=0.15,
                                       fit_intercept=True,
                                       max_iter=500000,
                                       tol=0.001,
                                       shuffle=True,
                                       verbose=0,
                                       epsilon=0.1,
                                       n_jobs=-1,
                                       random_state=None,
                                       learning_rate='optimal',
                                       eta0=0.0,
                                       power_t=0.5,
                                       early_stopping=False,
                                       validation_fraction=0.1,
                                       n_iter_no_change=5,
                                       class_weight='balanced',
                                       warm_start=False,
                                       average=False)
        sgd_linear_SVM_model = sgd_linear_SVM.fit(X_train, y_train)

        sgd_lSVM_preds = sgd_linear_SVM_model.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, sgd_lSVM_preds, average=None)
        class_rep = sklearn.metrics.classification_report(
            y_test, sgd_lSVM_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, sgd_lSVM_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(sgd_linear_SVM_model.get_params())
        return prec, rec, f_1, supp, mcc

    def train_RF(X_train, X_test, y_train, y_test, split_ID):

        rf = RandomForestClassifier(n_estimators=100,
                                    random_state=2562,
                                    class_weight="balanced_subsample",
                                    n_jobs=-1)
        rand_for = rf.fit(X_train, y_train)
        rf_preds = rand_for.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, rf_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, rf_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, rf_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(rand_for.get_params())
        return prec, rec, f_1, supp, mcc

    def train_LGBM(X_train, X_test, y_train, y_test, split_ID):
        import lightgbm as lgb
        #make model class
        lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                        num_leaves=31,
                                        max_depth=-1,
                                        learning_rate=0.1,
                                        n_estimators=500,
                                        subsample_for_bin=200000,
                                        objective='binary',
                                        is_unbalance=True,
                                        min_split_gain=0.0,
                                        min_child_weight=0.001,
                                        min_child_samples=20,
                                        subsample=1.0,
                                        subsample_freq=0,
                                        colsample_bytree=1.0,
                                        reg_alpha=0.0,
                                        reg_lambda=0.0,
                                        random_state=None,
                                        n_jobs=-1,
                                        silent=True,
                                        importance_type='split')
        #train model
        lgbm = lgbm_model.fit(X_train, y_train)
        lgbm_preds = lgbm.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, lgbm_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, lgbm_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, lgbm_preds)

        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(lgbm.get_params())
        return prec, rec, f_1, supp, mcc

    #get data cleaned
    pickle_off = open(save_path, 'rb')
    activity_table = pickle.load(pickle_off)
    pickle_off.close()
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])
    #reshape mfp
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data:
    from sklearn.model_selection import StratifiedShuffleSplit
    #outer splitter: five stratified 80/20 shuffle splits
    big_splitter = StratifiedShuffleSplit(n_splits=5,
                                          test_size=0.2,
                                          random_state=2562)
    #inner splitter: replicates training with 10% of the data (12.5% of the initial 80% split)
    little_splitter = StratifiedShuffleSplit(n_splits=8,
                                             test_size=0.2,
                                             train_size=0.125,
                                             random_state=2562)
    #this holds all the metrics values that will be stored in comet
    metric_dict = {}
    metric_names = [
        'prec_Inactive', 'prec_Active', 'rec_Inactive', 'rec_Active',
        'f_1_Inactive', 'f_1_Active', 'supp_Inactive', 'supp_Active', 'mcc'
    ]

    #determine model type
    classifier_dict = {'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM}
    classifier_train = classifier_dict[model_type]
    #using labels as a dummy for X
    for split_num, [train_ind,
                    test_ind] in enumerate(big_splitter.split(labels, labels)):
        #index records which split the data comes from, as big.little
        split_index = str(split_num)
        #get test/train index
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        #train model and get back classwise metrics
        prec, rec, f_1, supp, mcc = classifier_train(X_train, X_test, y_train,
                                                     y_test, split_index)
        #add split_index to metric names; this assumes 0 = inactive, 1 = active!
        metric_nameandsplit = [x + '_' + split_index for x in metric_names]
        metric_dict.update(
            zip(metric_nameandsplit,
                np.concatenate((prec, rec, f_1, supp)).tolist() + [mcc]))

        for little_split_num, [little_train_ind, little_test_ind] in enumerate(
                little_splitter.split(labels[train_ind], labels[train_ind])):
            split_index = str(split_num) + '.' + str(little_split_num)
            #get test/train index
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                little_train_ind, test_ind, X_mfp, activity_table, labels,
                bin_labels)
            #train model and get back classwise metrics
            #check if train_split contains both postive and negative labels
            if len(set(y_train)) == 2:
                prec, rec, f_1, supp, mcc = classifier_train(
                    X_train, X_test, y_train, y_test, split_index)
                metric_nameandsplit = [
                    x + '_' + split_index for x in metric_names
                ]
                metric_dict.update(
                    zip(metric_nameandsplit,
                        np.concatenate((prec, rec, f_1, supp)).tolist() +
                        [mcc]))
            else:
                metric_dict[
                    split_index] = 'Split Contained only 1 class, no training'

    exp.log_metrics(metric_dict)

    comet_addtional_info(exp, save_path, metric_dict, X_test, y_test,
                         embedding_type, model_type)
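
A hypothetical driver call, for illustration only; the pickle path is a placeholder following the AID_xxx_endinfo.pkl naming the function assumes.

comet_Fold('/data/AID_995_graph_processed.pkl',  # placeholder path
           embedding_type='MFPMolChars',
           model_type='RF',
           bin_labels=True)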