class CometSession(Session):
    """Session backed by a comet_ml Experiment that logs source files on creation."""

    # Underlying comet_ml experiment; all logging calls are forwarded to it.
    com_ex: Experiment

    def __init__(self, source_paths: Union[List[str], str], **kwargs) -> None:
        """Create the Experiment and log each existing source file.

        source_paths: a single path or a list of paths to source files to log.
        kwargs: forwarded verbatim to comet_ml.Experiment.
        """
        self.com_ex = Experiment(**kwargs)
        # A bare string would otherwise be iterated character by character.
        if isinstance(source_paths, str):
            source_paths = [source_paths]
        for path in source_paths:
            if isfile(path):
                # log_code's `code` parameter takes the source text, not an open
                # file handle; the context manager also guarantees the handle is
                # closed even if log_code raises.
                with open(path, mode="r") as fs:
                    self.com_ex.log_code(code_name=basename(path), code=fs.read())
            else:
                print(f"CometSession: Warning, No such file - {path}")

    def log_parameters(self, params: Dict[str, Any]) -> None:
        """Forward a parameter dict to the comet experiment."""
        self.com_ex.log_parameters(params)

    def log_metric(self, val_name: str, value: Any) -> None:
        """Forward a single named metric value to the comet experiment."""
        self.com_ex.log_metric(val_name, value)
# -*- coding: utf-8 -*- """ Created on Thu May 30 16:05:37 2019 @author: gdrei """ from comet_ml import Experiment experiment = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="general_test", workspace="gdreiman1") experiment.log_code = True from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import AllChem import pandas as pd import numpy as np from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray from sklearn import preprocessing from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC import sklearn from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator import matplotlib.pyplot as plt import pickle #%% def comet_SVM(save_path): #encode labels pickle_off = open(save_path,'rb') activity_table=pickle.load(pickle_off) pickle_off.close() #get length of MFP fp_length = len(activity_table.iloc[5]['MFP'])
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, customize):
    # Comet logging is best-effort: if comet_ml is unavailable or its handshake
    # fails, training proceeds with experiment = None and logging is skipped.
    try:
        from comet_ml import Experiment
        experiment = Experiment(
            project_name="particleflow-tf",
            auto_metric_logging=True,
            auto_param_logging=True,
            auto_histogram_weight_logging=True,
            auto_histogram_gradient_logging=False,
            auto_histogram_activation_logging=False,
        )
    except Exception as e:
        print("Failed to initialize comet-ml dashboard")
        experiment = None
    """Train a model defined by config"""
    # Remember the raw path: `config` is rebound to the parsed dict just below.
    config_file_path = config
    config, config_file_stem = parse_config(config, nepochs=nepochs, weights=weights)

    # CLI overrides applied on top of the parsed config.
    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        config = customization_functions[customize](config)

    # Fresh run -> create a new experiment dir; resuming from a weights file ->
    # reuse the directory the weights live in.
    if recreate or (weights is None):
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node())
    else:
        outdir = str(Path(weights).parent)

    # Decide tf.distribute.strategy depending on number of available GPUs
    strategy, num_gpus = get_strategy()
    #if "CPU" not in strategy.extended.worker_devices[0]:
    #    nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir)
    #    p = subprocess.Popen(shlex.split(nvidia_smi_call))

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"], config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"], config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_dataset"], config, num_gpus, "test",
        config["setup"]["num_events_validation"])
    ds_val = ds_val.batch(5)

    # Optional truncation of the train/test streams for quick debug runs.
    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if experiment:
        experiment.set_name(outdir)
        experiment.log_code("mlpf/tfmodel/model.py")
        experiment.log_code("mlpf/tfmodel/utils.py")
        experiment.log_code(config_file_path)

    shutil.copy(config_file_path, outdir + "/config.yaml")  # Copy the config file to the train dir for later reference

    # Variable creation (optimizer, model, compile) must happen under the
    # distribution strategy's scope.
    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(config, steps=total_steps)
        opt = get_optimizer(config, lr_schedule)

        if config["setup"]["dtype"] == "float16":
            model_dtype = tf.dtypes.float16
            policy = mixed_precision.Policy("mixed_float16")
            mixed_precision.set_global_policy(policy)
            # Loss scaling guards fp16 gradients against underflow.
            opt = mixed_precision.LossScaleOptimizer(opt)
        else:
            model_dtype = tf.dtypes.float32

        model = make_model(config, model_dtype)

        # Build the layers after the element and feature dimensions are specified
        model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

        initial_epoch = 0
        if weights:
            # We need to load the weights in the same trainable configuration as the model was set up
            configure_model_weights(model, config["setup"].get("weights_config", "all"))
            model.load_weights(weights, by_name=True)
            # Checkpoint files are named "weights-<epoch>-...", so field 1 of the
            # basename is the epoch to resume from.
            initial_epoch = int(weights.split("/")[-1].split("-")[1])
        model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

        # Re-apply the loss/trainable configuration and rebuild so the freshly
        # frozen/unfrozen weights are reflected in the compiled model.
        config = set_config_loss(config, config["setup"]["trainable"])
        configure_model_weights(model, config["setup"]["trainable"])
        model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

        print("model weights")
        tw_names = [m.name for m in model.trainable_weights]
        for w in model.weights:
            print("layer={} trainable={} shape={} num_weights={}".format(
                w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64),
                ] + [
                    SingleClassRecall(icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                    for icls in range(config["dataset"]["num_output_classes"])
                ]
            },
        )
        model.summary()

    callbacks = prepare_callbacks(config["callbacks"], outdir, ds_val, ds_info, comet_experiment=experiment)
    callbacks.append(optim_callbacks)

    # Datasets are repeated; epoch length is controlled by steps_per_epoch.
    fit_result = model.fit(
        ds_train.repeat(),
        validation_data=ds_test.repeat(),
        epochs=initial_epoch + config["setup"]["num_epochs"],
        callbacks=callbacks,
        steps_per_epoch=num_train_steps,
        validation_steps=num_test_steps,
        initial_epoch=initial_epoch,
    )

    history_path = Path(outdir) / "history"
    history_path = str(history_path)
    with open("{}/history.json".format(history_path), "w") as fi:
        json.dump(fit_result.history, fi)

    # Reload the best checkpoint found in outdir and export the full SavedModel.
    weights = get_best_checkpoint(outdir)
    print("Loading best weights that could be found from {}".format(weights))
    model.load_weights(weights, by_name=True)

    model.save(outdir + "/model_full", save_format="tf")
    print("Training done.")
def comet_Fold(save_path, embedding_type, model_type, bin_labels):
    """Run a 5x(80/20) + nested 8x(10%) cross-validation benchmark for one model
    type and embedding on the pickled activity table at save_path, logging all
    per-split metrics to comet.ml and returning them as a DataFrame."""
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="80_10_baseline", workspace="gdreiman1",
                     disabled=False)
    exp.log_code = True
    #turn off comet logging comments
    import os
    os.environ['COMET_LOGGING_FILE_LEVEL'] = 'WARNING'
    import warnings
    warnings.filterwarnings('ignore')
    import pickle
    import pandas as pd
    import numpy as np
    import sklearn as sklearn
    # BUGFIX: original line was "from sklearn.metrics import roc_curve," with a
    # dangling comma (SyntaxError).
    from sklearn.metrics import roc_curve
    from sklearn.metrics import precision_recall_fscore_support as prf
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from thundersvm import SVC
    import matplotlib.pyplot as plt
    import seaborn as sns
    import ntpath
    # BUGFIX: Path is used in comet_addtional_info but was never imported.
    from pathlib import Path
    from imblearn.over_sampling import RandomOverSampler
    # sampling_strategy=0.33 targets ~3:1 Inactive:Active (original comment
    # said 4:1 — the code is authoritative).
    ros = RandomOverSampler(sampling_strategy=0.33)
    from imblearn.under_sampling import RandomUnderSampler
    rus = RandomUnderSampler(sampling_strategy=0.33)
    import tensorflow as tf
    tf.logging.set_verbosity(tf.logging.ERROR)
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, GaussianNoise
    from tensorflow.keras.layers import Lambda
    from tensorflow.keras.utils import to_categorical
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    '''Comet Saving Zone'''

    def comet_addtional_info(exp, save_path, metrics_dict, X_test, y_test,
                             embedding_type, model_type):
        """Tag the experiment, pickle metrics next to the data, and end the run."""
        #get base file name
        folder, base = ntpath.split(save_path)
        #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        #save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        #save some informatvie tags:
        tags = [AID, end_info, model_type]
        exp.add_tags(tags)
        exp.add_tag(embedding_type)
        #save metrics_dict in data_folder with comet experiement number associated
        exp_num = exp.get_key()
        model_save = Path(folder + '/' + model_type + '_' + embedding_type + '_'
                          + exp_num + 'metrics_dict.pkl')
        with open(model_save, 'wb') as pickle_on:
            pickle.dump(metrics_dict, pickle_on)
        #log trained model location
        exp.log_other('Metrics Dict Path', str(model_save))
        #tell comet that the experiement is over
        exp.end()

    def get_Scaled_Data(train_ind, test_ind, X_mfp, activity_table, labels,
                        bin_labels):
        """Slice/scale features for one split; embedding_type is read from the
        enclosing comet_Fold scope."""
        #get start and end index for molchars
        MC_start = activity_table.columns.get_loc('Chi0')
        #need to add 1 bc exclusive indexing
        MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
        # standardize data: fit scaler on train only, then apply to test
        scaler = StandardScaler(copy=False)
        #return requested datatype
        if embedding_type == 'MFPMolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind, MC_start:MC_end]).astype(float))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind, MC_start:MC_end]).astype(float))
            X_train = np.concatenate((X_mfp[train_ind, :], X_train_molchars_std), axis=1)
            X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std), axis=1)
        elif embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        elif embedding_type == 'MolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind, MC_start:MC_end]).astype(float))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind, MC_start:MC_end]).astype(float))
            X_train = X_train_molchars_std
            X_test = X_test_molchars_std
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #remapping active to 1 and everything else to zero
        bin_y_train, bin_y_test = np.array([1 if x == 0 else 0 for x in y_train]), \
            np.array([1 if x == 0 else 0 for x in y_test])
        if bin_labels == True:
            y_test = bin_y_test
            y_train = bin_y_train
        return X_train, X_test, y_train, y_test

    def train_SVM(X_train, X_test, y_train, y_test, split_ID):
        """Linear SVM via SGD; returns per-class (prec, rec, f1, support) and MCC."""
        sgd_linear_SVM = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                                       l1_ratio=0.15, fit_intercept=True,
                                       max_iter=500000, tol=0.001, shuffle=True,
                                       verbose=0, epsilon=0.1, n_jobs=-1,
                                       random_state=None, learning_rate='optimal',
                                       eta0=0.0, power_t=0.5, early_stopping=False,
                                       validation_fraction=0.1, n_iter_no_change=5,
                                       class_weight='balanced', warm_start=False,
                                       average=False)
        sgd_linear_SVM_model = sgd_linear_SVM.fit(X_train, y_train)
        sgd_lSVM_preds = sgd_linear_SVM_model.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, sgd_lSVM_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, sgd_lSVM_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, sgd_lSVM_preds)
        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(sgd_linear_SVM_model.get_params())
        return prec, rec, f_1, supp, mcc

    def train_kSVM(X_train, X_test, y_train, y_test, split_ID):
        """RBF-kernel SVM (thundersvm); same return contract as train_SVM."""
        kSVM = SVC(kernel='rbf', degree=3, gamma='auto', coef0=0.0, C=1.0,
                   tol=0.001, probability=False, class_weight='balanced',
                   shrinking=False, cache_size=None, verbose=False, max_iter=-1,
                   n_jobs=-1, max_mem_size=-1, random_state=None,
                   decision_function_shape='ovo')
        kSVM_model = kSVM.fit(X_train, y_train)
        kSVM_preds = kSVM_model.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, kSVM_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, kSVM_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, kSVM_preds)
        #if first iteration, report model parameters to comet
        if split_ID == '0':
            # BUGFIX: was kSVM_preds.get_params() — predictions are an ndarray
            # and have no get_params; log the fitted model's parameters.
            exp.log_parameters(kSVM_model.get_params())
        return prec, rec, f_1, supp, mcc

    def train_RF(X_train, X_test, y_train, y_test, split_ID):
        """Random forest baseline; same return contract as train_SVM."""
        rf = RandomForestClassifier(n_estimators=100, random_state=2562,
                                    class_weight="balanced_subsample", n_jobs=-1)
        rand_for = rf.fit(X_train, y_train)
        rf_preds = rand_for.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, rf_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, rf_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, rf_preds)
        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(rand_for.get_params())
        return prec, rec, f_1, supp, mcc

    def train_LGBM(X_train, X_test, y_train, y_test, split_ID):
        """LightGBM baseline; same return contract as train_SVM."""
        import lightgbm as lgb
        #make model class
        lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31,
                                        max_depth=-1, learning_rate=0.1,
                                        n_estimators=500, subsample_for_bin=200000,
                                        objective='binary', is_unbalance=True,
                                        min_split_gain=0.0, min_child_weight=0.001,
                                        min_child_samples=20, subsample=1.0,
                                        subsample_freq=0, colsample_bytree=1.0,
                                        reg_alpha=0.0, reg_lambda=0.0,
                                        random_state=None, n_jobs=-1, silent=True,
                                        importance_type='split')
        #train model
        lgbm = lgbm_model.fit(X_train, y_train)
        lgbm_preds = lgbm.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, lgbm_preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, lgbm_preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, lgbm_preds)
        #if first iteration, report model parameters to comet
        if split_ID == '0':
            exp.log_parameters(lgbm.get_params())
        return prec, rec, f_1, supp, mcc

    def train_DNN(X_train, X_test, y_train, y_test, split_ID):
        """Small feed-forward net (TF1-era Keras); same return contract as train_SVM."""
        # Bias the output layer toward the rare class (focal-loss-style prior).
        fl_pi = 0.01
        final_bias = -np.log((1 - fl_pi) / fl_pi)
        num_labels = len(set(y_test))
        from sklearn.utils import class_weight
        class_weights = class_weight.compute_class_weight('balanced',
                                                          np.unique(y_train),
                                                          y_train)
        tf.keras.backend.clear_session()
        tf.logging.set_verbosity(tf.logging.ERROR)
        fast_NN = Sequential(name='quick')
        fast_NN.add(Dense(512, activation='sigmoid', name='input'))
        fast_NN.add(Dense(128, activation='relu', name='first',
                          bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        fast_NN.add(Dense(64, activation='relu', name='second',
                          bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        fast_NN.add(Dense(16, activation='relu', name='third',
                          bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        fast_NN.add(Dense(num_labels, activation='softmax', name='predict',
                          bias_initializer=tf.keras.initializers.Constant(value=final_bias)))
        fast_NN.compile(loss='categorical_crossentropy', optimizer='adam',
                        metrics=['categorical_accuracy'])
        fast_NN_model = fast_NN.fit(X_train, to_categorical(y_train),
                                    validation_data=(X_test, to_categorical(y_test)),
                                    epochs=5, batch_size=500,
                                    class_weight=class_weights, shuffle=True,
                                    verbose=0)
        NN_test_preds = fast_NN.predict(X_test)
        prec, rec, f_1, supp = prf(y_test, np.argmax(NN_test_preds, axis=1),
                                   average=None)
        class_rep = sklearn.metrics.classification_report(
            y_test, np.argmax(NN_test_preds, axis=1))
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test,
                                                np.argmax(NN_test_preds, axis=1))
        tf.reset_default_graph()
        #if first iteration, report model parameters to comet
        if split_ID == '0':
            # BUGFIX: was exp.log_parameters(lgbm.get_params()) — `lgbm` does not
            # exist in this function (NameError); log the network config instead.
            exp.log_parameters(fast_NN.get_config())
        return prec, rec, f_1, supp, mcc

    #from https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
    def flatten(d, parent_key='', sep='_'):
        """Flatten nested dicts into one level, joining keys with `sep`."""
        import collections
        items = []
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            # BUGFIX: collections.MutableMapping was removed in Python 3.10;
            # the ABC lives in collections.abc.
            if isinstance(v, collections.abc.MutableMapping):
                items.extend(flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def calc_and_save_metrics(X_train, X_test, y_train, y_test, split_index,
                              model_type, embedding_type, AID, metric_names,
                              metric_dict_list, split_info, split_num,
                              little_split_num):
        '''Takes in test and train data + labels, computes metrics and saves
        them as a dict inside of the provided list. Returns this list.'''
        # NOTE(review): split_info (not split_index) is passed as the trainers'
        # split_ID, so their `split_ID == '0'` parameter-logging branch never
        # fires but the classification-report key stays descriptive — kept as-is.
        prec, rec, f_1, supp, mcc = classifier_train(X_train, X_test, y_train,
                                                     y_test, split_info)
        results_array = np.concatenate((prec, rec, f_1, supp)).tolist() + [mcc]
        if little_split_num == 'NaN':
            split_size = '80%'
        else:
            split_size = '10%'
        metric_dict_list.append(
            dict(zip(metric_names,
                     [model_type, embedding_type, AID, split_num,
                      little_split_num, split_size, split_index, split_info]
                     + results_array)))
        return metric_dict_list

    '''Begin the actual experiment'''
    #get data cleaned
    with open(save_path, 'rb') as pickle_off:
        activity_table = pickle.load(pickle_off)
    #get AID
    folder, base = ntpath.split(save_path)
    #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
    AID, _, end_info = base.rpartition('_')
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])
    #reshape mfp
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data:
    from sklearn.model_selection import StratifiedShuffleSplit
    #this is outer 5fold cross validation i.e. 80/20 split
    big_splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2,
                                          random_state=2562)
    #inner split replicating a start with 10% of data (12.5% of the 80% split)
    little_splitter = StratifiedShuffleSplit(n_splits=8, test_size=0.2,
                                             train_size=0.125, random_state=2562)
    #this holds all the metrics values that will be stored in comet
    metric_names = [
        'Classifier', 'Embedding', 'AID', '80% Split Number', '10% Split Number',
        'Train Split Size', 'ID', 'Split Info', 'prec_Inactive', 'prec_Active',
        'rec_Inactive', 'rec_Active', 'f_1_Inactive', 'f_1_Active',
        'supp_Inactive', 'supp_Active', 'mcc'
    ]
    #determine model type
    classifier_dict = {
        'SVM': train_SVM,
        'RF': train_RF,
        'LGBM': train_LGBM,
        'DNN': train_DNN,
        'kSVM': train_kSVM
    }
    #set dummy variable to func that trains specified model
    classifier_train = classifier_dict[model_type]
    metric_dict_list = []
    #using labels as a dummy for X
    for split_num, [train_ind, test_ind] in enumerate(big_splitter.split(labels, labels)):
        #indexs which split the data comes from X.X ie big.little
        split_index = str(split_num)
        little_split_num = 'NaN'

        '''Regular Sample'''
        split_info = 'Split' + split_index + ' 80% train' + 'BaseRatio'
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        # BUGFIX: the base-ratio run previously trained on ros-oversampled data,
        # making it identical to the OverSample run; train on the raw split.
        metric_dict_list = calc_and_save_metrics(
            X_train, X_test, y_train, y_test, split_index, model_type,
            embedding_type, AID, metric_names, metric_dict_list, split_info,
            split_num, little_split_num)

        '''Over Sample'''
        split_info = 'Split' + split_index + ' 80% train' + 'OverSample'
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        over_X_train, over_y_train = ros.fit_resample(X_train, y_train)
        metric_dict_list = calc_and_save_metrics(
            over_X_train, X_test, over_y_train, y_test, split_index, model_type,
            embedding_type, AID, metric_names, metric_dict_list, split_info,
            split_num, little_split_num)

        '''Under Sample'''
        split_info = 'Split' + split_index + ' 80% train' + 'UnderSample'
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        under_X_train, under_y_train = rus.fit_resample(X_train, y_train)
        metric_dict_list = calc_and_save_metrics(
            under_X_train, X_test, under_y_train, y_test, split_index,
            model_type, embedding_type, AID, metric_names, metric_dict_list,
            split_info, split_num, little_split_num)

        for little_split_num, [little_train_ind, little_test_ind] in enumerate(
                little_splitter.split(labels[train_ind], labels[train_ind])):
            split_index = str(split_num) + '.' + str(little_split_num)
            # BUGFIX: little_splitter yields positions into the 80% subset, so
            # they must be mapped back to full-table row indices before use
            # (previously the raw positions — or even the whole 80% train_ind —
            # were applied to the full table).
            small_train_ind = train_ind[little_train_ind]

            '''Regular Sample'''
            split_info = 'Split' + split_index + ' 10% train' + 'BaseRatio'
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                small_train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
            #check if train_split contains both postive and negative labels
            if len(set(y_train)) == 2:
                metric_dict_list = calc_and_save_metrics(
                    X_train, X_test, y_train, y_test, split_index, model_type,
                    embedding_type, AID, metric_names, metric_dict_list,
                    split_info, split_num, little_split_num)
            else:
                print('Skipped ' + split_info)

            '''Over Sample'''
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                small_train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
            over_X_train, over_y_train = ros.fit_resample(X_train, y_train)
            split_info = 'Split' + str(split_num) + ' 10% train' + 'OverSample'
            #check if train_split contains both postive and negative labels
            if len(set(y_train)) == 2:
                metric_dict_list = calc_and_save_metrics(
                    over_X_train, X_test, over_y_train, y_test, split_index,
                    model_type, embedding_type, AID, metric_names,
                    metric_dict_list, split_info, split_num, little_split_num)
            else:
                print('Skipped ' + split_info)

            '''UnderSample'''
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                small_train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
            under_X_train, under_y_train = rus.fit_resample(X_train, y_train)
            split_info = 'Split' + str(split_num) + ' 10% train' + 'UnderSample'
            #check if train_split contains both postive and negative labels
            if len(set(y_train)) == 2:
                metric_dict_list = calc_and_save_metrics(
                    under_X_train, X_test, under_y_train, y_test, split_index,
                    model_type, embedding_type, AID, metric_names,
                    metric_dict_list, split_info, split_num, little_split_num)
            else:
                print('Skipped ' + split_info)

    # now convert metric_dict_list to df:
    metrics_df = pd.DataFrame(metric_dict_list)
    # (A large commented-out seaborn plotting / aggregation section was removed
    # here; see version control history if it is ever needed again.)
    comet_addtional_info(exp, save_path, metrics_df, X_test, y_test,
                         embedding_type, model_type)
    return metrics_df
def main(repitition_number):
    """Iterative screening experiment: start from a diverse 10% library sample,
    then repeatedly train models and select the next 5% batch (80% exploit /
    20% explore for 5 iterations, then random), logging metrics to comet.ml."""
    '''import'''
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="iter_baseline", workspace="gdreiman1",
                     disabled=False)
    exp.log_code = True
    exp.log_other('Hypothesis', '''Taking 5% batches, 80% of batch is the top ranked compounds, remaining 20% is diverse selection for first 5 iterations, then reverts to random sampling''')
    import pickle, sys, os
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
    import numpy as np
    from Iterative_help_funcs import get_Scaled_Data, train_SVM, train_DNN, \
        train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN
    from imblearn.over_sampling import RandomOverSampler
    #choosing a 3:1 Inactive to Active ratio
    ros = RandomOverSampler(sampling_strategy=0.33)
    import pandas as pd
    from joblib import Parallel, delayed
    from joblib.externals.loky import set_loky_pickler
    from joblib import parallel_backend
    import tensorflow as tf
    tf.logging.set_verbosity(tf.logging.ERROR)

    '''Load data'''
    AID_list = ['AID_1345083', 'AID_624255', 'AID_449739', 'AID_995', 'AID_938',
                'AID_628', 'AID_596', 'AID_893', 'AID_894']
    #This will hold a series of dicts of metrics that we then build into a dataframe
    metric_dict_list = []
    multi_dump_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/',
                                   'ranked_diverse_run' + str(repitition_number) + '.pkl')
    exp.log_other('Metrics Dict Path', multi_dump_path)
    #using longest most imformantive embeddings
    classifier_dict = {'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM,
                       'DNN': train_DNN, 'GCNN_pytorch': train_PyTorchGCNN}
    model_list = ['GCNN_pytorch', 'SVM', 'RF', 'LGBM', 'DNN']
    model_list = ['RF']  # currently restricted to RF only
    num_models = len(model_list)
    mmp = MaxMinPicker()
    #define how we select after inital training run
    selection_type = 'Diverse'
    #define size of iter after first 10% train relative to that trainsize
    iterRel2Start = 0.5
    end_iter = 10

    for AID in AID_list:
        if 'win' in sys.platform:
            AID_path = os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID)
        else:
            AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', AID)
        save_path = AID_path + '/' + AID + 'graph_processed.pkl'
        with open(save_path, 'rb') as pickle_off:
            activity_table = pickle.load(pickle_off)

        '''Pick diverse starting point of 10% of library'''
        fplist = [x for x in activity_table['bit_MFP']]
        '''start_indexs holds the indexes of molecules already scanned at the
        start of each iteration. So for the first iter it hold the diversity
        selection. For the second, it holds both the diversity selection and the
        molecules screened based on the results of the first training iteration
        etc'''
        # BUGFIX: was declared as start_index_meta_list = [] but assigned into
        # with dict syntax below (NameError); it is a dict keyed by sample name.
        start_index_meta_dict = {}
        # BUGFIX: loop bound was the undefined name `rep_number`.
        for i in range(repitition_number):
            start_indexs = np.array(
                mmp.LazyBitVectorPick(fplist, len(fplist), int(len(fplist) / 10)))
            '''store in a list that will vary as each model makes its predictions'''
            start_ind_list = [start_indexs for _ in range(num_models)]
            index_name = "starting_sample_" + str(i)
            start_index_meta_dict[index_name] = start_ind_list
        # BUGFIX: was range(num_models*) — a SyntaxError.
        diverse_size_list = [0 for i in range(num_models)]
        fp_metalist = [fplist for i in range(num_models)]
        library_size = len(fplist)
        iter_num = 0
        while iter_num < end_iter:
            print("Beginning Iteration ", iter_num)
            # Explore diversely for the first 5 iterations, then randomly.
            if iter_num < 5:
                selection_type = 'Diverse'
            else:
                selection_type = 'Random'
            '''run thru models and get their preds for this iter'''
            for list_idx, [model_type, start_indexs] in enumerate(
                    zip(model_list, start_ind_list)):
                '''Get data for the starting molecules, it will be graphs for
                GCNN, else the MFP_MolChars'''
                test_index = list(set(activity_table.index) - set(start_indexs))
                #check that we haven't exceeded 50% of library
                if len(test_index) > int(0.5 * library_size):
                    if model_type == 'GCNN_pytorch':
                        embedding_type = 'Graph'
                        X_train, X_test, y_train, y_test = get_Scaled_Data(
                            start_indexs, test_index, activity_table, True,
                            embedding_type)
                    else:
                        embedding_type = 'MFPMolChars'
                        X_train, X_test, y_train, y_test = get_Scaled_Data(
                            start_indexs, test_index, activity_table, True,
                            embedding_type)
                    #oversample to ~3:1 Inactive to Active if too few actives.
                    # BUGFIX: the guard was (len(y_train)/sum(y_train)) < 0.25,
                    # which is never true (len >= sum), so oversampling never
                    # ran; test the active fraction instead.
                    if (sum(y_train) / len(y_train)) < 0.25:
                        if model_type == 'GCNN_pytorch':
                            # ros cannot resample a list of graphs directly, so
                            # resample positional indices and re-gather.
                            over_X_train, over_y_train = ros.fit_resample(
                                np.arange(len(X_train)).reshape((-1, 1)), y_train)
                            over_X_train = over_X_train.reshape(-1)
                            over_X_train = [X_train[i] for i in over_X_train]
                        else:
                            over_X_train, over_y_train = ros.fit_resample(X_train, y_train)
                    else:
                        #just use current enriched sample
                        over_X_train, over_y_train = X_train, y_train

                    '''Inital train run'''
                    train_and_predict_model = classifier_dict[model_type]
                    #DNN-style trainers additionally return the training history
                    if model_type == 'DNN' or model_type == 'GCNN_pytorch':
                        train_predicted_probs, test_predicted_probs, \
                            base_test_predicted_probs, hist = train_and_predict_model(
                                over_X_train, X_test, over_y_train, y_test, X_train)
                    else:
                        train_predicted_probs, test_predicted_probs, \
                            base_test_predicted_probs = train_and_predict_model(
                                over_X_train, X_test, over_y_train, y_test, X_train)
                        hist = None
                    metric_dict_list = calc_and_save_metrics(
                        y_test, test_predicted_probs, model_type, embedding_type,
                        AID, metric_dict_list, iter_num, 'test', hist)
                    metric_dict_list = calc_and_save_metrics(
                        over_y_train, train_predicted_probs, model_type,
                        embedding_type, AID, metric_dict_list, iter_num, 'train')
                    metric_dict_list = calc_and_save_metrics(
                        y_train, base_test_predicted_probs, model_type,
                        embedding_type, AID, metric_dict_list, iter_num, 'base_train')

                    '''Now select next 5% section'''
                    '''Put labels and preds in df, sort them. take top 80% of
                    tier size of the top predictions then do a diverse selection
                    or random selection for remaining 20% more'''
                    preds_df = pd.DataFrame(
                        {'activity_table_index': np.array(test_index),
                         'prob_active': np.array(test_predicted_probs)},
                        columns=np.array(['activity_table_index', 'prob_active']))
                    preds_df.sort_values('prob_active', ascending=False,
                                         inplace=True, axis=0)
                    next_inds = []
                    # 4% exploit + 1% explore = 5% of the library per iteration.
                    top_to_select = int(len(activity_table) * 0.04)
                    explore_select = int(len(activity_table) * 0.01)
                    next_inds = next_inds + preds_df.head(top_to_select)['activity_table_index'].tolist()
                    firstPicksList = next_inds + (start_indexs.tolist())
                    start_ind_list[list_idx] = firstPicksList
                    diverse_size_list[list_idx] = explore_select

            def getRandomIterInds(firstPicksList, fplist, bottom_to_select):
                """Random exploration: pick bottom_to_select unscreened molecules."""
                full_list_index = np.arange(len(fplist))
                unselected_inds = list(set(full_list_index) - set(firstPicksList))
                random_selection = np.random.choice(unselected_inds,
                                                    bottom_to_select,
                                                    replace=False)
                start_indexs = np.concatenate((firstPicksList, random_selection),
                                              axis=0)
                return start_indexs

            def getNextIterInds(firstPicksList, fplist, bottom_to_select):
                """Diverse exploration via MaxMin picking, seeded with firstPicksList."""
                diverse_picks = mmp.LazyBitVectorPick(
                    fplist, len(fplist), len(firstPicksList) + bottom_to_select,
                    firstPicksList)
                start_indexs = np.array(diverse_picks)
                return start_indexs

            with parallel_backend('multiprocessing'):
                if selection_type == 'Diverse':
                    start_ind_list = Parallel(n_jobs=5)(
                        delayed(getNextIterInds)(firstPicksList=i, fplist=j,
                                                 bottom_to_select=k)
                        for i, j, k in zip(start_ind_list, fp_metalist,
                                           diverse_size_list))
                elif selection_type == 'Random':
                    start_ind_list = Parallel(n_jobs=5)(
                        delayed(getRandomIterInds)(firstPicksList=i, fplist=j,
                                                   bottom_to_select=k)
                        for i, j, k in zip(start_ind_list, fp_metalist,
                                           diverse_size_list))
            # Persist metrics after every iteration so partial runs are kept.
            metrics_df = pd.DataFrame(metric_dict_list)
            metrics_df.to_pickle(multi_dump_path)
            iter_num += 1
    exp.end()
def comet_Fold(save_path, embedding_type, model_type, bin_labels):
    """Run a 5x8 nested cross-validation experiment and log results to comet.ml.

    For each of 5 outer stratified 80/20 splits, a classifier chosen by
    ``model_type`` is trained on the full 80% train set and on 8 inner
    10%-of-total sub-samples of that train set, always evaluating on the
    outer 20% test set. Per-class precision/recall/F1/support plus MCC are
    logged to comet and pickled next to the data file.

    Parameters
    ----------
    save_path : str
        Path to a pickled DataFrame named like ``AID_xxx_endinfo.pkl`` with
        an 'MFP' fingerprint column, molecular-descriptor columns spanning
        'Chi0'..'VSA_EState9', and a 'PUBCHEM_ACTIVITY_OUTCOME' label column.
    embedding_type : str
        One of 'MFP', 'MolChars', 'MFPMolChars' — which features to use.
    model_type : str
        One of 'SVM', 'RF', 'LGBM' — which classifier to train.
    bin_labels : bool
        If True, remap labels to binary active(1) / everything-else(0).
    """
    from comet_ml import Experiment
    import pickle
    import numpy as np
    import sklearn as sklearn
    from sklearn.metrics import precision_recall_fscore_support as prf
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.model_selection import StratifiedShuffleSplit

    # SECURITY NOTE(review): hard-coded API key committed to source — should be
    # moved to an environment variable or comet config file and rotated.
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="80_10_baseline",
                     workspace="gdreiman1")
    # NOTE: this assigns over Experiment.log_code (a method) rather than
    # calling anything; kept as-is to preserve existing behavior.
    exp.log_code = True

    def comet_addtional_info(exp, save_path, metrics_dict, X_test, y_test,
                             embedding_type, model_type):
        """Tag the experiment, pickle the metrics dict, and end the run."""
        import ntpath
        # get base file name; assumes file saved as AID_xxx_endinfo.pkl
        folder, base = ntpath.split(save_path)
        # rpartition splits at the LAST underscore, so AID keeps 'AID_xxx'
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        # save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        exp.add_tags([AID, end_info, model_type])
        exp.add_tag(embedding_type)
        # save metrics_dict in the data folder, keyed by the comet experiment id
        exp_num = exp.get_key()
        model_save = folder + '/' + model_type + '_' + exp_num + 'metrics_dict.pkl'
        with open(model_save, 'wb') as pickle_on:
            pickle.dump(metrics_dict, pickle_on)
        # log metrics-dict location, then tell comet the experiment is over
        exp.log_other('Metrics Dict Path', model_save)
        exp.end()

    def get_Scaled_Data(train_ind, test_ind, X_mfp, activity_table, labels,
                        bin_labels):
        """Build (X_train, X_test, y_train, y_test) for the chosen embedding.

        Molecular-descriptor columns are standardized with statistics fitted
        on the training rows only, avoiding test-set leakage.
        """
        # column span of the molecular-characteristic descriptors
        MC_start = activity_table.columns.get_loc('Chi0')
        # need to add 1 because iloc slicing is exclusive on the right
        MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
        scaler = StandardScaler(copy=False)
        if embedding_type == 'MFPMolChars':
            X_train_molchars_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind, MC_start:MC_end]))
            X_test_molchars_std = scaler.transform(
                np.array(activity_table.iloc[test_ind, MC_start:MC_end]))
            X_train = np.concatenate(
                (X_mfp[train_ind, :], X_train_molchars_std), axis=1)
            X_test = np.concatenate(
                (X_mfp[test_ind, :], X_test_molchars_std), axis=1)
        elif embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        elif embedding_type == 'MolChars':
            X_train = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind, MC_start:MC_end]))
            X_test = scaler.transform(
                np.array(activity_table.iloc[test_ind, MC_start:MC_end]))
        else:
            # previously an unknown value fell through to a NameError below;
            # fail fast with an explicit message instead
            raise ValueError('Unknown embedding_type: %r' % (embedding_type,))
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        if bin_labels:
            # remap encoded class 0 ('Active') to 1 and everything else to 0
            y_train = np.array([1 if x == 0 else 0 for x in y_train])
            y_test = np.array([1 if x == 0 else 0 for x in y_test])
        return X_train, X_test, y_train, y_test

    def _log_fold_results(model, preds, y_test, split_ID):
        """Compute classwise metrics, log report to comet, return metrics.

        Shared by all three trainers; returns (prec, rec, f_1, supp, mcc).
        """
        prec, rec, f_1, supp = prf(y_test, preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, preds)
        # on the first outer fold, also record the model hyper-parameters
        if split_ID == '0':
            exp.log_parameters(model.get_params())
        return prec, rec, f_1, supp, mcc

    def train_SVM(X_train, X_test, y_train, y_test, split_ID):
        """Fit a linear SVM via SGD; return (prec, rec, f_1, supp, mcc)."""
        sgd_linear_SVM = SGDClassifier(
            loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
            fit_intercept=True, max_iter=500000, tol=0.001, shuffle=True,
            verbose=0, epsilon=0.1, n_jobs=-1, random_state=None,
            learning_rate='optimal', eta0=0.0, power_t=0.5,
            early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,
            class_weight='balanced', warm_start=False, average=False)
        model = sgd_linear_SVM.fit(X_train, y_train)
        return _log_fold_results(model, model.predict(X_test), y_test, split_ID)

    def train_RF(X_train, X_test, y_train, y_test, split_ID):
        """Fit a balanced random forest; return (prec, rec, f_1, supp, mcc)."""
        rf = RandomForestClassifier(n_estimators=100, random_state=2562,
                                    class_weight="balanced_subsample",
                                    n_jobs=-1)
        model = rf.fit(X_train, y_train)
        return _log_fold_results(model, model.predict(X_test), y_test, split_ID)

    def train_LGBM(X_train, X_test, y_train, y_test, split_ID):
        """Fit a LightGBM classifier; return (prec, rec, f_1, supp, mcc)."""
        import lightgbm as lgb
        lgbm_model = lgb.LGBMClassifier(
            boosting_type='gbdt', num_leaves=31, max_depth=-1,
            learning_rate=0.1, n_estimators=500, subsample_for_bin=200000,
            objective='binary', is_unbalance=True, min_split_gain=0.0,
            min_child_weight=0.001, min_child_samples=20, subsample=1.0,
            subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0,
            reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True,
            importance_type='split')
        model = lgbm_model.fit(X_train, y_train)
        return _log_fold_results(model, model.predict(X_test), y_test, split_ID)

    # load the cleaned activity table
    with open(save_path, 'rb') as pickle_off:
        activity_table = pickle.load(pickle_off)
    # unpack the Morgan fingerprints into an (n_mols, fp_length) matrix
    fp_length = len(activity_table.iloc[5]['MFP'])
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])

    # outer 5-fold cross validation, i.e. 80/20 split
    big_splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2,
                                          random_state=2562)
    # inner splits replicate starting with 10% of all data (12.5% of the 80%)
    little_splitter = StratifiedShuffleSplit(n_splits=8, test_size=0.2,
                                             train_size=0.125,
                                             random_state=2562)

    # holds every metric value that will be sent to comet at the end
    metric_dict = {}
    # classwise metric order assumes class 0 = Inactive, class 1 = Active!!
    metric_names = ['prec_Inactive', 'prec_Active', 'rec_Inactive',
                    'rec_Active', 'f_1_Inactive', 'f_1_Active',
                    'supp_Inactive', 'supp_Active', 'mcc']
    # dispatch on requested model type
    classifier_dict = {'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM}
    classifier_train = classifier_dict[model_type]

    def _record(split_index, prec, rec, f_1, supp, mcc):
        """Store one fold's metrics under split-suffixed keys."""
        keys = [name + '_' + split_index for name in metric_names]
        # BUGFIX: the old code did `np.concatenate(...) + [mcc]`, which
        # broadcast-ADDED mcc to all eight metric values (and zip then
        # truncated, so mcc itself was never logged). List-concatenate instead.
        values = list(np.concatenate((prec, rec, f_1, supp))) + [mcc]
        metric_dict.update(zip(keys, values))

    # `labels` doubles as a dummy X: the splitter only needs y to stratify
    for split_num, (train_ind, test_ind) in enumerate(
            big_splitter.split(labels, labels)):
        # index identifies which split the data comes from, big.little
        split_index = str(split_num)
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            train_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        _record(split_index,
                *classifier_train(X_train, X_test, y_train, y_test,
                                  split_index))
        for little_split_num, (little_train_ind, _) in enumerate(
                little_splitter.split(labels[train_ind], labels[train_ind])):
            split_index = str(split_num) + '.' + str(little_split_num)
            # BUGFIX: little_train_ind indexes into train_ind (positions
            # within the outer train subset), not into the full table; the
            # old code used it directly as absolute row indices, selecting
            # the wrong rows and potentially overlapping the test set.
            abs_little_train = train_ind[little_train_ind]
            X_train, X_test, y_train, y_test = get_Scaled_Data(
                abs_little_train, test_ind, X_mfp, activity_table, labels,
                bin_labels)
            # a small inner split may contain only one class — skip training
            if len(set(y_train)) == 2:
                _record(split_index,
                        *classifier_train(X_train, X_test, y_train, y_test,
                                          split_index))
            else:
                metric_dict[split_index] = \
                    'Split Contained only 1 class, no training'

    exp.log_metrics(metric_dict)
    comet_addtional_info(exp, save_path, metric_dict, X_test, y_test,
                         embedding_type, model_type)