def fit_validate(exp_params, k, data_path, write_path, others=None, custom_tag=''): """Fit model and compute metrics on train and validation set. Intended for hyperparameter search. Only logs final metrics and scatter plot of final embedding. Args: exp_params(dict): Parameter dict. Should at least have keys model_name, dataset_name & random_state. Other keys are assumed to be model parameters. k(int): Fold identifier. data_path(str): Data directory. write_path(str): Where to write temp files. others(dict): Other things to log to Comet experiment. custom_tag(str): Custom tag for comet experiment. """ # Comet experiment exp = Experiment(parse_args=False) exp.disable_mp() custom_tag += '_validate' exp.add_tag(custom_tag) exp.log_parameters(exp_params) if others is not None: exp.log_others(others) # Parse experiment parameters model_name, dataset_name, random_state, model_params = parse_params(exp_params) # Fetch and split dataset. data_train = getattr(grae.data, dataset_name)(split='train', random_state=random_state, data_path=data_path) data_train, data_val = data_train.validation_split(random_state=FOLD_SEEDS[k]) # Model m = getattr(grae.models, model_name)(random_state=FOLD_SEEDS[k], **model_params) m.write_path = write_path m.data_val = data_val with exp.train(): m.fit(data_train) # Log plot m.comet_exp = exp m.plot(data_train, data_val, title=f'{model_name} : {dataset_name}') # Probe embedding prober = EmbeddingProber() prober.fit(model=m, dataset=data_train, mse_only=True) train_z, train_metrics = prober.score(data_train, is_train=True) # Log train metrics exp.log_metrics(train_metrics) with exp.validate(): val_z, val_metrics = prober.score(data_val) # Log train metrics exp.log_metrics(val_metrics) # Log marker to mark successful experiment exp.log_other('success', 1)
def __init__(self, log_dir, project_name, commit_id, comment=None, disabled=True):
    """Set up a comet.ml experiment and a local model-backup directory.

    Args:
        log_dir: Base directory under which a per-experiment backup dir is created.
        project_name: Comet project to log to.
        commit_id: Git commit hash, logged as a parameter for reproducibility.
        comment: Optional free-text note attached to the experiment.
        disabled: If True, Comet logging is disabled and no backup dir is made.
    """
    # setup comet-ml: API key is read from the user's home directory.
    # Use a context manager so the file handle is closed (the original
    # open(...).read() leaked it).
    key_path = Path('~/.cometml').expanduser().as_posix()
    with open(key_path) as key_file:
        api_key = key_file.read().strip()
    experiment = Experiment(api_key, project_name, disabled=disabled)
    experiment.log_parameter('commit_id', commit_id)
    if comment:
        experiment.log_other('comment', comment)

    # setup model backup dir, named after the project + experiment id so
    # concurrent runs never collide.
    exp_name = project_name + str(experiment.id)
    log_dir = Path(log_dir).expanduser() / exp_name
    if not log_dir.is_dir() and not disabled:
        log_dir.mkdir(0o755)
    self.log_dir = log_dir
    self.comet = experiment
    self.disabled = disabled
def get_comet_logger(self):
    """Create a fresh comet.ml experiment, or resume a previous one when loading.

    On a fresh run the experiment key is persisted to `<logdir>/exp_key` so a
    later run with `--load` can reattach to the same Comet experiment.
    """
    if not self.paras.load:
        # Fresh run: new experiment with Comet's auto-logging turned off.
        comet_exp = Experiment(project_name=COMET_PROJECT_NAME,
                               workspace=COMET_WORKSPACE,
                               auto_output_logging=None,
                               auto_metric_logging=None,
                               display_summary=False,
                               )
        if self.paras.transfer:
            comet_exp.set_name(self.exp_name)
            comet_exp.add_tag(Path(self.ckpdir).parent.name)
            comet_exp.add_tag('transfer')
            comet_exp.add_tag(self.config['data']['corpus']['metas'][0])
        if self.paras.test:
            # Name/tag the run after the output dir and config location.
            comet_exp.set_name(Path(self.paras.outdir).name)
            comet_exp.add_tag(Path(self.paras.config).parents[2].name)
            comet_exp.add_tag('test')
            comet_exp.add_tag(Path(self.paras.config).parent.stem)
            #comet_exp.add_tag(Path(self.paras.outdir).name)
        else:
            comet_exp.add_tag('train')
        # Log the whole config; nested dicts are flattened with their key as prefix.
        for name, param in self.config.items():
            if isinstance(param, dict):
                comet_exp.log_parameters(param, prefix=name)
            else:
                comet_exp.log_parameter(name, param)
        comet_exp.log_other('seed', self.paras.seed)
        # Persist the experiment key so a later --load run can resume it.
        with open(Path(self.logdir, 'exp_key'), 'w') as f:
            print(comet_exp.get_key(), file=f)
    else:
        # Resuming: reattach to the experiment whose key was saved earlier.
        with open(Path(self.logdir, 'exp_key'), 'r') as f:
            exp_key = f.read().strip()
        comet_exp = ExistingExperiment(previous_experiment=exp_key,
                                       project_name=COMET_PROJECT_NAME,
                                       workspace=COMET_WORKSPACE,
                                       auto_output_logging=None,
                                       auto_metric_logging=None,
                                       display_summary=False,
                                       )
    return comet_exp
def _create_experiment(experiment_name=None):
    # type: (Optional[str]) -> BaseExperiment
    """Create a Comet experiment for an MLFlow run.

    Creates an online `Experiment` when an API key is configured, otherwise
    falls back to an `OfflineExperiment` (in a temp dir if no offline
    directory is configured).

    Args:
        experiment_name: Optional display name for the experiment.

    Returns:
        The created (online or offline) experiment object.
    """
    LOGGER.debug("Creating new Experiment for MLFlow, implicit? %r", IMPLICIT_START_RUN)
    global PROJECT_NAME
    api_key = get_config("comet.api_key")
    if api_key:
        from comet_ml import Experiment

        LOGGER.debug("Creating an online Experiment with project name %r", PROJECT_NAME)
        exp = Experiment(api_key, project_name=PROJECT_NAME)
    else:
        # No API key: log offline so the run is not lost.
        offline_dir = get_config("comet.offline_directory")
        LOGGER.info(MLFLOW_OFFLINE_EXPERIMENT_FALLBACK)
        if not offline_dir:
            offline_dir = tempfile.mkdtemp()
        LOGGER.debug(
            "Creating an offline Experiment with project name %r and offline directory %r",
            PROJECT_NAME,
            offline_dir,
        )
        exp = OfflineExperiment(
            project_name=PROJECT_NAME, offline_directory=offline_dir
        )
    if experiment_name:
        exp.set_name(experiment_name)
    # Mark the experiment as created implicitly from MLFlow logger
    exp.log_other("Created from", "MLFlow auto-logger")
    return exp
class CometLogger():
    """Best-effort comet.ml sink for sweetviz HTML reports.

    All failures are reported via print and never raised, so report upload
    problems cannot break report generation.
    """

    def __init__(self):
        global comet_installed
        # _logging is only flipped to True once an Experiment exists.
        self._logging = False
        if comet_installed:
            try:
                self._experiment = Experiment(auto_metric_logging=False,
                                              display_summary_level=0)
                self._experiment.log_other("Created from", "sweetviz!")
                self._logging = True
            # `except Exception` (not bare `except:`) so SystemExit /
            # KeyboardInterrupt still propagate.
            except Exception:
                print(
                    "ERROR: comet_ml is installed, but not configured properly (e.g. check API key setup). HTML reports will not be uploaded."
                )

    def log_html(self, html_content):
        """Upload *html_content* to the experiment; no-op (with message) if unavailable."""
        if self._logging:
            try:
                self._experiment.log_html(html_content)
            except Exception:
                print(
                    "comet_ml.log_html(): error occurred during call to log_html()."
                )
        else:
            print(
                "comet_ml.log_html(): comet_ml is not installed or otherwise ready for logging."
            )

    def end(self):
        """Finalize the experiment; no-op (with message) if unavailable."""
        if self._logging:
            try:
                self._experiment.end()
            except Exception:
                print("comet_ml.end(): error occurred during call to end().")
        else:
            print(
                "comet_ml.end(): comet_ml is not installed or otherwise ready."
            )
def comet_DNN(save_path, embedding_type, bin_labels):
    """Train a small dense-NN baseline on a processed PubChem assay table and log to Comet.

    Args:
        save_path: Path to a pickled activity table, named `AID_xxx_endinfo.pkl`.
        embedding_type: 'MFPMolChars', 'MFP' or 'MolChars' — selects which feature
            columns feed the network.
        bin_labels: If True, remap labels to binary (class 0 vs everything else).
    """
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="DNN_baseline",
                     workspace="gdreiman1")
    # NOTE(review): Experiment.log_code is a method in comet_ml; assigning True
    # here shadows it rather than enabling code logging — confirm intent.
    exp.log_code = True
    exp.log_other(
        'Notes',
        'NN_arch same as exp from 7/6 that had good prec/rec, added .1 bias to elu layers, added pi from fl paper, using binary labels'
    )
    import tensorflow as tf
    tf.enable_eager_execution()
    import pickle
    import pandas as pd
    import numpy as np
    import sklearn
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_fscore_support as prf
    from keras import backend as K
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, GaussianNoise
    from tensorflow.keras.layers import Lambda
    from tensorflow.keras.utils import to_categorical
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from ROC_funcs import single_roc, multi_roc
    '''Comet Saving Zone'''

    def comet_addtional_info(exp, model, save_path, X_test, y_test,
                             embedding_type, model_type):
        # Compute test-set metrics/plots and push everything to the Comet experiment.
        from tensorflow.keras.utils import to_categorical
        NN_test_preds = model.predict(X_test)
        class_rep = sklearn.metrics.classification_report(
            y_test, np.argmax(NN_test_preds, axis=1))
        #print(class_rep)
        if len(set(y_test)) == 2:
            try:
                prec, rec, f_1, supp = prf(y_test,
                                           np.argmax(NN_test_preds, axis=1),
                                           average=None)
                single_roc(NN_test_preds, y_test)
            # NOTE(review): bare except silently drops metric/plot failures —
            # prec/rec/f_1 may be unbound below if this fires.
            except:
                pass
        else:
            try:
                prec, rec, f_1, supp = prf(y_test,
                                           np.argmax(NN_test_preds, axis=1),
                                           average=None)
                multi_roc(NN_test_preds, to_categorical(y_test), '_',
                          len(set(y_test)))
            except:
                pass
        #get AID number
        import ntpath
        #get base file name
        folder, base = ntpath.split(save_path)
        #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        #save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        #save model params
        #exp.log_parameters(trained_mod.get_params())
        #save metrics report to comet
        if len(set(y_test)) == 2:
            for i, name in enumerate(['Active', 'Inactive']):
                exp.log_metric('f1 class ' + name, f_1[i])
                exp.log_metric('Recall class' + name, rec[i])
                exp.log_metric('Precision class' + name, prec[i])
        else:
            for i, name in enumerate(['Active', 'Inconclusive', 'Inactive']):
                exp.log_metric('f1 class ' + str(i), f_1[i])
                exp.log_metric('Recall class' + str(i), rec[i])
                exp.log_metric('Precision class' + str(i), prec[i])
                #exp.log_metric('f1 class '+str(i), f_1[i])
                #exp.log_metric('Recall class'+str(i),rec[i])
                #exp.log_metric('Precision class'+str(i), prec[i])
        exp.log_other('Classification Report', class_rep)
        #save model in data_folder with comet experiement number associated
        # exp_num = exp.get_key()
        # model_save = folder+'\\'+model_type+'_'+exp_num+'.pkl'
        # pickle_on = open(model_save,'wb')
        # pickle.dump(fast_NN,pickle_on)
        # pickle_on.close()
        # #log trained model location
        # exp.log_other('Trained Model Path',model_save)
        #save some informative tags:
        if bin_labels == True:
            label_status = 'binary'
        else:
            label_status = 'multiple'
        tags = [AID, end_info, model_type, label_status]
        exp.add_tags(tags)
        exp.add_tag('4_layer')
        exp.add_tag(embedding_type)
        #save ROC curve
        exp.log_figure(figure_name='ROC-Pres/Recall', figure=plt)
        plt.show()
        exp.end()

    model_type = 'DNN'
    #get data cleaned
    pickle_off = open(save_path, 'rb')
    activity_table = pickle.load(pickle_off)
    pickle_off.close()
    #get length of MFP (Morgan fingerprint length, taken from an arbitrary row)
    fp_length = len(activity_table.iloc[5]['MFP'])
    #simple neural net
    scaler = StandardScaler(copy=False)
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data: single stratified 80/20 split with a fixed seed
    from sklearn.model_selection import StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                      train_size=None, random_state=2562)
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    # n_splits=1, so this loop body runs exactly once.
    for train_ind, test_ind in splitter.split(X_mfp, labels):
        #get start and end index for molchars
        MC_start = activity_table.columns.get_loc('Chi0')
        #need to add 1 bc exclusive indexing
        MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
        # standardize data (scaler fit on train only, then applied to test)
        X_train_molchars_std = scaler.fit_transform(
            np.array(activity_table.iloc[train_ind, MC_start:MC_end]))
        X_test_molchars_std = scaler.transform(
            np.array(activity_table.iloc[test_ind, MC_start:MC_end]))
        if embedding_type == 'MFPMolChars':
            X_train = np.concatenate(
                (X_mfp[train_ind, :], X_train_molchars_std), axis=1)
            X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std),
                                    axis=1)
        elif embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        elif embedding_type == 'MolChars':
            X_train = X_train_molchars_std
            X_test = X_test_molchars_std
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,labels,test_size = .5, shuffle = True, stratify = labels, random_state = 2562)
        #remapping active to 1 and everything else to zero
        bin_y_train, bin_y_test = np.array([
            1 if x == 0 else 0 for x in y_train
        ]), np.array([1 if x == 0 else 0 for x in y_test])
        if bin_labels == True:
            y_test = bin_y_test
            y_train = bin_y_train

        #from https://towardsdatascience.com/handling-imbalanced-datasets-in-deep-learning-f48407a0e758
        def focal_loss(y_true, y_pred):
            # Focal loss for class imbalance; alpha/gamma are hard-coded here.
            gamma = 2.0
            alpha = 2
            pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
            pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
            # pt_1 = K.clip(pt_1, 1e-3, .999)
            # pt_0 = K.clip(pt_0, 1e-3, .999)
            return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum(
                (1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))

        #bias for predictions (prior pi trick from the focal-loss paper)
        fl_pi = 0.01
        final_bias = -np.log((1 - fl_pi) / fl_pi)
        num_labels = len(set(y_test))
        tf.keras.backend.clear_session()
        fast_NN = Sequential(name='quick')
        #fast_NN.add(GaussianNoise(.5))
        fast_NN.add(Dense(512, activation='sigmoid', name='input'))
        #fast_NN.add(Dropout(0.5))
        fast_NN.add(
            Dense(128,
                  activation='relu',
                  name='first',
                  bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        #fast_NN.add(Dropout(0.5))
        fast_NN.add(
            Dense(64,
                  activation='relu',
                  name='second',
                  bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        #fast_NN.add(Dropout(0.5))
        fast_NN.add(
            Dense(16,
                  activation='relu',
                  name='third',
                  bias_initializer=tf.keras.initializers.Constant(value=0.1)))
        #fast_NN.add(Dropout(0.25))
        fast_NN.add(
            Dense(
                num_labels,
                activation='softmax',
                name='predict',
                bias_initializer=tf.keras.initializers.Constant(value=final_bias)))
        fast_NN.compile(loss=[focal_loss],
                        optimizer='adam',
                        metrics=[
                            'categorical_accuracy',
                            tf.keras.metrics.Recall(),
                            tf.keras.metrics.Precision()
                        ])
        fast_NN_model = fast_NN.fit(X_train,
                                    to_categorical(y_train),
                                    validation_data=(X_test,
                                                     to_categorical(y_test)),
                                    epochs=5,
                                    batch_size=500,
                                    shuffle=True,
                                    verbose=0)
        comet_addtional_info(exp, fast_NN, save_path, X_test, y_test,
                             embedding_type, model_type)
args = parser.parse_args() # Ideas # Pretrain network without permuted convolutions. Then train it using permuted/shuffled convolutions ################################################ num_channels_permuted = "5, 10" # model_name = "DenseNet_reduced_1x1_regularized_conv1-2" # model_name = "small_CNN_1x1_3x3_no_bias_LBFGS" model_name = "PermSmallCNN_SGD_LR_0.0001_LRS_no_bias" gpu_id = 3 reg_lambda = 5e-3 ################################################ experiment.add_tag(model_name) experiment.add_tag(num_channels_permuted) experiment.log_other("Network", model_name) experiment.log_other("Dataset", "CIFAR-100") experiment.log_other("Type", model_name) # experiment.log_other("Regularizer", reg_lambda) device = 'cuda:' + str(gpu_id) if torch.cuda.is_available() else 'cpu' # device = 'cpu' best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch train_batch_size = 250 test_batch_size = 250 # Data print('==> Preparing data..') transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4),
# Persist ground-truth labels, save the trained level-1 model, then log the
# run configuration to the Comet experiment.
to_file(y_train, "true_train", y_train)
# NOTE(review): third argument is y_train here too — possibly should be y_dev;
# confirm against to_file's signature.
to_file(y_dev, "true_dev", y_train)
print('saving model')
model.save(os.path.join(DATADIR, 'level1_' + MODEL_NAME))
# experiment.log_figure(figure_name='dev_support', figure=train_support)
# experiment.log_figure(figure_name='train_support', figure=dev_support)
print('logging experiment parameters')
params = {
    "max_sequence_length": MAX_SEQUENCE_LENGTH,
    "embedding_dim": EMBEDDING_DIM,
    "p_threshold": P_THRESHOLD,
    "pos_ratio": POS_RATIO,
    "num_words": NUM_WORDS,
    "datadir": DATADIR,
    "metadata": os.getenv('METADATA_LIST'),
    "Data_since": os.getenv('SINCE_THRESHOLD')
}
# NOTE(review): log_multiple_params is the legacy Comet API name; newer SDKs
# expose log_parameters — confirm the pinned comet_ml version.
experiment.log_multiple_params(params)
experiment.log_other("datadir", DATADIR)
experiment.log_other("metadata", os.getenv('METADATA_LIST'))
experiment.log_other("data_since", os.getenv('SINCE_THRESHOLD'))
# Hash of the training inputs, so runs on different data are distinguishable.
experiment.log_dataset_hash(x_train)
def comet_lgbm(save_path):
    """Train a LightGBM baseline on a processed PubChem assay table and log to Comet.

    Args:
        save_path: Path to a pickled activity table, named `AID_xxx_endinfo.pkl`.
    """
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="baseline", workspace="gdreiman1")
    # NOTE(review): Experiment.log_code is a method in comet_ml; assigning True
    # here shadows it rather than enabling code logging — confirm intent.
    exp.log_code = True
    import pickle
    import pandas as pd
    import lightgbm as lgb
    import numpy as np
    import sklearn
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_fscore_support as prf

    #%%
    def single_roc(y_preds, y_true):
        # Plot ROC and precision/recall curves for the binary case.
        from sklearn.metrics import roc_curve, auc, precision_recall_curve
        fpr, tpr, _ = roc_curve(y_true, y_preds)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        lw = 2
        plt.plot(fpr, tpr, color='darkorange',
                 lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        precision, recall, thresholds = precision_recall_curve(y_true, y_preds)
        plt.plot(recall, precision, color='blue', lw=lw,
                 label='Precision vs Recall')
        # show the plot
        plt.legend(loc="lower right")
        plt.show()

    def multi_roc(y_preds, y_true, name, n_classes):
        # Plot per-class, micro- and macro-averaged ROC curves (one-hot y_true).
        import collections
        nested_dict = lambda: collections.defaultdict(nested_dict)
        data_store = nested_dict()
        from sklearn.metrics import roc_curve, auc
        from scipy import interp
        from itertools import cycle
        lw = 2
        name_store = ['Active', 'Inactive', 'Inconclusive']
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_preds[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        # Compute micro-average ROC curve and ROC area
        # NOTE(review): this uses only the last class's column (y_true[:, i]);
        # the sklearn reference example uses y_true.ravel() — confirm intent.
        fpr["micro"], tpr["micro"], _ = roc_curve(y_true[:, i].ravel(),
                                                  y_preds[:, i].ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # Compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        # Plot all ROC curves
        plt.figure()
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["micro"]),
                 color='deeppink', linestyle=':', linewidth=4)
        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]),
                 color='navy', linestyle=':', linewidth=4)
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                     label='ROC curve of ' + name_store[i] + '(area = {1:0.2f})'
                           ''.format(i, roc_auc[i]))
        plt.plot([0, 1], [0, 1], 'k--', lw=lw)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        #plt.title('Multi-class ROC for '+name+' Split= '+str(count+1))
        plt.title('Multi-class ROC for ' + name)
        plt.legend(loc="lower right")
        #plt.show()

    #%%
    #save_path = r'C:\Users\gdrei\Dropbox\UCL\Thesis\May_13\AID_1345083_processed.pkl'
    model_type = 'lgbm'
    #get data cleaned
    pickle_off = open(save_path, 'rb')
    activity_table = pickle.load(pickle_off)
    pickle_off.close()
    #get length of MFP (Morgan fingerprint length, taken from an arbitrary row)
    fp_length = len(activity_table.iloc[5]['MFP'])
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    scaler = StandardScaler(copy=False)
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data: single stratified 50/50 split with a fixed seed
    from sklearn.model_selection import StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5,
                                      train_size=None, random_state=2562)
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    # n_splits=1, so this loop body runs exactly once.
    for train_ind, test_ind in splitter.split(X_mfp, labels):
        # standardize data (scaler fit on train only, then applied to test)
        X_train_molchars_std = scaler.fit_transform(
            np.array(activity_table.iloc[train_ind, 4:]))
        X_test_molchars_std = scaler.transform(
            np.array(activity_table.iloc[test_ind, 4:]))
        X_train = np.concatenate((X_mfp[train_ind, :], X_train_molchars_std),
                                 axis=1)
        X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std),
                                axis=1)
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,labels,test_size = .5, shuffle = True, stratify = labels, random_state = 2562)
        bin_y_train, bin_y_test = [1 if x == 2 else x for x in y_train], [
            1 if x == 2 else x for x in y_test
        ]
        #do light gbm
        #need to make a lib svm file
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)
        #make model class
        lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31,
                                        max_depth=-1, learning_rate=0.1,
                                        n_estimators=500,
                                        subsample_for_bin=200000,
                                        objective='binary', is_unbalance=True,
                                        min_split_gain=0.0,
                                        min_child_weight=0.001,
                                        min_child_samples=20, subsample=1.0,
                                        subsample_freq=0, colsample_bytree=1.0,
                                        reg_alpha=0.0, reg_lambda=0.0,
                                        random_state=None, n_jobs=-1,
                                        silent=True, importance_type='split')
        #train model
        trained_mod = lgbm_model.fit(X_train, y_train)
        #predict classes and class_probs
        test_class_preds = lgbm_model.predict(X_test)
        test_prob_preds = lgbm_model.predict_proba(X_test)
        #calculate Class report
        class_rep = sklearn.metrics.classification_report(y_test,
                                                          test_class_preds)
        print(class_rep)
        if len(set(y_test)) == 2:
            single_roc(test_prob_preds[:, 1], y_test)
            prec, rec, f_1, supp = prf(y_test, test_class_preds, average=None)
        else:
            from tensorflow.keras.utils import to_categorical
            multi_roc(test_prob_preds, to_categorical(y_test), '', 3)
            prec, rec, f_1, supp = prf(y_test, test_class_preds, average=None)
        #%%
        '''Comet Saving Zone'''
        #get AID number
        import ntpath
        #get base file name
        folder, base = ntpath.split(save_path)
        #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
        AID, _, end_info = base.rpartition('_')
        #save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        #save model params
        exp.log_parameters(trained_mod.get_params())
        #save metrics report to comet
        if len(f_1) == 2:
            for i, name in enumerate(['Active', 'Inactive']):
                exp.log_metric('f1 class ' + name, f_1[i])
                exp.log_metric('Recall class' + name, rec[i])
                exp.log_metric('Precision class' + name, prec[i])
        else:
            for i, name in enumerate(['Active', 'Inconclusive', 'Inactive']):
                exp.log_metric('f1 class ' + str(i), f_1[i])
                exp.log_metric('Recall class' + str(i), rec[i])
                exp.log_metric('Precision class' + str(i), prec[i])
                #exp.log_metric('f1 class '+str(i), f_1[i])
                #exp.log_metric('Recall class'+str(i),rec[i])
                #exp.log_metric('Precision class'+str(i), prec[i])
        exp.log_other('Classification Report', class_rep)
        #save model in data_folder with comet experiement number associated
        exp_num = exp.get_key()
        model_save = folder + '\\' + model_type + '_' + exp_num + '.pkl'
        pickle_on = open(model_save, 'wb')
        pickle.dump(trained_mod, pickle_on)
        pickle_on.close()
        #log trained model location
        exp.log_other('Trained Model Path', model_save)
        #save some informative tags:
        tags = [AID, end_info, model_type]
        exp.add_tags(tags)
        #save ROC curve
        exp.log_figure(figure_name='ROC-Pres/Recall', figure=plt)
        plt.show()
        #tell comet that the experiement is over
        exp.end()
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Sep 8 20:47:11 2019 @author: gabriel """ '''Make the final graph: all hyper parameters tuned''' from comet_ml import Experiment exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_plotting", workspace="gdreiman1", disabled=False) exp.log_code = True exp.log_other( 'Hypothesis', '''These are my plots combining the GCNN 100 epoch random run with other classifiers also with random selection and 5% ''' ) import pickle import os import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import numpy as np data_dir = '/home/gabriel/Dropbox/UCL/Thesis/Data' first8 = 'tuned_7_svmmod0.pkl' second8 = 'tuned_7_svmmod1.pkl' third8 = 'tuned_7_svmmod2.pkl' from iter_plot_help_funcs import find_active_percents, plot_metrics, plot_prec_rec_curve, plot_prec_rec_vs_tresh, get_checkpoint35, set_sns_pal def get_35_tune(pathlist, sizes, expr_num):
def very_simple_param_count(model):
    """Return the total number of elements across all parameters of *model*.

    Args:
        model: Any object exposing `parameters()` yielding tensors with `numel()`.

    Returns:
        int: Sum of `numel()` over all parameters (trainable or not).
    """
    # Generator expression avoids materializing an intermediate list and the
    # pointless temporary variable of the original.
    return sum(p.numel() for p in model.parameters())


if __name__ == "__main__":
    # Script entry point: set up the Comet experiment and resolve paths/dataset.
    experiment = Experiment(project_name=PROJECT_NAME, api_key=COMET_ML_API)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    args = my_args_parse()
    print(args)

    experiment.set_name(args.task_id)
    experiment.set_cmd_args()
    experiment.log_other("note", args.note)

    DATA_PATH = args.input
    TRAIN_PATH = os.path.join(DATA_PATH, "train_data")
    TEST_PATH = os.path.join(DATA_PATH, "test_data")
    dataset_name = args.datasetname
    if dataset_name == "shanghaitech":
        print("will use shanghaitech dataset with crop ")
    elif dataset_name == "shanghaitech_keepfull":
        print("will use shanghaitech_keepfull")
    else:
        print("cannot detect dataset_name")
    print("current dataset_name is ", dataset_name)
    # create list
    train_list = create_image_list(TRAIN_PATH)
def run_cycle(args, metadata, train_dataloader, train_dataset,
              validation_dataloader, test_dataloader, experiment_path, run_id):
    """Train, validate and test an EntityNetwork for one full run.

    Trains for `args.epochs` epochs, checkpoints every epoch, restores the
    checkpoint with the lowest validation loss and evaluates it on the test set.

    Returns:
        (test_loss, test_qa_loss, test_supp_facts_loss, test_accuracy,
         test_f1, best_epoch)
    """
    # Comet (optional, gated on args.comet_logging)
    if args.comet_logging:
        from comet_ml import Experiment
        experiment = Experiment(api_key="GKIWhJ0lS0N674H48YQVMVNgV",
                                project_name="thesis",
                                workspace="rpalma")
        experiment.log_parameters(vars(args))
        experiment.log_other("run_id", run_id)

    # Build the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device, "\n")
    entnet = EntityNetwork(
        embeddings_size=args.embeddings_size,
        vocab_size=metadata["vocab_size"],
        answers_vocab_size=metadata["answers_vocab_size"],
        sentences_length=metadata["max_sentence_length"],
        queries_length=metadata["max_query_length"],
        n_blocks=args.n_blocks,
        output_module=args.output_module,
        output_inner_size=args.output_inner_size,
        temporal_attention_to_sentence=args.temporal_attention_to_sentence,
        temporal_activation=args.temporal_activation,
        temporal_attention=args.temporal_attention,
        dropout_prob=args.dropout_prob,
        temporal_attention_module=args.temporal_attention_module,
        device=device)
    entnet.to(device)
    print("Trainable parameters:",
          sum(p.numel() for p in entnet.parameters() if p.requires_grad), "\n")
    print("Output module:", entnet.output_module, "\n")

    # Set up the loss and optimizer. The supporting-facts loss is weighted by
    # the dataset's negative/positive ratio to counter class imbalance.
    qa_criterion = nn.CrossEntropyLoss()
    supp_facts_criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.Tensor([metadata["neg_pos_ratio"]]).to(device))
    optimizer = torch.optim.Adam(entnet.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    schedulers = {
        "step": torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.decay_period,
                                                gamma=args.decay_rate),
        "cyclical": CyclicalLRScheduler(optimizer,
                                        min_lr=args.min_lr,
                                        max_lr=args.max_lr,
                                        cycle_length=args.cycle_period)
    }
    scheduler = schedulers[args.lr_scheduler]
    print("Scheduler:", scheduler, "\n")
    optimizer.zero_grad()

    # Build the writers (one TensorBoard writer per split, per run)
    train_writer = SummaryWriter(
        os.path.join(experiment_path, "train-%d" % run_id))
    val_writer = SummaryWriter(
        os.path.join(experiment_path, "validation-%d" % run_id))
    test_writer = SummaryWriter(
        os.path.join(experiment_path, "test-%d" % run_id))

    def run_epoch(dataloader, should_train, should_teach_force,
                  should_teach_force_answer, summaries_writer, experiment,
                  experiment_context, epoch, quiet):
        # Run one epoch over `dataloader`; trains when should_train, otherwise
        # only evaluates. Returns the epoch's mean losses and metrics.
        losses = []
        qa_losses = []
        qa_targets = []
        qa_predictions = []
        supp_facts_losses = []
        supp_facts_targets = []
        supp_facts_predictions = []
        entnet.train(mode=should_train)
        for batch in tqdm(dataloader) if not quiet else dataloader:
            story = batch["story"].to(device)
            query = batch["query"].to(device)
            qa_target = batch["answer"].to(device)
            supp_facts_target = batch["supporting"].float().to(device)
            story_mask = batch["story_mask"].float().to(device)
            # Teacher forcing optionally feeds the gold supporting facts /
            # answers back into the network.
            qa_predicted, supp_facts_alignment, supp_facts_attention = entnet(
                story,
                story_mask,
                query,
                supporting_facts=supp_facts_target
                if should_teach_force else None,
                answers=qa_target if should_teach_force_answer else None)
            qa_loss = qa_criterion(qa_predicted, qa_target)
            supp_facts_loss = supp_facts_criterion(supp_facts_alignment,
                                                   supp_facts_target)
            loss = args.qa_lambda * qa_loss + args.supporting_facts_lambda * supp_facts_loss
            if should_train:
                loss.backward()
                nn.utils.clip_grad_norm_(entnet.parameters(),
                                         args.gradient_clipping)
                optimizer.step()
                optimizer.zero_grad()
            losses.append(loss.item())
            qa_losses.append(qa_loss.item())
            qa_targets.append(qa_target.tolist())
            qa_predictions.append(qa_predicted.argmax(dim=1).tolist())
            supp_facts_losses.append(supp_facts_loss.item())
            supp_facts_targets.append(supp_facts_target.tolist())
            supp_facts_predictions.append(supp_facts_attention.tolist())
        if should_train:
            # Print one sample from the last batch as a qualitative check.
            translated_story, translated_query, translated_answer = train_dataset.translate_story(
                story[-1], query[-1], qa_target[-1])
            print("\nSTORY:", translated_story)
            print("QUERY:", translated_query)
            print("ANSWER:", translated_answer)
            print("\nSupporting facts:", supp_facts_target[-1, :])
            print("Attended:", supp_facts_attention[-1, :], "\n")
        mean_loss = np.mean(losses)
        mean_qa_loss = np.mean(qa_losses)
        mean_supp_facts_loss = np.mean(supp_facts_losses)
        mean_qa_accuracy = accuracy(qa_targets, qa_predictions)
        mean_supp_facts_f1 = f1(supp_facts_targets, supp_facts_predictions)
        # Write summaries
        write_summaries(mean_loss, mean_qa_loss, mean_supp_facts_loss,
                        mean_qa_accuracy, mean_supp_facts_f1,
                        supp_facts_targets, supp_facts_predictions,
                        entnet.named_parameters(), summaries_writer, epoch)
        if experiment is not None:
            # Log under the given comet context (train/validate/test).
            with experiment_context():
                metrics = {
                    "loss": mean_loss,
                    "qa_loss": mean_qa_loss,
                    "supp_facts_loss": mean_supp_facts_loss,
                    "qa_accuracy": mean_qa_accuracy,
                    "supp_facts_f1": mean_supp_facts_f1
                }
                experiment.log_metrics(metrics, step=epoch)
                experiment.log_epoch_end(args.epochs, step=epoch)
        return mean_loss, mean_qa_loss, mean_supp_facts_loss, mean_qa_accuracy, mean_supp_facts_f1

    best_val_loss = float("inf")
    best_epoch = 0
    for epoch in range(1, args.epochs + 1):
        # Training epoch
        train_loss, train_qa_loss, train_supp_facts_loss, train_qa_accuracy, train_supp_facts_f1 = run_epoch(
            train_dataloader,
            should_train=True,
            should_teach_force=args.teach_force_training,
            should_teach_force_answer=args.teach_force_answer_training,
            summaries_writer=train_writer,
            experiment=experiment if args.comet_logging else None,
            experiment_context=experiment.train if args.comet_logging else None,
            epoch=epoch,
            quiet=False)
        print(
            "Epoch = %d.%d; task_id = %d\n\ttrain QA accuracy = %.5f; train QA error = %.5f; train supp. facts F1 = %.5f; train loss = %.5f; train QA loss = %.5f; train supp. facts loss = %.5f"
            % (run_id, epoch, args.task_id, train_qa_accuracy,
               1 - train_qa_accuracy, train_supp_facts_f1, train_loss,
               train_qa_loss, train_supp_facts_loss))
        # Validation
        with torch.no_grad():
            val_loss, _, _, val_accuracy, val_f1 = run_epoch(
                validation_dataloader,
                should_train=False,
                should_teach_force=args.teach_force_evaluation,
                should_teach_force_answer=args.teach_force_answer_evaluation,
                summaries_writer=val_writer,
                experiment=experiment if args.comet_logging else None,
                experiment_context=experiment.validate
                if args.comet_logging else None,
                epoch=epoch,
                quiet=True)
        print(
            "\tval QA accuracy = %.5f; val QA error = %.5f; val loss = %.8f; val F1 = %.8f"
            % (val_accuracy, 1 - val_accuracy, val_loss, val_f1), "\n")
        # Checkpoint every epoch; track the epoch with the best val loss.
        save_model(entnet.state_dict(), experiment_path, run_id, epoch)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
        # Update learning rate
        scheduler.step()

    # Model evaluation: restore the best checkpoint before testing.
    entnet.load_state_dict(load_model(experiment_path, run_id, best_epoch))
    with torch.no_grad():
        test_loss, test_qa_loss, test_supp_facts_loss, test_accuracy, test_f1 = run_epoch(
            test_dataloader,
            should_train=False,
            should_teach_force=args.teach_force_evaluation,
            should_teach_force_answer=args.teach_force_answer_evaluation,
            summaries_writer=test_writer,
            experiment=experiment if args.comet_logging else None,
            experiment_context=experiment.test if args.comet_logging else None,
            epoch=best_epoch,
            quiet=False)
    print(
        "Epoch = %d.%d\n\ttest accuracy = %.5f; test error = %.5f; test loss = %.8f; test F1 = %.8f"
        % (run_id, best_epoch, test_accuracy, 1 - test_accuracy, test_loss,
           test_f1), "\n")
    return test_loss, test_qa_loss, test_supp_facts_loss, test_accuracy, test_f1, best_epoch
def victim(kwargs=None):
    """Train victim networks on poisoned inputs pulled from a crafting run.

    Loops over trials (forever when args.ntrial is None) and over
    args.craftsteps; each (trial, craftstep) pair gets its own Comet
    Experiment recording parameters, per-step train/validation metrics and,
    optionally, hidden-layer features and weights.

    Args:
        kwargs(dict): Optional name->value overrides injected into module
            globals; when present, entries of `argsmod` are also copied
            onto `args`.

    NOTE(review): depends on many module-level names (args, argsmod, api,
    cometconfig, localrank, meta, sess, xtrain, ytrain, xvalid, yvalid, ...)
    defined elsewhere in this file — confirm before reusing in isolation.
    """

    def comet_pull_poison(craftstep):
        # Fetch the pickled poison-input asset for this craftstep from the
        # crafting experiment, retrying up to 5 times (5 s apart).
        # Returns the first args.npoison inputs, or None if all pulls fail.
        for attempt in range(5):
            try:
                bytefile = craftexpt.get_asset(assets[craftstep])
                if localrank == 0:
                    print('==> poisoninputs-{} pulled'.format(craftstep))
                # SECURITY NOTE(review): pickle.loads on remotely fetched
                # bytes — only safe if the crafting experiment is trusted.
                poisoninputs = pickle.loads(bytefile)
                return poisoninputs[:args.npoison]
            except:
                print(f'WARNING: comet pull attempt for craftstep {craftstep} failed on attempt {attempt}')
                sleep(5)

    # Optional override mechanism: push caller-supplied values into module
    # globals, then mirror `argsmod` entries onto args.
    if kwargs is not None:
        for key in kwargs:
            globals()[key] = kwargs[key]
        for key in argsmod:
            setattr(args, key, argsmod[key])

    # Locate the crafting experiment and index its poison-input assets by step.
    craftexpt = api.get_experiment(cometconfig["workspace"], args.craftproj, args.craftkey)
    assets = {asset['step']: asset['assetId'] for asset in craftexpt.get_asset_list() if 'poisoninputs-' in asset['fileName']}
    print('==> begin victim train')
    trial = 0
    while args.ntrial is None or trial < args.ntrial:
        for craftstep in args.craftsteps:
            # One Comet experiment per (trial, craftstep) victim training run.
            experiment = Experiment(project_name=args.victimproj, auto_param_logging=False, auto_metric_logging=False, parse_args=False)
            experiment.log_parameters(vars(args))
            experiment.set_name(f'{args.craftkey[:5]}-{experiment.get_key()[:5]}')
            experiment.add_tag(args.tag)
            # experiment.add_tag(args.Xtag)
            experiment.log_parameters(dict(craftstep=craftstep, trial=trial))
            experiment.log_other('crafturl', craftexpt.url)
            experiment.log_other('command', 'python ' + ' '.join(sys.argv))
            if localrank == 0: print_command_and_args(args); print('crafturl: ' + craftexpt.url)
            if 'victim.py' in sys.argv[0]:
                # Pull the poison for this craftstep; skip the step entirely
                # if the download failed after all retries.
                poisoninputs = comet_pull_poison(craftstep)
                if poisoninputs is None: experiment.end(); print(f'skipping craftstep {craftstep}'); continue
                if args.savepoisondataset:
                    package_poisoned_dataset(poisoninputs, xtrain, ytrain, xtarget, ytarget, ytargetadv, xvalid, yvalid, args, craftstep); experiment.end(); continue
                # meta.init_weights(sess, pretrain_weights) # what we had before
                meta.global_initialize(args, sess)
                meta.poisoninputs.load(poisoninputs, sess)
            trainstep = 0
            for epoch in range(args.nvictimepoch):
                tic = time()
                lrnrate = lr_schedule(args.lrnrate, epoch, args.warmupperiod, args.schedule)
                # log hidden layer features (last epoch only)
                if args.logfeat and epoch == args.nvictimepoch - 1:
                    feats = []
                    for victimfeed in feeddict_generator(xtrain, ytrain, lrnrate, meta, args, victim=True):
                        hiddens = sess.run(meta.hiddens, victimfeed)
                        for i, hidden in enumerate(hiddens):
                            if len(feats) <= i:
                                feats.append(defaultdict(list))
                            # Flatten each activation to (batch, features).
                            feat = np.reshape(hidden, [-1, np.prod(hidden.shape[1:])])
                            appendfeats(feats[i], feat, victimfeed, ybase, ytarget, args.batchsize)
                    for i, feats_layer in enumerate(feats):
                        comet_log_asset(experiment, f'feats_layer{i}', feats_layer, step=epoch)
                # log validation acc at a quadratically spaced subset of
                # epochs (denser sampling near the end of training)
                if epoch in np.round((args.nvictimepoch - 1) * np.linspace(0, 1, args.nvalidpoints) ** 2):
                    resVs = []
                    # validation
                    for _, validfeed, _ in feeddict_generator(xvalid, yvalid, lrnrate, meta, args, valid=True):
                        resV = sess.run(meta.resultV, validfeed)
                        resVs.append(resV)
                    experiment.log_metrics(avg_n_dicts(resVs), step=trainstep)
                # train one epoch, logging train metrics every 200 steps
                for victimfeed in feeddict_generator(xtrain, ytrain, lrnrate, meta, args, victim=True):
                    _, resL = sess.run([meta.trainop, meta.resultL,], victimfeed)
                    if not trainstep % 200:
                        experiment.log_metrics(resL, step=trainstep)
                    trainstep += 1
                experiment.log_metric('elapsed', time() - tic, step=trainstep)
                if args.saveweights:
                    comet_log_asset_weights_and_buffers(epoch, experiment, meta, sess)
                # Console progress line every 20 epochs (rank 0 only).
                # NOTE(review): resV/resL are loop-carried from above; resV is
                # only bound once a validation pass has run — confirm epoch 0
                # always triggers validation (linspace starts at 0).
                if not epoch % 20 and localrank == 0:
                    print(' | '.join([f'{args.craftkey[:5]}-{args.tag} | trial-{trial} | craftstep-{craftstep} | epoch {epoch} | elapsed {round(time() - tic, 2)}'] + [f'{key} {trunc_decimal(val)}' for key, val in resL.items() if 'class' not in key] + [f'{key} {trunc_decimal(val)}' for key, val in resV.items() if 'class' not in key]))
            experiment.end()
        trial += 1
https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/ or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter) can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of compounds as target '''
#%%
'''import'''
# NOTE: comet_ml must be imported before any ML framework (tensorflow below)
# for its auto-logging hooks to attach.
from comet_ml import Experiment
# SECURITY NOTE(review): hard-coded Comet API key committed to source — move
# it to an environment variable or untracked config file, and rotate the key.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled=False)
# NOTE(review): this sets a plain attribute; code logging is normally
# requested via Experiment(log_code=True) — confirm this has any effect.
exp.log_code = True
# Free-text description of what this run is testing.
exp.log_other( 'Hypothesis', '''15% start, 5% iter, all random, svm hinge loss and sigmoidinstead of calibrated cv''' )
# Base name used for this run's saved results file.
exper_file_name = 'tuned_3_svmmod_sigmoid'
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
# Project-local helpers: data scaling, per-model training, metric persistence.
from Iterative_help_funcs_tuned import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
#from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF info/warning chatter (TF 1.x logging API).
tf.logging.set_verbosity(tf.logging.ERROR)
# Unpack experiment configuration.
model_name = experiment_params['model_name']
training = experiment_params['training']
testing = experiment_params['testing']
save_model = experiment_params['save_model']
load_model = experiment_params['load_model']
init_ckpt_file = experiment_params['init_ckpt_file']

# Set up comet experiment
# experiment = Experiment(project_name="sentence-encoding-for-da", workspace="nathanduran", auto_output_logging='simple')
experiment = Experiment(auto_output_logging='simple', disabled=True)
experiment.set_name(experiment_name)

# Log parameters (also mirrored into "others" so they show in that tab).
experiment.log_parameters(model_params)
experiment.log_parameters(experiment_params)
for key, value in experiment_params.items():
    experiment.log_other(key, value)

# Data set and output paths
dataset_name = 'token_dataset' if experiment_params['to_tokens'] else 'text_dataset'
dataset_dir = os.path.join(task_name, dataset_name)
output_dir = os.path.join(task_name, experiment_name)
checkpoint_dir = os.path.join(output_dir, 'checkpoints')
embeddings_dir = 'embeddings'

# Create appropriate directories if they don't exist.
# FIX: os.makedirs(..., exist_ok=True) replaces the exists()+mkdir pair —
# avoids the check-then-create race and also creates any missing parents.
for directory in [task_name, dataset_dir, output_dir, checkpoint_dir, embeddings_dir]:
    os.makedirs(directory, exist_ok=True)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 6 17:40:01 2019

@author: gabriel
"""
'''Plotting Iter_7'''
from comet_ml import Experiment

# SECURITY NOTE(review): hard-coded Comet API key committed to source — move
# to an environment variable or untracked config file, and rotate the key.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_plotting", workspace="gdreiman1", disabled=False)
# NOTE(review): sets a plain attribute; code logging is normally requested
# via Experiment(log_code=True) — confirm this has any effect.
exp.log_code = True
exp.log_other('Hypothesis', '''These are my plots from the intial iterations Iter_7 ''')
import pickle
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the pickled GCNN iteration-run results.
data_dir = '/home/gabriel/Dropbox/UCL/Thesis/Data'
gcnn_initial = 'second_diverse_GCNN_50epoch_iter_run.pkl'
save_path = os.path.join(data_dir, gcnn_initial)
# FIX: context manager guarantees the file handle is closed even when
# pickle.load raises (previously bare open()/close() with no try/finally).
with open(save_path, 'rb') as pickle_off:
    gcnn_initial = pickle.load(pickle_off)
from iter_plot_help_funcs import find_active_percents, plot_metrics, plot_prec_rec_curve, plot_prec_rec_vs_tresh, plot_avg_percent_found, set_sns_pal
# Log held-out test metrics plus the grid-search winners (best_b, best_p)
# for this fold to Comet.
experiment.log_metric("test_accuracy", accuracy_score(y_true, y_pred))
experiment.log_metric("beta", best_b)
experiment.log_metric("neurons", best_p)
experiment.log_confusion_matrix(matrix=confusion_matrix(y_true, y_pred).tolist(), labels=oDataSet.labelsNames)
# model.save('model.h5')
# experiment.log_asset("model.h5")
# Persist only the weights and attach them to the experiment as an asset.
model.save_weights('model.weights')
experiment.log_asset("model.weights")
print(accuracy_score(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
# Stash results and hyper-parameter grid on the fold's data object.
oData.confusion_matrix = confusion_matrix(y_true, y_pred)
oData.model = model
oData.params = {
    "k_fold": K_FOLD,
    "GRID_RESULT": grid_result,
    "GRID_VALUES_NEURON": GRID_NEURON,
    "GRID_VALUES_BETA": GRID_B,
    "LEARNING RATE": LEARNING_RATE,
    "EPOCHS": epochs
}
experiment.log_other("params", oData.params)
# Re-score on the training indexes to log train accuracy.
# NOTE(review): y_pred/y_true are rebound here — any later use sees the
# training-set values, not the test-set ones logged above.
y_pred = model.predict(oDataSet.attributes[oData.Training_indexes]).argmax(axis=1)
y_true = oDataSet.labels[oData.Training_indexes]
experiment.log_metric("train_accuracy", accuracy_score(y_true, y_pred))
experiment.end()
oDataSet.append(oData)
grid_result[g1, g2, k_slice] = accuracy_score(y_true, y_pred) # print(grid_result) k_slice += 1 print(grid_result) model, bests = fit(oDataSet.attributes[oData.Training_indexes], oDataSet.labels[oData.Training_indexes], LEARNING_RATE, epochs, 0.2, 0.1, 0.7) y_pred = model._predict(oDataSet.attributes[oData.Testing_indexes]).argmax( axis=1).T.tolist()[0] y_true = oDataSet.labels[oData.Testing_indexes] bests = [x.fitness for x in bests] plt.plot(bests) plt.show() experiment.log_other("pesos", str(model.genes)) experiment.log_metric("test_accuracy", accuracy_score(y_true, y_pred)) experiment.log_metric("beta", LEARNING_RATE) experiment.log_metric("neurons", epochs) experiment.log_confusion_matrix(matrix=confusion_matrix(y_true, y_pred).tolist(), labels=oDataSet.labelsNames) print(accuracy_score(y_true, y_pred)) print(confusion_matrix(y_true, y_pred)) oData.confusion_matrix = confusion_matrix(y_true, y_pred) oData.model = model oData.params = { "k_fold": K_FOLD, "GRID_RESULT": grid_result, "GRID_VALUES_NEURON": GRID_NEURON, "GRID_VALUES_BETA": GRID_B,
# Cross-validate one autoencoder per dataset (3 total), logging per-dataset
# mean scores and a grand mean to the shared Comet experiment.
groups = data_reader.groups
all_scores = []
for i in range(3):
    ae = Autoencoder(config[i]["encoder"], config[i]["decoder"], input_shape=input_shapes[i], latent_shape=latent_shape, loss="mean_squared_error", optimizer_params=None)
    experiment.log_multiple_params(config[i])
    # Grouped 4-fold CV; scores' exact shape depends on cross_validate —
    # TODO confirm (assumed: one score per split).
    scores = ae.cross_validate(data[i], groups, experiment=experiment, epochs=10000, n_splits=4, log_prefix=f"dataset_{i}_")
    all_scores.append(scores)
    mean_scores = np.mean(scores)
    experiment.log_metric(f"mean_scores_{i}", mean_scores)
    experiment.log_other(f"scores_{i}", scores)
# Grand mean across all datasets' split scores.
# NOTE(review): f-string below has no placeholders — plain string would do.
experiment.log_metric(f"mean_all_scores", np.mean(all_scores))
print(all_scores)
'''Select data set, do smart sampling either rdkit: https://www.rdkit.org/docs/source/rdkit.ML.Cluster.Butina.html https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/ or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter) can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of compounds as target ''' #%% '''import''' from comet_ml import Experiment exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled=False) exp.log_code = True exp.log_other('Hypothesis', '''15% start 10% iter, diverse svm hinge loss''') exper_file_name = 'tuned_4_svmmod_corrected' import pickle, sys, os from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker import numpy as np from Iterative_help_funcs_tuned import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier from imblearn.over_sampling import RandomOverSampler #choosing a 3:1 Inactive to Active ratio ros = RandomOverSampler(sampling_strategy=0.33) import pandas as pd from joblib import Parallel, delayed #from joblib.externals.loky import set_loky_pickler from joblib import parallel_backend import tensorflow as tf tf.logging.set_verbosity(tf.logging.ERROR)
# Build the second (classic CNN) model from the checkpoint and run the
# paired evaluation against model1.
model2 = ClassicNet(hyperparameters['filters'], hyperparameters['layers'])
model2 = model2.cuda()
model2.load_state_dict(checkpoint2['state_dict'])
evaluate(dataloader, model1, model2)

# Optional Comet logging — only active when an API key was supplied.
experiment = None
if args.api_key:
    # Derive project/experiment names from the directory layout.
    project_dir, experiment_name = split(dirname(realpath(__file__)))
    project_name = basename(project_dir)
    experiment = Experiment(api_key=args.api_key, project_name=project_name, auto_param_logging=False, auto_metric_logging=False, parse_args=False)
    experiment.log_other('experiment_name', experiment_name)
    experiment.log_parameters(vars(args))
    for k in hyperparameters:
        # FIX: isinstance instead of `type(...) == dict` — idiomatic and
        # accepts dict subclasses.
        if isinstance(hyperparameters[k], dict):
            experiment.log_parameters(hyperparameters[k], prefix=k)
        else:
            experiment.log_parameter(k, hyperparameters[k])

# Load the training set, extracting trainset.zip on first run.
try:
    dataset = torchvision.datasets.ImageFolder(root='./trainset')
except Exception:
    # FIX: narrowed from a bare `except:` (which also swallowed SystemExit /
    # KeyboardInterrupt). This is a deliberate best-effort fallback: extract
    # the archive (context manager closes it even on error) and retry once,
    # letting any second failure propagate.
    import zipfile
    with zipfile.ZipFile('trainset.zip', 'r') as zip_ref:
        zip_ref.extractall()
    dataset = torchvision.datasets.ImageFolder(root='./trainset')
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter) can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of compounds as target '''
#%%
'''import'''
# NOTE: comet_ml must be imported before any ML framework (tensorflow below)
# for its auto-logging hooks to attach.
from comet_ml import Experiment
# SECURITY NOTE(review): hard-coded Comet API key committed to source — move
# it to an environment variable or untracked config file, and rotate the key.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled=False)
# NOTE(review): sets a plain attribute; code logging is normally requested
# via Experiment(log_code=True) — confirm this has any effect.
exp.log_code = True
# Free-text description of what this run is testing.
exp.log_other( 'Hypothesis', 'Comparing diverse with smaller iter sizes to random with small itersize')
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
# Project-local helpers: data scaling, per-model training, metric persistence.
from Iterative_help_funcs import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF info/warning chatter (TF 1.x logging API).
tf.logging.set_verbosity(tf.logging.ERROR)
else:
    # Fallback when no known dataset matched the name checks above.
    print("cannot detect dataset_name")
print("current dataset_name is ", dataset_name)
# create list
train_list = create_image_list(TRAIN_PATH)
test_list = create_image_list(TEST_PATH)
# create data loader
# NOTE(review): train_list is passed twice — presumably the second copy backs
# train_loader_for_eval (same data, eval transforms); confirm in get_dataloader.
train_loader, train_loader_for_eval, test_loader = get_dataloader(train_list, train_list, test_list, dataset_name=dataset_name, batch_size=args.batch_size)
print("len train_loader ", len(train_loader))
# model: dispatch on the CLI-selected architecture name.
model_name = args.model
experiment.log_other("model", model_name)
if model_name == "M1":
    model = M1()
elif model_name == "M2":
    model = M2()
elif model_name == "M3":
    model = M3()
elif model_name == "M4":
    model = M4()
elif model_name == "CustomCNNv2":
    model = CustomCNNv2()
elif model_name == "BigTailM1":
    model = BigTailM1()
elif model_name == "BigTailM2":
    model = BigTailM2()
elif model_name == "BigTail3":
@author: gabriel """
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 18 15:07:55 2019

@author: gabriel
"""
from comet_ml import Experiment
# SECURITY NOTE(review): hard-coded Comet API key committed to source — move
# it to an environment variable or untracked config file, and rotate the key.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_plotting", workspace="gdreiman1", disabled = False )
# NOTE(review): sets a plain attribute; code logging is normally requested
# via Experiment(log_code=True) — confirm this has any effect.
exp.log_code = True
exp.log_other('Hypothesis','''These are my plots from the intial iterations "Iter_2" and "Iter_3" ''')
import pickle
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load the pickled random- and diverse-sampling run results.
# NOTE(review): open()/close() without try/finally leaks the handle if
# pickle.load raises — a `with open(...)` block would be safer.
data_dir = '/home/gabriel/Dropbox/UCL/Thesis/Data'
random_run = 'first_random_9iter_run.pkl'
diverse_run = 'first_diverse_9iter_run.pkl'
save_path = os.path.join(data_dir,random_run)
pickle_off = open(save_path,'rb')
random_run=pickle.load(pickle_off)
pickle_off.close()
save_path = os.path.join(data_dir,diverse_run)
def test_autoencoder():
    """Smoke-test the Autoencoder on flattened MNIST.

    Trains briefly, logs the configuration and RMSE to Comet, prints the
    score, and displays a figure with five reconstructions (top row) above
    their original inputs (bottom row).
    """
    # Load MNIST, scale pixels to [0, 1], flatten each 28x28 image to 784.
    (train_x, _), (test_x, _) = mnist.load_data()
    train_x = train_x.astype('float32') / 255.
    test_x = test_x.astype('float32') / 255.
    train_x = train_x.reshape((len(train_x), np.prod(train_x.shape[1:])))
    test_x = test_x.reshape((len(test_x), np.prod(test_x.shape[1:])))

    # Encoder spec: Dense(256, relu) -> BatchNorm -> Dropout(0) ->
    # Dense latent layer (sigmoid, unregularized l1).
    encoder_layers = [
        {"kwargs": {"activation": "relu", "units": 256},
         "name": "hidden1", "type": "Dense"},
        {"name": "batchnorm", "type": "BatchNormalization"},
        {"kwargs": {"rate": 0}, "name": "dropout", "type": "Dropout"},
        {"kwargs": {"activation": "sigmoid"}, "name": "latent",
         "regularizer": {"type": "l1", "value": 0}, "type": "Dense"},
    ]
    config = {"encoder": encoder_layers}

    bottleneck = 32
    latent_shape = (bottleneck, )
    input_shape = (train_x.shape[1], )
    print(latent_shape)
    print(input_shape)

    # No explicit decoder: passing None lets Autoencoder build the mirror.
    ae = Autoencoder(config["encoder"], None, input_shape=input_shape,
                     latent_shape=latent_shape, loss="mean_squared_error",
                     optimizer_params=None)

    experiment = Experiment(project_name="MNIST test",
                            api_key="50kNmWUHJrWHz3FlgtpITIsB1")
    experiment.log_parameter("Experiment name", "Testing ae")
    experiment.log_multiple_params(config)
    experiment.log_parameter("Latent dim", latent_shape[0])

    ae.fit(train_x, batch_size=1000, epochs=5, validation_data=test_x)

    # Reconstruction RMSE over the whole test set.
    recon = ae.predict(test_x)
    rmse = np.sqrt(((recon - test_x) ** 2).mean())
    experiment.log_other("scores", rmse)
    print(rmse)
    print(recon.shape)

    # Visual sanity check: reconstructions on top, originals underneath.
    recon_imgs = recon.reshape(-1, 28, 28)
    fig = plt.figure()
    for idx, img in enumerate(recon_imgs[:5]):
        fig.add_subplot(2, 5, idx + 1)
        plt.imshow(img)
        plt.axis('off')
        fig.add_subplot(2, 5, idx + 6)
        plt.imshow(test_x[idx].reshape(28, 28))
        plt.axis('off')
    plt.show()
# read corpus and create dataloaders corpus = Corpus(args.task, seq2seq, max_len=args.outliers) train_loader, valid_loader, test_loader = load_data(corpus, batch_size=args.batch_size, sample=args.truncated_training, model=args.model) size_vocab = len(corpus.word2id) # will be 1 in bert, not a real count because we use their tokenizer and pretrained vocab size # beware that labels2id has an extra symbol for padding that is not a tag, # so we subtract 1 from number of labels n_labels = len(corpus.label2id)-1 pad_id = corpus.word2id['<pad>'] label_pad_id = corpus.label2id['<pad>'] if args.comet_track: experiment.log_other("size_vocab", size_vocab) experiment.log_other("n_labels", n_labels) experiment.log_other("size_trainset", len(corpus.train)) experiment.log_other("size_validset", len(corpus.valid)) experiment.log_other("size_testset", len(corpus.test)) # Create NN model and send it to my_device print('Building model...') if args.model == 'vanilla_lstm': model = vanillaLSTM(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, n_labels, args.dropout, pad_id, corpus, no_glove=args.no_glove, freeze=args.freeze, bidirectional=False).to(my_device) elif args.model == 'vanilla_bilstm': model = vanillaLSTM(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, n_labels, args.dropout, pad_id, corpus,
'''Select data set, do smart sampling either rdkit: https://www.rdkit.org/docs/source/rdkit.ML.Cluster.Butina.html https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/ or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter) can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of compounds as target ''' #%% '''import''' from comet_ml import Experiment exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled=False) exp.log_code = True exp.log_other('Hypothesis', '''20% start, 5% iter, all random''') exper_file_name = 'tuned_6' import pickle, sys, os from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker import numpy as np from Iterative_help_funcs_tuned import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier from imblearn.over_sampling import RandomOverSampler #choosing a 3:1 Inactive to Active ratio ros = RandomOverSampler(sampling_strategy=0.33) import pandas as pd from joblib import Parallel, delayed #from joblib.externals.loky import set_loky_pickler from joblib import parallel_backend import tensorflow as tf tf.logging.set_verbosity(tf.logging.ERROR)
https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/ or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter) can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of compounds as target '''
#%%
'''import'''
# NOTE: comet_ml must be imported before any ML framework (tensorflow below)
# for its auto-logging hooks to attach.
from comet_ml import Experiment
# SECURITY NOTE(review): hard-coded Comet API key committed to source — move
# it to an environment variable or untracked config file, and rotate the key.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled=False)
# NOTE(review): sets a plain attribute; code logging is normally requested
# via Experiment(log_code=True) — confirm this has any effect.
exp.log_code = True
# Free-text description of what this run is testing.
exp.log_other(
    'Hypothesis',
    '''JustGCNN, only 3 datasets, 20 epochs, adam lr = 0.05, positve weight = 1'''
)
# Base name used for this run's saved results file.
exper_file_name = 'svm_metricsdict_test'
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
# Project-local helpers: data scaling, per-model training, metric persistence.
from Iterative_help_funcs_pytorchmod import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
#from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF info/warning chatter (TF 1.x logging API).
tf.logging.set_verbosity(tf.logging.ERROR)
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter) can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of compounds as target '''
#%%
'''import'''
# NOTE: comet_ml must be imported before any ML framework (tensorflow below)
# for its auto-logging hooks to attach.
from comet_ml import Experiment
# SECURITY NOTE(review): hard-coded Comet API key committed to source — move
# it to an environment variable or untracked config file, and rotate the key.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled=False)
# NOTE(review): sets a plain attribute; code logging is normally requested
# via Experiment(log_code=True) — confirm this has any effect.
exp.log_code = True
# Free-text description of what this run is testing.
exp.log_other(
    'Hypothesis',
    '''Making following changes. 1) Kept 100 epochs 2) halve the size of the iters after inital screen 3)No Weak Inactives after predicted actives falls below 80% of batch size 4) Diverse selection back on'''
)
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
# Project-local helpers: data scaling, per-model training, metric persistence.
from Iterative_help_funcs import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF info/warning chatter (TF 1.x logging API).
tf.logging.set_verbosity(tf.logging.ERROR)
# Verbose flag toggles DEBUG-level logging.
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
try:
    os.mkdir(args.experiment_path)
except FileExistsError:
    # Directory already present from a previous run — fine.
    pass
try:
    # Comet logging is opt-in via a key file; absence disables it (exp=None).
    with open(f"{os.environ['HOME']}/.comet_key") as f:
        comet_key = f.read().strip()
    exp = Experiment(comet_key, project_name='evo', log_graph=False, auto_metric_logging=False)
    exp.log_other('Notes', args.notes)
except FileNotFoundError:
    exp = None
# TODO: make data loading more modular...
tasks = ['J3/J2']
seqs = load_seqs(tasks)
# 80/10/10-style split of sequences; by_value=False splits by index —
# TODO confirm against make_splits.
splits = make_splits(seqs.index, seqs[tasks], test_frac=0.1, val_frac=0.1, by_value=False)
# Longest sequence string in the index (sequences appear to be the index keys).
max_seq_len = seqs.index.str.len().max()
for split in splits.keys():
    if len(splits[split].inputs):