Ejemplo n.º 1
0
def fit_validate(exp_params, k, data_path, write_path, others=None, custom_tag=''):
    """Fit model and compute metrics on train and validation set. Intended for hyperparameter search.

    Only logs final metrics and scatter plot of final embedding.

    Args:
        exp_params(dict): Parameter dict. Should at least have keys model_name, dataset_name & random_state. Other
        keys are assumed to be model parameters.
        k(int): Fold identifier.
        data_path(str): Data directory.
        write_path(str): Where to write temp files.
        others(dict): Other things to log to Comet experiment.
        custom_tag(str): Custom tag for comet experiment.

    """
    # Set up the Comet experiment and mark it as a validation run.
    experiment = Experiment(parse_args=False)
    experiment.disable_mp()
    custom_tag += '_validate'
    experiment.add_tag(custom_tag)
    experiment.log_parameters(exp_params)

    if others is not None:
        experiment.log_others(others)

    # Unpack the parameter dict into its components.
    model_name, dataset_name, random_state, model_params = parse_params(exp_params)

    # Load the training split, then carve out the validation fold for fold k.
    data_train = getattr(grae.data, dataset_name)(split='train', random_state=random_state, data_path=data_path)
    data_train, data_val = data_train.validation_split(random_state=FOLD_SEEDS[k])

    # Build the model and hand it the paths/data it expects as attributes.
    model = getattr(grae.models, model_name)(random_state=FOLD_SEEDS[k], **model_params)
    model.write_path = write_path
    model.data_val = data_val

    with experiment.train():
        model.fit(data_train)

        # Attach the experiment so the model can log its embedding plot.
        model.comet_exp = experiment
        model.plot(data_train, data_val, title=f'{model_name} : {dataset_name}')

        # Probe the embedding and log train-split metrics.
        prober = EmbeddingProber()
        prober.fit(model=model, dataset=data_train, mse_only=True)
        train_z, train_metrics = prober.score(data_train, is_train=True)
        experiment.log_metrics(train_metrics)

    with experiment.validate():
        # Score and log the held-out validation fold.
        val_z, val_metrics = prober.score(data_val)
        experiment.log_metrics(val_metrics)

    # Marker used downstream to identify runs that completed successfully.
    experiment.log_other('success', 1)
Ejemplo n.º 2
0
    def __init__(self,
                 log_dir,
                 project_name,
                 commit_id,
                 comment=None,
                 disabled=True):
        """Set up a Comet experiment and a per-run model backup directory.

        Args:
            log_dir: Base directory for model backups (``~`` is expanded).
            project_name: Comet project name.
            commit_id: VCS commit identifier, logged as a parameter.
            comment: Optional free-text note logged to the experiment.
            disabled: If True, Comet logging is disabled and no backup
                directory is created.
        """
        # setup comet-ml
        # Fix: Path.read_text() closes the key file; the previous bare
        # open(...).read() leaked the file handle.
        key_path = Path('~/.cometml').expanduser()
        api_key = key_path.read_text().strip()
        experiment = Experiment(api_key, project_name, disabled=disabled)

        experiment.log_parameter('commit_id', commit_id)
        if comment:
            experiment.log_other('comment', comment)

        # setup model backup dir, unique per experiment id
        exp_name = project_name + str(experiment.id)
        log_dir = Path(log_dir).expanduser() / exp_name
        if not log_dir.is_dir() and not disabled:
            log_dir.mkdir(0o755)

        self.log_dir = log_dir
        self.comet = experiment
        self.disabled = disabled
Ejemplo n.º 3
0
    def get_comet_logger(self):
        """Return a Comet experiment: resumed from a stored key when loading, otherwise a new one."""
        if self.paras.load:
            # Resume: the experiment key was persisted by an earlier run.
            with open(Path(self.logdir, 'exp_key'), 'r') as f:
                prev_key = f.read().strip()
            return ExistingExperiment(previous_experiment=prev_key,
                                      project_name=COMET_PROJECT_NAME,
                                      workspace=COMET_WORKSPACE,
                                      auto_output_logging=None,
                                      auto_metric_logging=None,
                                      display_summary=False,
                                      )

        comet_exp = Experiment(project_name=COMET_PROJECT_NAME,
                               workspace=COMET_WORKSPACE,
                               auto_output_logging=None,
                               auto_metric_logging=None,
                               display_summary=False,
                               )
        if self.paras.transfer:
            comet_exp.set_name(self.exp_name)
            comet_exp.add_tag(Path(self.ckpdir).parent.name)
            comet_exp.add_tag('transfer')
            comet_exp.add_tag(self.config['data']['corpus']['metas'][0])
        # NOTE: a transfer run that is not a test run also gets the 'train' tag below.
        if self.paras.test:
            comet_exp.set_name(Path(self.paras.outdir).name)
            comet_exp.add_tag(Path(self.paras.config).parents[2].name)
            comet_exp.add_tag('test')
            comet_exp.add_tag(Path(self.paras.config).parent.stem)
        else:
            comet_exp.add_tag('train')

        # Dict-valued config sections are logged with their section name as
        # prefix; scalar entries are logged directly.
        for name, param in self.config.items():
            if isinstance(param, dict):
                comet_exp.log_parameters(param, prefix=name)
            else:
                comet_exp.log_parameter(name, param)
        comet_exp.log_other('seed', self.paras.seed)

        # Persist the experiment key so a later run with --load can resume it.
        with open(Path(self.logdir, 'exp_key'), 'w') as f:
            f.write(comet_exp.get_key() + '\n')

        return comet_exp
Ejemplo n.º 4
0
def _create_experiment(experiment_name=None):
    # type: (Optional[str]) -> BaseExperiment
    """Create an online Experiment when an API key is configured, else an offline one."""
    LOGGER.debug("Creating new Experiment for MLFlow, implicit? %r", IMPLICIT_START_RUN)

    global PROJECT_NAME

    api_key = get_config("comet.api_key")

    if not api_key:
        # No API key configured: fall back to an offline experiment on disk.
        offline_dir = get_config("comet.offline_directory")

        LOGGER.info(MLFLOW_OFFLINE_EXPERIMENT_FALLBACK)

        if not offline_dir:
            # No configured directory either — use a throwaway temp dir.
            offline_dir = tempfile.mkdtemp()

        LOGGER.debug(
            "Creating an offline Experiment with project name %r and offline directory %r",
            PROJECT_NAME,
            offline_dir,
        )

        experiment = OfflineExperiment(
            project_name=PROJECT_NAME, offline_directory=offline_dir
        )
    else:
        from comet_ml import Experiment

        LOGGER.debug("Creating an online Experiment with project name %r", PROJECT_NAME)

        experiment = Experiment(api_key, project_name=PROJECT_NAME)

    if experiment_name:
        experiment.set_name(experiment_name)

    # Mark the experiment as created implicitly from the MLFlow logger.
    experiment.log_other("Created from", "MLFlow auto-logger")

    return experiment
Ejemplo n.º 5
0
class CometLogger():
    """Thin wrapper around a comet_ml Experiment used to upload HTML reports.

    Degrades to printed notices when comet_ml is missing or misconfigured.
    """

    def __init__(self):
        global comet_installed
        # _logging stays False unless an Experiment is created successfully.
        self._logging = False
        if comet_installed:
            try:
                self._experiment = Experiment(auto_metric_logging=False,
                                              display_summary_level=0)
                self._experiment.log_other("Created from", "sweetviz!")
                self._logging = True
            # Fix: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            except Exception:
                print(
                    "ERROR: comet_ml is installed, but not configured properly (e.g. check API key setup). HTML reports will not be uploaded."
                )

    def log_html(self, html_content):
        """Upload html_content to the experiment, or print a notice if unavailable."""
        if self._logging:
            try:
                self._experiment.log_html(html_content)
            except Exception:  # fix: narrowed from bare `except:`
                print(
                    "comet_ml.log_html(): error occurred during call to log_html()."
                )
        else:
            print(
                "comet_ml.log_html(): comet_ml is not installed or otherwise ready for logging."
            )

    def end(self):
        """Finish the experiment, or print a notice if unavailable."""
        if self._logging:
            try:
                self._experiment.end()
            except Exception:  # fix: narrowed from bare `except:`
                print("comet_ml.end(): error occurred during call to end().")
        else:
            print(
                "comet_ml.end(): comet_ml is not installed or otherwise ready."
            )
Ejemplo n.º 6
0
def comet_DNN(save_path, embedding_type, bin_labels):
    """Train a 4-layer dense network on a pickled activity table, log to Comet.

    Args:
        save_path: Path to a pickle named like AID_xxx_endinfo.pkl holding a
            DataFrame with an 'MFP' fingerprint column, molecular descriptor
            columns (Chi0 .. VSA_EState9) and PUBCHEM_ACTIVITY_OUTCOME labels.
        embedding_type: 'MFPMolChars', 'MFP' or 'MolChars' — feature set used.
        bin_labels: If True, remap labels to binary (class 0 -> 1, rest -> 0).
    """
    from comet_ml import Experiment
    # NOTE(review): hard-coded API key committed to source; move to config/env.
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="DNN_baseline",
                     workspace="gdreiman1")
    # NOTE(review): this overwrites Experiment.log_code with a bool — probably
    # intended as the Experiment(log_code=True) constructor flag; confirm.
    exp.log_code = True
    exp.log_other(
        'Notes',
        'NN_arch same as exp from 7/6 that had good prec/rec, added .1 bias to elu layers, added pi from fl paper, using binary labels'
    )
    import tensorflow as tf
    tf.enable_eager_execution()
    import pickle
    import pandas as pd
    import numpy as np
    import sklearn
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_fscore_support as prf

    from keras import backend as K
    from tensorflow.keras.models import Sequential

    from tensorflow.keras.layers import Dense, Dropout, GaussianNoise
    from tensorflow.keras.layers import Lambda
    from tensorflow.keras.utils import to_categorical

    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from ROC_funcs import single_roc, multi_roc
    '''Comet Saving Zone'''
    def comet_addtional_info(exp, model, save_path, X_test, y_test,
                             embedding_type, model_type):
        # Compute test-set metrics, log them with tags and the ROC figure,
        # then end the Comet experiment.
        from tensorflow.keras.utils import to_categorical
        NN_test_preds = model.predict(X_test)
        class_rep = sklearn.metrics.classification_report(
            y_test, np.argmax(NN_test_preds, axis=1))

        #print(class_rep)
        if len(set(y_test)) == 2:
            try:
                prec, rec, f_1, supp = prf(y_test,
                                           np.argmax(NN_test_preds, axis=1),
                                           average=None)
                single_roc(NN_test_preds, y_test)
            # Fix: was a bare `except:` that also swallowed SystemExit and
            # KeyboardInterrupt. NOTE(review): if prf itself raises, prec/rec/
            # f_1 stay undefined and the logging loops below will NameError.
            except Exception:
                pass

        else:

            try:
                prec, rec, f_1, supp = prf(y_test,
                                           np.argmax(NN_test_preds, axis=1),
                                           average=None)
                multi_roc(NN_test_preds, to_categorical(y_test), '_',
                          len(set(y_test)))
            except Exception:  # fix: narrowed from bare `except:`
                pass
        #get AID number
        import ntpath
        #get base file name
        folder, base = ntpath.split(save_path)
        #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        #save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        #save model params
        #exp.log_parameters(trained_mod.get_params())
        #save metrics report to comet
        if len(set(y_test)) == 2:
            for i, name in enumerate(['Active', 'Inactive']):
                exp.log_metric('f1 class ' + name, f_1[i])
                exp.log_metric('Recall class' + name, rec[i])
                exp.log_metric('Precision class' + name, prec[i])
        else:
            for i, name in enumerate(['Active', 'Inconclusive', 'Inactive']):
                exp.log_metric('f1 class ' + str(i), f_1[i])
                exp.log_metric('Recall class' + str(i), rec[i])
                exp.log_metric('Precision class' + str(i), prec[i])
            #exp.log_metric('f1 class '+str(i), f_1[i])
            #exp.log_metric('Recall class'+str(i),rec[i])
            #exp.log_metric('Precision class'+str(i), prec[i])
        exp.log_other('Classification Report', class_rep)
        #save model in data_folder with comet experiement number associated
        #        exp_num = exp.get_key()
        #        model_save = folder+'\\'+model_type+'_'+exp_num+'.pkl'
        #        pickle_on = open(model_save,'wb')
        #        pickle.dump(fast_NN,pickle_on)
        #        pickle_on.close()
        #        #log trained model location
        #        exp.log_other('Trained Model Path',model_save)
        #save some informatvie tags:
        if bin_labels == True:
            label_status = 'binary'
        else:
            label_status = 'multiple'

        tags = [AID, end_info, model_type, label_status]
        exp.add_tags(tags)
        exp.add_tag('4_layer')
        exp.add_tag(embedding_type)
        #save ROC curve
        exp.log_figure(figure_name='ROC-Pres/Recall', figure=plt)
        plt.show()
        exp.end()

    model_type = 'DNN'
    #get data cleaned
    pickle_off = open(save_path, 'rb')
    activity_table = pickle.load(pickle_off)
    pickle_off.close()
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])
    #simple neural net

    scaler = StandardScaler(copy=False)
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data:
    from sklearn.model_selection import StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=0.2,
                                      train_size=None,
                                      random_state=2562)
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    for train_ind, test_ind in splitter.split(X_mfp, labels):
        #get start and end index for molchars
        MC_start = activity_table.columns.get_loc('Chi0')
        #need to add 1 bc exclusive indexing
        MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
        # standardize data
        X_train_molchars_std = scaler.fit_transform(
            np.array(activity_table.iloc[train_ind, MC_start:MC_end]))
        X_test_molchars_std = scaler.transform(
            np.array(activity_table.iloc[test_ind, MC_start:MC_end]))
        if embedding_type == 'MFPMolChars':
            X_train = np.concatenate(
                (X_mfp[train_ind, :], X_train_molchars_std), axis=1)
            X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std),
                                    axis=1)
        elif embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        elif embedding_type == 'MolChars':
            X_train = X_train_molchars_std
            X_test = X_test_molchars_std
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,labels,test_size = .5, shuffle = True, stratify = labels, random_state = 2562)
        #remapping active to 1 and everything else to zero
        bin_y_train, bin_y_test = np.array([
            1 if x == 0 else 0 for x in y_train
        ]), np.array([1 if x == 0 else 0 for x in y_test])
        if bin_labels == True:
            y_test = bin_y_test
            y_train = bin_y_train

    #from https://towardsdatascience.com/handling-imbalanced-datasets-in-deep-learning-f48407a0e758
    def focal_loss(y_true, y_pred):
        # Focal loss with fixed gamma/alpha to down-weight easy examples.
        gamma = 2.0
        alpha = 2
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        #        pt_1 = K.clip(pt_1, 1e-3, .999)
        #        pt_0 = K.clip(pt_0, 1e-3, .999)

        return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum(
            (1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))

    #bias for predictions: prior pi from the focal-loss paper
    fl_pi = 0.01
    final_bias = -np.log((1 - fl_pi) / fl_pi)
    num_labels = len(set(y_test))
    tf.keras.backend.clear_session()
    fast_NN = Sequential(name='quick')
    #fast_NN.add(GaussianNoise(.5))
    fast_NN.add(Dense(512, activation='sigmoid', name='input'))
    #fast_NN.add(Dropout(0.5))
    fast_NN.add(
        Dense(128,
              activation='relu',
              name='first',
              bias_initializer=tf.keras.initializers.Constant(value=0.1)))
    #fast_NN.add(Dropout(0.5))
    fast_NN.add(
        Dense(64,
              activation='relu',
              name='second',
              bias_initializer=tf.keras.initializers.Constant(value=0.1)))
    #fast_NN.add(Dropout(0.5))
    fast_NN.add(
        Dense(16,
              activation='relu',
              name='third',
              bias_initializer=tf.keras.initializers.Constant(value=0.1)))
    #fast_NN.add(Dropout(0.25))
    fast_NN.add(
        Dense(
            num_labels,
            activation='softmax',
            name='predict',
            bias_initializer=tf.keras.initializers.Constant(value=final_bias)))
    fast_NN.compile(loss=[focal_loss],
                    optimizer='adam',
                    metrics=[
                        'categorical_accuracy',
                        tf.keras.metrics.Recall(),
                        tf.keras.metrics.Precision()
                    ])
    fast_NN_model = fast_NN.fit(X_train,
                                to_categorical(y_train),
                                validation_data=(X_test,
                                                 to_categorical(y_test)),
                                epochs=5,
                                batch_size=500,
                                shuffle=True,
                                verbose=0)
    comet_addtional_info(exp, fast_NN, save_path, X_test, y_test,
                         embedding_type, model_type)
Ejemplo n.º 7
0
# Parse command-line options defined on `parser` earlier in the file.
args = parser.parse_args()

# Ideas
# Pretrain network without permuted convolutions. Then train it using permuted/shuffled convolutions
################################################
# Hand-edited per-run configuration knobs.
num_channels_permuted = "5, 10"
# model_name = "DenseNet_reduced_1x1_regularized_conv1-2"
# model_name = "small_CNN_1x1_3x3_no_bias_LBFGS"
model_name = "PermSmallCNN_SGD_LR_0.0001_LRS_no_bias"
gpu_id = 3
reg_lambda = 5e-3  # NOTE(review): not used in this visible span — confirm it is consumed later
################################################

# Annotate the (externally created) Comet `experiment` with run metadata.
experiment.add_tag(model_name)
experiment.add_tag(num_channels_permuted)
experiment.log_other("Network", model_name)
experiment.log_other("Dataset", "CIFAR-100")
experiment.log_other("Type", model_name)
# experiment.log_other("Regularizer", reg_lambda)

# Pick the requested GPU when CUDA is available, otherwise fall back to CPU.
device = 'cuda:' + str(gpu_id) if torch.cuda.is_available() else 'cpu'
# device = 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch
train_batch_size = 250
test_batch_size = 250

# Data
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
Ejemplo n.º 8
0
                             


# Persist ground-truth labels for later analysis.
to_file(y_train, "true_train", y_train)
# NOTE(review): third argument is y_train here as well — possibly intended to
# be y_dev; confirm against to_file's signature.
to_file(y_dev, "true_dev", y_train)

print('saving model')
model.save(os.path.join(DATADIR,'level1_'+MODEL_NAME))

# experiment.log_figure(figure_name='dev_support', figure=train_support)
# experiment.log_figure(figure_name='train_support', figure=dev_support)

print('logging experiment parameters')
# Hyper-parameters plus data-provenance settings, logged to Comet in one call.
params={
    "max_sequence_length":MAX_SEQUENCE_LENGTH,
    "embedding_dim":EMBEDDING_DIM,
    "p_threshold":P_THRESHOLD,
    "pos_ratio":POS_RATIO,
    "num_words":NUM_WORDS,
    "datadir":DATADIR,
    "metadata":os.getenv('METADATA_LIST'),
    "Data_since":os.getenv('SINCE_THRESHOLD')
        }

# log_multiple_params is the legacy comet_ml API name (newer SDKs use log_parameters).
experiment.log_multiple_params(params)

experiment.log_other("datadir", DATADIR)
experiment.log_other("metadata", os.getenv('METADATA_LIST'))
experiment.log_other("data_since", os.getenv('SINCE_THRESHOLD'))
# Hash of the training inputs so runs can be matched to the exact dataset.
experiment.log_dataset_hash(x_train)
Ejemplo n.º 9
0
def comet_lgbm(save_path):
    """Train a LightGBM classifier on a pickled activity table and log results to Comet.

    Args:
        save_path: Path to a pickle named like AID_xxx_endinfo.pkl holding a
            DataFrame with an 'MFP' fingerprint column, molecular descriptor
            columns (from column index 4 on) and PUBCHEM_ACTIVITY_OUTCOME labels.
    """
    from comet_ml import Experiment
    # NOTE(review): hard-coded API key committed to source; move to config/env.
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                            project_name="baseline", workspace="gdreiman1")
    # NOTE(review): this overwrites Experiment.log_code with a bool — probably
    # meant as the Experiment(log_code=True) constructor flag; confirm.
    exp.log_code = True

    import pickle
    import pandas as pd
    import lightgbm as lgb
    import numpy as np
    import sklearn
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_fscore_support as prf
    #%%
    def single_roc(y_preds,y_true):
        # Plot ROC and precision/recall curves for a binary problem.
        from sklearn.metrics import roc_curve, auc,precision_recall_curve
        fpr, tpr, _ = roc_curve(y_true, y_preds)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        lw = 2
        plt.plot(fpr, tpr, color='darkorange',
                 lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')

        precision, recall, thresholds = precision_recall_curve(y_true, y_preds)
        plt.plot(recall, precision, color='blue',
                 lw=lw, label='Precision vs Recall')
        # show the plot
        plt.legend(loc="lower right")
        plt.show()
    def multi_roc(y_preds,y_true,name,n_classes):
        # Plot per-class, micro- and macro-averaged ROC curves.
        # y_true is expected one-hot encoded (shape: samples x n_classes).
        import collections
        nested_dict = lambda: collections.defaultdict(nested_dict)
        data_store = nested_dict()
        from sklearn.metrics import roc_curve, auc
        from scipy import interp
        from itertools import cycle
        lw = 2
        name_store = ['Active', 'Inactive', 'Inconclusive']
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_preds[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area.
        # Fix: flatten ALL classes. Previously this used y_true[:, i] with the
        # stale loop index i, which just recomputed the last class's curve.
        fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_preds.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # Compute macro-average ROC curve and ROC area

        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])

        # Finally average it and compute AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        plt.figure()
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["micro"]),
                 color='deeppink', linestyle=':', linewidth=4)

        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]),
                 color='navy', linestyle=':', linewidth=4)

        colors = cycle(['aqua', 'darkorange', 'cornflowerblue','green'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                     label='ROC curve of '+ name_store[i]+'(area = {1:0.2f})'
                     ''.format(i, roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--', lw=lw)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        #plt.title('Multi-class ROC for '+name+' Split= '+str(count+1))
        plt.title('Multi-class ROC for '+name)

        plt.legend(loc="lower right")
        #plt.show()
    #%%
    #save_path = r'C:\Users\gdrei\Dropbox\UCL\Thesis\May_13\AID_1345083_processed.pkl'
    model_type = 'lgbm'
    #get data cleaned
    pickle_off = open(save_path,'rb')
    activity_table=pickle.load(pickle_off)
    pickle_off.close()
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])


    from sklearn.preprocessing import StandardScaler, LabelEncoder
    scaler = StandardScaler(copy = False)
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data:
    from sklearn.model_selection import StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=None, random_state=2562)
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1,fp_length))
    for train_ind, test_ind in splitter.split(X_mfp,labels):
        # standardize data
        X_train_molchars_std = scaler.fit_transform(np.array(activity_table.iloc[train_ind,4:]))
        X_test_molchars_std = scaler.transform(np.array(activity_table.iloc[test_ind,4:]))
        X_train = np.concatenate((X_mfp[train_ind,:],X_train_molchars_std),axis = 1)
        X_test = np.concatenate((X_mfp[test_ind,:],X_test_molchars_std),axis = 1)
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,labels,test_size = .5, shuffle = True, stratify = labels, random_state = 2562)
        # NOTE(review): bin_y_train/bin_y_test are computed but never used below.
        bin_y_train, bin_y_test = [1 if x ==2 else x for x in y_train],[1 if x ==2 else x for x in y_test]

    #do light gbm

    #need to make a lib svm file
    train_data = lgb.Dataset(X_train,label=y_train)
    test_data = lgb.Dataset(X_test,label=y_test)
    #make model class
    lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=500, subsample_for_bin=200000, 
                                    objective='binary', is_unbalance=True, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, 
                                    subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True, 
                                    importance_type='split')
    #train model
    trained_mod = lgbm_model.fit(X_train,y_train)
    #predict classes and class_probs
    test_class_preds = lgbm_model.predict(X_test)
    test_prob_preds = lgbm_model.predict_proba(X_test)
    #calculate Class report
    class_rep = sklearn.metrics.classification_report(y_test,test_class_preds)

    print(class_rep)
    if len(set(y_test)) == 2:
        single_roc(test_prob_preds[:,1],y_test)
        prec,rec,f_1,supp = prf(y_test, test_class_preds, average=None)
    else:
        from tensorflow.keras.utils import to_categorical
        multi_roc(test_prob_preds,to_categorical(y_test),'',3)
        prec,rec,f_1,supp = prf(y_test, test_class_preds, average=None)


     #%% 
    '''Comet Saving Zone'''
    #get AID number
    import ntpath
    #get base file name
    folder,base = ntpath.split(save_path)
    #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
    AID, _,end_info = base.rpartition('_')
    #save data location, AID info, and version info
    exp.log_dataset_info(name = AID, version = end_info, path = save_path)
    #save model params
    exp.log_parameters(trained_mod.get_params())
    #save metrics report to comet
    if len(f_1) == 2:
        for i,name in enumerate(['Active','Inactive']):
            exp.log_metric('f1 class '+name, f_1[i])
            exp.log_metric('Recall class'+name,rec[i])
            exp.log_metric('Precision class'+name, prec[i])
    else:
        for i,name in enumerate(['Active','Inconclusive','Inactive']):
            exp.log_metric('f1 class '+str(i), f_1[i])
            exp.log_metric('Recall class'+str(i),rec[i])
            exp.log_metric('Precision class'+str(i), prec[i])
        #exp.log_metric('f1 class '+str(i), f_1[i])
        #exp.log_metric('Recall class'+str(i),rec[i])
        #exp.log_metric('Precision class'+str(i), prec[i])
    exp.log_other('Classification Report',class_rep)
     #save model in data_folder with comet experiement number associated
    exp_num = exp.get_key()
    model_save = folder+'\\'+model_type+'_'+exp_num+'.pkl'
    pickle_on = open(model_save,'wb')
    pickle.dump(trained_mod,pickle_on)
    pickle_on.close()
    #log trained model location
    exp.log_other('Trained Model Path',model_save)
    #save some informatvie tags:
    tags = [AID,end_info,model_type]
    exp.add_tags(tags)
    #save ROC curve
    exp.log_figure(figure_name = 'ROC-Pres/Recall',figure=plt)
    plt.show()

    #tell comet that the experiement is over
    exp.end()
Ejemplo n.º 10
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep  8 20:47:11 2019

@author: gabriel
"""
'''Make the final graph: all hyper parameters tuned'''
from comet_ml import Experiment
# NOTE(review): hard-coded API key committed to source — move to config/env.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_plotting",
                 workspace="gdreiman1",
                 disabled=False)
# NOTE(review): this assigns over Experiment.log_code; probably meant the
# Experiment(log_code=True) constructor flag — confirm.
exp.log_code = True
exp.log_other(
    'Hypothesis',
    '''These are my plots combining the GCNN 100 epoch random run with other classifiers also with random selection and 5% '''
)
import pickle
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Input pickles: three tuned-SVM result splits stored in a local data directory.
data_dir = '/home/gabriel/Dropbox/UCL/Thesis/Data'
first8 = 'tuned_7_svmmod0.pkl'
second8 = 'tuned_7_svmmod1.pkl'
third8 = 'tuned_7_svmmod2.pkl'
from iter_plot_help_funcs import find_active_percents, plot_metrics, plot_prec_rec_curve, plot_prec_rec_vs_tresh, get_checkpoint35, set_sns_pal


def get_35_tune(pathlist, sizes, expr_num):
def very_simple_param_count(model):
    """Total number of elements across all of *model*'s parameters."""
    return sum(p.numel() for p in model.parameters())


if __name__ == "__main__":
    experiment = Experiment(project_name=PROJECT_NAME, api_key=COMET_ML_API)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    args = my_args_parse()
    print(args)

    experiment.set_name(args.task_id)
    experiment.set_cmd_args()
    experiment.log_other("note", args.note)

    DATA_PATH = args.input
    TRAIN_PATH = os.path.join(DATA_PATH, "train_data")
    TEST_PATH = os.path.join(DATA_PATH, "test_data")
    dataset_name = args.datasetname
    if dataset_name == "shanghaitech":
        print("will use shanghaitech dataset with crop ")
    elif dataset_name == "shanghaitech_keepfull":
        print("will use shanghaitech_keepfull")
    else:
        print("cannot detect dataset_name")
        print("current dataset_name is ", dataset_name)

    # create list
    train_list = create_image_list(TRAIN_PATH)
Ejemplo n.º 12
0
def run_cycle(args, metadata, train_dataloader, train_dataset,
              validation_dataloader, test_dataloader, experiment_path, run_id):
    """Train, validate and test one run of the Entity Network.

    Builds the model from ``args``/``metadata``, trains for ``args.epochs``
    epochs validating after each one, restores the checkpoint with the best
    validation loss and evaluates it on the test set.

    Args:
        args: Parsed command-line namespace with model/optimizer settings.
        metadata: Dict with dataset statistics (vocab sizes, max sentence and
            query lengths, neg/pos ratio of supporting facts).
        train_dataloader: Batch iterator for the training split.
        train_dataset: Training dataset; used only to translate one sample
            story back to text for printing.
        validation_dataloader: Batch iterator for the validation split.
        test_dataloader: Batch iterator for the test split.
        experiment_path: Directory for TensorBoard summaries and checkpoints.
        run_id: Integer identifier of this run, used in writer/checkpoint
            names and Comet logging.

    Returns:
        Tuple ``(test_loss, test_qa_loss, test_supp_facts_loss,
        test_accuracy, test_f1, best_epoch)``.
    """
    # Comet: the experiment (and even the import) only exists when logging is
    # enabled; every later use of `experiment` is guarded by
    # `args.comet_logging`, so the name is never evaluated when undefined.
    if args.comet_logging:
        from comet_ml import Experiment
        experiment = Experiment(api_key="GKIWhJ0lS0N674H48YQVMVNgV",
                                project_name="thesis",
                                workspace="rpalma")
        experiment.log_parameters(vars(args))
        experiment.log_other("run_id", run_id)

    # Build the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device, "\n")
    entnet = EntityNetwork(
        embeddings_size=args.embeddings_size,
        vocab_size=metadata["vocab_size"],
        answers_vocab_size=metadata["answers_vocab_size"],
        sentences_length=metadata["max_sentence_length"],
        queries_length=metadata["max_query_length"],
        n_blocks=args.n_blocks,
        output_module=args.output_module,
        output_inner_size=args.output_inner_size,
        temporal_attention_to_sentence=args.temporal_attention_to_sentence,
        temporal_activation=args.temporal_activation,
        temporal_attention=args.temporal_attention,
        dropout_prob=args.dropout_prob,
        temporal_attention_module=args.temporal_attention_module,
        device=device)
    entnet.to(device)
    print("Trainable parameters:",
          sum(p.numel() for p in entnet.parameters() if p.requires_grad), "\n")
    print("Output module:", entnet.output_module, "\n")

    # Set up the loss and optimizer
    qa_criterion = nn.CrossEntropyLoss()
    # pos_weight compensates the class imbalance of supporting facts
    # (many negatives per positive).
    supp_facts_criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.Tensor([metadata["neg_pos_ratio"]]).to(device))
    optimizer = torch.optim.Adam(entnet.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # Both schedulers are built; args.lr_scheduler selects which one is used.
    schedulers = {
        "step":
        torch.optim.lr_scheduler.StepLR(optimizer,
                                        step_size=args.decay_period,
                                        gamma=args.decay_rate),
        "cyclical":
        CyclicalLRScheduler(optimizer,
                            min_lr=args.min_lr,
                            max_lr=args.max_lr,
                            cycle_length=args.cycle_period)
    }
    scheduler = schedulers[args.lr_scheduler]
    print("Scheduler:", scheduler, "\n")
    optimizer.zero_grad()

    # Build the writers (one TensorBoard writer per split, per run)
    train_writer = SummaryWriter(
        os.path.join(experiment_path, "train-%d" % run_id))
    val_writer = SummaryWriter(
        os.path.join(experiment_path, "validation-%d" % run_id))
    test_writer = SummaryWriter(
        os.path.join(experiment_path, "test-%d" % run_id))

    def run_epoch(dataloader, should_train, should_teach_force,
                  should_teach_force_answer, summaries_writer, experiment,
                  experiment_context, epoch, quiet):
        """Run one pass over `dataloader`, optionally training.

        Teacher-forcing flags feed the gold supporting facts / answers back
        into the network. Logs to TensorBoard and (optionally) Comet, and
        returns the epoch's mean losses, QA accuracy and supp.-facts F1.
        """
        losses = []
        qa_losses = []
        qa_targets = []
        qa_predictions = []
        supp_facts_losses = []
        supp_facts_targets = []
        supp_facts_predictions = []
        # train(mode=False) puts the model in eval mode for val/test passes.
        entnet.train(mode=should_train)
        for batch in tqdm(dataloader) if not quiet else dataloader:
            story = batch["story"].to(device)
            query = batch["query"].to(device)
            qa_target = batch["answer"].to(device)
            supp_facts_target = batch["supporting"].float().to(device)
            story_mask = batch["story_mask"].float().to(device)
            qa_predicted, supp_facts_alignment, supp_facts_attention = entnet(
                story,
                story_mask,
                query,
                supporting_facts=supp_facts_target
                if should_teach_force else None,
                answers=qa_target if should_teach_force_answer else None)
            qa_loss = qa_criterion(qa_predicted, qa_target)
            supp_facts_loss = supp_facts_criterion(supp_facts_alignment,
                                                   supp_facts_target)
            # Total loss is a weighted sum of the QA and supp.-facts terms.
            loss = args.qa_lambda * qa_loss + args.supporting_facts_lambda * supp_facts_loss
            if should_train:
                loss.backward()
                nn.utils.clip_grad_norm_(entnet.parameters(),
                                         args.gradient_clipping)
                optimizer.step()
                optimizer.zero_grad()
            losses.append(loss.item())
            qa_losses.append(qa_loss.item())
            qa_targets.append(qa_target.tolist())
            qa_predictions.append(qa_predicted.argmax(dim=1).tolist())
            supp_facts_losses.append(supp_facts_loss.item())
            supp_facts_targets.append(supp_facts_target.tolist())
            supp_facts_predictions.append(supp_facts_attention.tolist())

        # After a training pass, print one decoded sample (last batch's last
        # item) for a quick qualitative check.
        if should_train:
            translated_story, translated_query, translated_answer = train_dataset.translate_story(
                story[-1], query[-1], qa_target[-1])
            print("\nSTORY:", translated_story)
            print("QUERY:", translated_query)
            print("ANSWER:", translated_answer)
            print("\nSupporting facts:", supp_facts_target[-1, :])
            print("Attended:", supp_facts_attention[-1, :], "\n")

        mean_loss = np.mean(losses)
        mean_qa_loss = np.mean(qa_losses)
        mean_supp_facts_loss = np.mean(supp_facts_losses)
        mean_qa_accuracy = accuracy(qa_targets, qa_predictions)
        mean_supp_facts_f1 = f1(supp_facts_targets, supp_facts_predictions)

        # Write summaries (TensorBoard)
        write_summaries(mean_loss, mean_qa_loss, mean_supp_facts_loss,
                        mean_qa_accuracy, mean_supp_facts_f1,
                        supp_facts_targets, supp_facts_predictions,
                        entnet.named_parameters(), summaries_writer, epoch)

        # Comet logging under the proper context (train/validate/test).
        if experiment is not None:
            with experiment_context():
                metrics = {
                    "loss": mean_loss,
                    "qa_loss": mean_qa_loss,
                    "supp_facts_loss": mean_supp_facts_loss,
                    "qa_accuracy": mean_qa_accuracy,
                    "supp_facts_f1": mean_supp_facts_f1
                }
                experiment.log_metrics(metrics, step=epoch)
                experiment.log_epoch_end(args.epochs, step=epoch)

        return mean_loss, mean_qa_loss, mean_supp_facts_loss, mean_qa_accuracy, mean_supp_facts_f1

    # Main training loop: track the epoch with the lowest validation loss.
    best_val_loss = float("inf")
    best_epoch = 0
    for epoch in range(1, args.epochs + 1):
        # Training epoch
        train_loss, train_qa_loss, train_supp_facts_loss, train_qa_accuracy, train_supp_facts_f1 = run_epoch(
            train_dataloader,
            should_train=True,
            should_teach_force=args.teach_force_training,
            should_teach_force_answer=args.teach_force_answer_training,
            summaries_writer=train_writer,
            experiment=experiment if args.comet_logging else None,
            experiment_context=experiment.train
            if args.comet_logging else None,
            epoch=epoch,
            quiet=False)
        print(
            "Epoch = %d.%d; task_id = %d\n\ttrain QA accuracy = %.5f; train QA error = %.5f; train supp. facts F1 = %.5f; train loss = %.5f; train QA loss = %.5f; train supp. facts loss = %.5f"
            % (run_id, epoch, args.task_id, train_qa_accuracy,
               1 - train_qa_accuracy, train_supp_facts_f1, train_loss,
               train_qa_loss, train_supp_facts_loss))

        # Validation (no gradients needed)
        with torch.no_grad():
            val_loss, _, _, val_accuracy, val_f1 = run_epoch(
                validation_dataloader,
                should_train=False,
                should_teach_force=args.teach_force_evaluation,
                should_teach_force_answer=args.teach_force_answer_evaluation,
                summaries_writer=val_writer,
                experiment=experiment if args.comet_logging else None,
                experiment_context=experiment.validate
                if args.comet_logging else None,
                epoch=epoch,
                quiet=True)
        print(
            "\tval QA accuracy = %.5f;  val QA error = %.5f;  val loss = %.8f;  val F1 = %.8f"
            % (val_accuracy, 1 - val_accuracy, val_loss, val_f1), "\n")
        # A checkpoint is saved every epoch so the best one can be reloaded.
        save_model(entnet.state_dict(), experiment_path, run_id, epoch)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch

        # Update learning rate
        scheduler.step()

    # Model evaluation: reload the best-validation checkpoint and test it.
    entnet.load_state_dict(load_model(experiment_path, run_id, best_epoch))
    with torch.no_grad():
        test_loss, test_qa_loss, test_supp_facts_loss, test_accuracy, test_f1 = run_epoch(
            test_dataloader,
            should_train=False,
            should_teach_force=args.teach_force_evaluation,
            should_teach_force_answer=args.teach_force_answer_evaluation,
            summaries_writer=test_writer,
            experiment=experiment if args.comet_logging else None,
            experiment_context=experiment.test if args.comet_logging else None,
            epoch=best_epoch,
            quiet=False)
    print(
        "Epoch = %d.%d\n\ttest accuracy = %.5f;  test error = %.5f;  test loss = %.8f;  test F1 = %.8f"
        % (run_id, best_epoch, test_accuracy, 1 - test_accuracy, test_loss,
           test_f1), "\n")

    return test_loss, test_qa_loss, test_supp_facts_loss, test_accuracy, test_f1, best_epoch
Ejemplo n.º 13
0
def victim(kwargs=None):
    """Train victim models on poisoned inputs pulled from a Comet experiment.

    For each trial and each crafting step, downloads the matching poison
    asset from the crafting experiment, injects it into the training data and
    trains a fresh victim network, logging metrics to a new Comet experiment.

    Args:
        kwargs: Optional dict; each key/value is injected into module
            ``globals()``, and any ``argsmod`` entries are applied onto
            ``args``.

    NOTE(review): this function depends heavily on module-level globals
    (args, api, cometconfig, localrank, meta, sess, xtrain, ytrain, xvalid,
    yvalid, ...) that must exist before it is called — none are visible here.
    """

    def comet_pull_poison(craftstep):
        """Download and unpickle the poison inputs for `craftstep`.

        Retries up to 5 times with a 5s sleep between attempts; implicitly
        returns None if every attempt fails.
        SECURITY NOTE(review): pickle.loads on a downloaded asset executes
        arbitrary code if the asset is not trusted.
        """
        for attempt in range(5):
            try:
                bytefile = craftexpt.get_asset(assets[craftstep])
                if localrank == 0: print('==> poisoninputs-{} pulled'.format(craftstep))
                poisoninputs = pickle.loads(bytefile)
                return poisoninputs[:args.npoison]
            # Bare except: any failure (network, missing asset, unpickling)
            # is logged and retried; consider narrowing to Exception.
            except:
                print(f'WARNING: comet pull attempt for craftstep {craftstep} failed on attempt {attempt}')
                sleep(5)

    if kwargs is not None:
        # Inject caller overrides into module globals and the args namespace.
        for key in kwargs: globals()[key] = kwargs[key]
        for key in argsmod: setattr(args, key, argsmod[key])

    # Locate the crafting experiment and map craftstep -> poison asset id.
    craftexpt = api.get_experiment(cometconfig["workspace"], args.craftproj, args.craftkey)
    assets = {asset['step']: asset['assetId'] for asset in craftexpt.get_asset_list() if 'poisoninputs-' in asset['fileName']}
    print('==> begin victim train')
    trial = 0
    # ntrial=None means: keep running trials indefinitely.
    while args.ntrial is None or trial < args.ntrial:
        for craftstep in args.craftsteps:
            # One fresh Comet experiment per (trial, craftstep) training run.
            experiment = Experiment(project_name=args.victimproj, auto_param_logging=False, auto_metric_logging=False, parse_args=False)
            experiment.log_parameters(vars(args))
            experiment.set_name(f'{args.craftkey[:5]}-{experiment.get_key()[:5]}')
            experiment.add_tag(args.tag)
            # experiment.add_tag(args.Xtag)
            experiment.log_parameters(dict(craftstep=craftstep, trial=trial))
            experiment.log_other('crafturl', craftexpt.url)
            experiment.log_other('command', 'python ' + ' '.join(sys.argv))
            if localrank == 0: print_command_and_args(args); print('crafturl: ' + craftexpt.url)

            # Only inject poisons when invoked as the victim script itself.
            if 'victim.py' in sys.argv[0]:
                poisoninputs = comet_pull_poison(craftstep)
                # None means all 5 pull attempts failed: skip this craftstep.
                if poisoninputs is None: experiment.end(); print(f'skipping craftstep {craftstep}'); continue
                if args.savepoisondataset: package_poisoned_dataset(poisoninputs, xtrain, ytrain, xtarget, ytarget, ytargetadv, xvalid, yvalid, args, craftstep); experiment.end(); continue
                # meta.init_weights(sess, pretrain_weights) # what we had before
                meta.global_initialize(args, sess)
                meta.poisoninputs.load(poisoninputs, sess)
            trainstep = 0
            for epoch in range(args.nvictimepoch):
                tic = time()
                lrnrate = lr_schedule(args.lrnrate, epoch, args.warmupperiod, args.schedule)

                # log hidden layer features (last epoch only)
                if args.logfeat and epoch == args.nvictimepoch - 1:
                    feats = []
                    for victimfeed in feeddict_generator(xtrain, ytrain, lrnrate, meta, args, victim=True):
                        hiddens = sess.run(meta.hiddens, victimfeed)
                        for i, hidden in enumerate(hiddens):
                            if len(feats) <= i: feats.append(defaultdict(list))
                            feat = np.reshape(hidden, [-1, np.prod(hidden.shape[1:])])
                            appendfeats(feats[i], feat, victimfeed, ybase, ytarget, args.batchsize)
                    for i, feats_layer in enumerate(feats): comet_log_asset(experiment, f'feats_layer{i}', feats_layer, step=epoch)

                # log validation acc at quadratically-spaced epochs
                if epoch in np.round((args.nvictimepoch - 1) * np.linspace(0, 1, args.nvalidpoints) ** 2):
                    resVs = []  # validation
                    for _, validfeed, _ in feeddict_generator(xvalid, yvalid, lrnrate, meta, args, valid=True):
                        resV = sess.run(meta.resultV, validfeed)
                        resVs.append(resV)
                    experiment.log_metrics(avg_n_dicts(resVs), step=trainstep)

                # train one epoch (metrics logged every 200 steps)
                for victimfeed in feeddict_generator(xtrain, ytrain, lrnrate, meta, args, victim=True):
                    _, resL = sess.run([meta.trainop, meta.resultL,], victimfeed)
                    if not trainstep % 200: experiment.log_metrics(resL, step=trainstep)
                    trainstep += 1
                    
                experiment.log_metric('elapsed', time() - tic, step=trainstep)
                if args.saveweights: comet_log_asset_weights_and_buffers(epoch, experiment, meta, sess)
                # Console progress line every 20 epochs on rank 0 only.
                if not epoch % 20 and localrank == 0:
                    print(' | '.join([f'{args.craftkey[:5]}-{args.tag} | trial-{trial} | craftstep-{craftstep} | epoch {epoch} | elapsed {round(time() - tic, 2)}'] +
                                     [f'{key} {trunc_decimal(val)}' for key, val in resL.items() if 'class' not in key] +
                                     [f'{key} {trunc_decimal(val)}' for key, val in resV.items() if 'class' not in key]))
            experiment.end()
        trial += 1
https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter)

can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of 
compounds as target
'''
#%%
'''import'''
from comet_ml import Experiment
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_baseline",
                 workspace="gdreiman1",
                 disabled=False)
exp.log_code = True
exp.log_other(
    'Hypothesis',
    '''15% start, 5% iter, all random, svm hinge loss and sigmoidinstead of calibrated cv'''
)
exper_file_name = 'tuned_3_svmmod_sigmoid'
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
from Iterative_help_funcs_tuned import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
#from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
Ejemplo n.º 15
0
    model_name = experiment_params['model_name']
    training = experiment_params['training']
    testing = experiment_params['testing']
    save_model = experiment_params['save_model']
    load_model = experiment_params['load_model']
    init_ckpt_file = experiment_params['init_ckpt_file']

    # Set up comet experiment
    # experiment = Experiment(project_name="sentence-encoding-for-da", workspace="nathanduran", auto_output_logging='simple')
    experiment = Experiment(auto_output_logging='simple', disabled=True)
    experiment.set_name(experiment_name)
    # Log parameters
    experiment.log_parameters(model_params)
    experiment.log_parameters(experiment_params)
    for key, value in experiment_params.items():
        experiment.log_other(key, value)

    # Data set and output paths
    dataset_name = 'token_dataset' if experiment_params[
        'to_tokens'] else 'text_dataset'
    dataset_dir = os.path.join(task_name, dataset_name)
    output_dir = os.path.join(task_name, experiment_name)
    checkpoint_dir = os.path.join(output_dir, 'checkpoints')
    embeddings_dir = 'embeddings'

    # Create appropriate directories if they don't exist
    for directory in [
            task_name, dataset_dir, output_dir, checkpoint_dir, embeddings_dir
    ]:
        if not os.path.exists(directory):
            os.mkdir(directory)
Ejemplo n.º 16
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep  6 17:40:01 2019

@author: gabriel
"""
'''Plotting Iter_7'''
from comet_ml import Experiment

# NOTE(review): hard-coded Comet API key committed to source — prefer the
# COMET_API_KEY environment variable or a .comet.config file.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_plotting",
                 workspace="gdreiman1",
                 disabled=False)
exp.log_code = True
exp.log_other('Hypothesis',
              '''These are my plots from the intial iterations Iter_7 ''')
import pickle
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the pickled GCNN iteration results to be plotted.
data_dir = '/home/gabriel/Dropbox/UCL/Thesis/Data'
gcnn_initial = 'second_diverse_GCNN_50epoch_iter_run.pkl'
save_path = os.path.join(data_dir, gcnn_initial)
# Context manager guarantees the file handle is closed even if
# pickle.load raises (the original opened and closed it manually).
with open(save_path, 'rb') as pickle_off:
    gcnn_initial = pickle.load(pickle_off)

from iter_plot_help_funcs import find_active_percents, plot_metrics, plot_prec_rec_curve, plot_prec_rec_vs_tresh, plot_avg_percent_found, set_sns_pal
Ejemplo n.º 17
0
    experiment.log_metric("test_accuracy", accuracy_score(y_true, y_pred))
    experiment.log_metric("beta", best_b)
    experiment.log_metric("neurons", best_p)
    experiment.log_confusion_matrix(matrix=confusion_matrix(y_true,
                                                            y_pred).tolist(),
                                    labels=oDataSet.labelsNames)
    # model.save('model.h5')
    # experiment.log_asset("model.h5")
    model.save_weights('model.weights')
    experiment.log_asset("model.weights")

    print(accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    oData.confusion_matrix = confusion_matrix(y_true, y_pred)
    oData.model = model
    oData.params = {
        "k_fold": K_FOLD,
        "GRID_RESULT": grid_result,
        "GRID_VALUES_NEURON": GRID_NEURON,
        "GRID_VALUES_BETA": GRID_B,
        "LEARNING RATE": LEARNING_RATE,
        "EPOCHS": epochs
    }
    experiment.log_other("params", oData.params)
    y_pred = model.predict(
        oDataSet.attributes[oData.Training_indexes]).argmax(axis=1)
    y_true = oDataSet.labels[oData.Training_indexes]
    experiment.log_metric("train_accuracy", accuracy_score(y_true, y_pred))
    experiment.end()
    oDataSet.append(oData)
Ejemplo n.º 18
0
                grid_result[g1, g2, k_slice] = accuracy_score(y_true, y_pred)
                # print(grid_result)
                k_slice += 1
                print(grid_result)

    model, bests = fit(oDataSet.attributes[oData.Training_indexes],
                       oDataSet.labels[oData.Training_indexes], LEARNING_RATE,
                       epochs, 0.2, 0.1, 0.7)

    y_pred = model._predict(oDataSet.attributes[oData.Testing_indexes]).argmax(
        axis=1).T.tolist()[0]
    y_true = oDataSet.labels[oData.Testing_indexes]
    bests = [x.fitness for x in bests]
    plt.plot(bests)
    plt.show()
    experiment.log_other("pesos", str(model.genes))
    experiment.log_metric("test_accuracy", accuracy_score(y_true, y_pred))
    experiment.log_metric("beta", LEARNING_RATE)
    experiment.log_metric("neurons", epochs)
    experiment.log_confusion_matrix(matrix=confusion_matrix(y_true,
                                                            y_pred).tolist(),
                                    labels=oDataSet.labelsNames)
    print(accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    oData.confusion_matrix = confusion_matrix(y_true, y_pred)
    oData.model = model
    oData.params = {
        "k_fold": K_FOLD,
        "GRID_RESULT": grid_result,
        "GRID_VALUES_NEURON": GRID_NEURON,
        "GRID_VALUES_BETA": GRID_B,
            groups = data_reader.groups
            all_scores = []

            for i in range(3):
                ae = Autoencoder(config[i]["encoder"],
                                 config[i]["decoder"],
                                 input_shape=input_shapes[i],
                                 latent_shape=latent_shape,
                                 loss="mean_squared_error",
                                 optimizer_params=None)

                experiment.log_multiple_params(config[i])

                scores = ae.cross_validate(data[i],
                                           groups,
                                           experiment=experiment,
                                           epochs=10000,
                                           n_splits=4,
                                           log_prefix=f"dataset_{i}_")

                all_scores.append(scores)

                mean_scores = np.mean(scores)

                experiment.log_metric(f"mean_scores_{i}", mean_scores)

                experiment.log_other(f"scores_{i}", scores)

            experiment.log_metric(f"mean_all_scores", np.mean(all_scores))
            print(all_scores)
Ejemplo n.º 20
0
'''Select data set, do smart sampling either rdkit: https://www.rdkit.org/docs/source/rdkit.ML.Cluster.Butina.html
https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter)

can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of 
compounds as target
'''
#%%
'''import'''
from comet_ml import Experiment
# NOTE(review): hard-coded Comet API key committed to source — prefer the
# COMET_API_KEY environment variable or a .comet.config file.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_baseline",
                 workspace="gdreiman1",
                 disabled=False)
exp.log_code = True
exp.log_other('Hypothesis', '''15% start 10% iter, diverse svm hinge loss''')
# Output file name prefix for this experiment's saved metrics.
exper_file_name = 'tuned_4_svmmod_corrected'
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
from Iterative_help_funcs_tuned import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
#from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF's verbose INFO/WARNING logs (TF1-style API).
tf.logging.set_verbosity(tf.logging.ERROR)
Ejemplo n.º 21
0
    model2 = ClassicNet(hyperparameters['filters'], hyperparameters['layers'])
    model2 = model2.cuda()
    model2.load_state_dict(checkpoint2['state_dict'])

    evaluate(dataloader, model1, model2)

# Comet experiment is only created when an API key was passed on the
# command line; `experiment` stays None otherwise.
experiment = None
if args.api_key:
    # Derive project/experiment names from the directory layout:
    # .../<project_name>/<experiment_name>/<this file>
    project_dir, experiment_name = split(dirname(realpath(__file__)))
    project_name = basename(project_dir)
    experiment = Experiment(api_key=args.api_key,
                            project_name=project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False,
                            parse_args=False)
    experiment.log_other('experiment_name', experiment_name)
    experiment.log_parameters(vars(args))
    for k in hyperparameters:
        # Nested dicts are logged with their key as a prefix so related
        # hyperparameters stay grouped in the Comet UI.
        if isinstance(hyperparameters[k], dict):
            experiment.log_parameters(hyperparameters[k], prefix=k)
        else:
            experiment.log_parameter(k, hyperparameters[k])

try:
    dataset = torchvision.datasets.ImageFolder(root='./trainset')
except (OSError, RuntimeError):
    # Dataset directory missing: extract it from the bundled archive and
    # retry. Narrowed from a bare `except:` (which also swallowed
    # KeyboardInterrupt/SystemExit); torchvision raises FileNotFoundError
    # (an OSError) or RuntimeError depending on version when root is absent.
    import zipfile
    with zipfile.ZipFile('trainset.zip', 'r') as zip_ref:
        zip_ref.extractall()
    dataset = torchvision.datasets.ImageFolder(root='./trainset')
Ejemplo n.º 22
0
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter)

can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of 
compounds as target
'''
#%%
'''import'''
from comet_ml import Experiment

exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_baseline",
                 workspace="gdreiman1",
                 disabled=False)
exp.log_code = True
exp.log_other(
    'Hypothesis',
    'Comparing diverse with smaller iter sizes to random with small itersize')
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
from Iterative_help_funcs import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)
Ejemplo n.º 23
0
    else:
        print("cannot detect dataset_name")
        print("current dataset_name is ", dataset_name)

    # create list
    train_list = create_image_list(TRAIN_PATH)
    test_list = create_image_list(TEST_PATH)

    # create data loader
    train_loader, train_loader_for_eval, test_loader = get_dataloader(train_list, train_list, test_list, dataset_name=dataset_name, batch_size=args.batch_size)

    print("len train_loader ", len(train_loader))

    # model
    model_name = args.model
    experiment.log_other("model", model_name)
    if model_name == "M1":
        model = M1()
    elif model_name == "M2":
        model = M2()
    elif model_name == "M3":
        model = M3()
    elif model_name == "M4":
        model = M4()
    elif model_name == "CustomCNNv2":
        model = CustomCNNv2()
    elif model_name == "BigTailM1":
        model = BigTailM1()
    elif model_name == "BigTailM2":
        model = BigTailM2()
    elif model_name == "BigTail3":
Ejemplo n.º 24
0
@author: gabriel
"""

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 18 15:07:55 2019

@author: gabriel
"""
from comet_ml import Experiment
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                        project_name="iter_plotting", workspace="gdreiman1", disabled = False
                        )
exp.log_code = True
exp.log_other('Hypothesis','''These are my plots from the intial iterations "Iter_2" and "Iter_3" ''')
import pickle
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
data_dir = '/home/gabriel/Dropbox/UCL/Thesis/Data'
random_run = 'first_random_9iter_run.pkl'
diverse_run = 'first_diverse_9iter_run.pkl'

save_path = os.path.join(data_dir,random_run)
pickle_off = open(save_path,'rb')
random_run=pickle.load(pickle_off)
pickle_off.close() 
save_path = os.path.join(data_dir,diverse_run)
Ejemplo n.º 25
0
def test_autoencoder():
    """Smoke-test the Autoencoder on MNIST and log results to Comet.

    Trains a small dense autoencoder for 5 epochs, reports the RMSE of the
    test-set reconstructions, and displays 5 reconstructed digits next to
    their originals.
    """
    (x_train, _), (x_test, _) = mnist.load_data()

    # Scale pixels to [0, 1] and flatten each 28x28 image to a 784-d vector.
    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.
    x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
    x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

    # Encoder layer spec consumed by Autoencoder. The decoder argument is
    # passed as None below — presumably the decoder is derived automatically;
    # confirm against the Autoencoder implementation.
    config = {
        "encoder": [{
            "kwargs": {
                "activation": "relu",
                "units": 256
            },
            "name": "hidden1",
            "type": "Dense"
        }, {
            "name": "batchnorm",
            "type": "BatchNormalization"
        }, {
            "kwargs": {
                "rate": 0
            },
            "name": "dropout",
            "type": "Dropout"
        }, {
            "kwargs": {
                "activation": "sigmoid",
            },
            "name": "latent",
            "regularizer": {
                "type": "l1",
                "value": 0
            },
            "type": "Dense"
        }]
    }

    latent_dim = 32
    latent_shape = (latent_dim, )
    input_shape = (x_train.shape[1], )

    print(latent_shape)
    print(input_shape)

    ae = Autoencoder(config["encoder"],
                     None,
                     input_shape=input_shape,
                     latent_shape=latent_shape,
                     loss="mean_squared_error",
                     optimizer_params=None)

    #experiment = Experiment(api_key="ac4P1dtMEjJf1d9hIo9CIuSXC", project_name="mnist-autoencode")
    # NOTE(review): hard-coded Comet API key committed to source.
    experiment = Experiment(project_name="MNIST test",
                            api_key="50kNmWUHJrWHz3FlgtpITIsB1")
    experiment.log_parameter("Experiment name", "Testing ae")
    experiment.log_multiple_params(config)
    experiment.log_parameter("Latent dim", latent_shape[0])

    ae.fit(x_train, batch_size=1000, epochs=5, validation_data=x_test)

    predictions = ae.predict(x_test)

    # Root-mean-squared reconstruction error over the whole test set.
    scores = np.sqrt(((predictions - x_test)**2).mean())
    experiment.log_other("scores", scores)
    print(scores)

    print(predictions.shape)
    # Plot grid: top row shows 5 reconstructions, bottom row the originals.
    pred_imgs = predictions.reshape(-1, 28, 28)
    fig = plt.figure()
    for i, img in enumerate(pred_imgs[:5]):
        fig.add_subplot(2, 5, i + 1)
        plt.imshow(img)
        plt.axis('off')
        fig.add_subplot(2, 5, i + 6)
        plt.imshow(x_test[i].reshape(28, 28))
        plt.axis('off')
    plt.show()
Ejemplo n.º 26
0
    # read corpus and create dataloaders
    corpus = Corpus(args.task, seq2seq, max_len=args.outliers)
    
train_loader, valid_loader, test_loader = load_data(corpus, 
                                                    batch_size=args.batch_size,
                                                    sample=args.truncated_training,
                                                    model=args.model)
size_vocab = len(corpus.word2id) # will be 1 in bert, not a real count because we use their tokenizer and pretrained vocab size
# beware that labels2id has an extra symbol for padding that is not a tag,
# so we subtract 1 from number of labels
n_labels = len(corpus.label2id)-1 
pad_id = corpus.word2id['<pad>']
label_pad_id = corpus.label2id['<pad>']

if args.comet_track:
    experiment.log_other("size_vocab", size_vocab)
    experiment.log_other("n_labels", n_labels)
    experiment.log_other("size_trainset", len(corpus.train))
    experiment.log_other("size_validset", len(corpus.valid))
    experiment.log_other("size_testset", len(corpus.test))

# Create NN model and send it to my_device
print('Building model...')
if args.model == 'vanilla_lstm':
    model = vanillaLSTM(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, 
                           n_labels, args.dropout, pad_id, corpus,
                           no_glove=args.no_glove, freeze=args.freeze,
                           bidirectional=False).to(my_device)
elif args.model == 'vanilla_bilstm':
    model = vanillaLSTM(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, 
                           n_labels, args.dropout, pad_id, corpus, 
Ejemplo n.º 27
0
'''Select data set, do smart sampling either rdkit: https://www.rdkit.org/docs/source/rdkit.ML.Cluster.Butina.html
https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter)

can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of 
compounds as target
'''
#%%
'''import'''
from comet_ml import Experiment
# NOTE(review): hard-coded Comet API key committed to source — prefer the
# COMET_API_KEY environment variable or a .comet.config file.
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_baseline",
                 workspace="gdreiman1",
                 disabled=False)
exp.log_code = True
exp.log_other('Hypothesis', '''20% start, 5% iter, all random''')
# Output file name prefix for this experiment's saved metrics.
exper_file_name = 'tuned_6'
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
from Iterative_help_funcs_tuned import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
#from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF's verbose INFO/WARNING logs (TF1-style API).
tf.logging.set_verbosity(tf.logging.ERROR)
Ejemplo n.º 28
0
https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker/
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter)

can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of 
compounds as target
'''
#%%
'''import'''
# Comet.ml experiment tracking for this run.
# SECURITY(review): the API key is hard-coded in source — move it to an
# environment variable / config file and rotate the exposed key.
from comet_ml import Experiment
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_baseline",
                 workspace="gdreiman1",
                 disabled=False)
# NOTE(review): Experiment.log_code is a method/constructor flag in comet_ml;
# assigning True here shadows it on the instance rather than enabling code
# logging — confirm against the comet_ml API.
exp.log_code = True
exp.log_other(
    'Hypothesis',
    '''JustGCNN, only 3 datasets, 20 epochs, adam lr = 0.05, positve weight = 1'''
)
# Base name for the pickled results of this experiment.
exper_file_name = 'svm_metricsdict_test'
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
# Project-local helpers: data scaling, per-model training, and metric logging.
from Iterative_help_funcs_pytorchmod import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN, train_random_classifier
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
#from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF logging; NOTE(review): tf.logging is TF 1.x API (removed in TF 2).
tf.logging.set_verbosity(tf.logging.ERROR)
Ejemplo n.º 29
0
or from deep chem: MaxMinSplitter(Splitter), ButinaSplitter(Splitter), FingerprintSplitter(Splitter)

can use maxmin splitter to fill out remainining space in next iter by specifying number remaining as sample size and the pool of 
compounds as target
'''
#%%
'''import'''
# Comet.ml experiment tracking for this run.
# SECURITY(review): the API key is hard-coded in source — move it to an
# environment variable / config file and rotate the exposed key.
from comet_ml import Experiment
exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                 project_name="iter_baseline",
                 workspace="gdreiman1",
                 disabled=False)
# NOTE(review): Experiment.log_code is a method/constructor flag in comet_ml;
# assigning True here shadows it on the instance rather than enabling code
# logging — confirm against the comet_ml API.
exp.log_code = True
exp.log_other(
    'Hypothesis',
    '''Making following changes. 1) Kept 100 epochs 2) halve the size of the iters after inital screen 
3)No Weak Inactives after predicted actives falls below 80% of batch size 4) Diverse selection back on'''
)
import pickle, sys, os
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import numpy as np
# Project-local helpers: data scaling, per-model training, and metric logging.
from Iterative_help_funcs import get_Scaled_Data, train_SVM, train_DNN, train_RF, train_LGBM, calc_and_save_metrics, train_PyTorchGCNN
from imblearn.over_sampling import RandomOverSampler
#choosing a 3:1 Inactive to Active ratio
ros = RandomOverSampler(sampling_strategy=0.33)
import pandas as pd
from joblib import Parallel, delayed
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
import tensorflow as tf
# Silence TF logging; NOTE(review): tf.logging is TF 1.x API (removed in TF 2).
tf.logging.set_verbosity(tf.logging.ERROR)
Ejemplo n.º 30
0
    # Verbose flag raises log level from INFO to DEBUG.
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    # Ensure the experiment output directory exists.
    # NOTE(review): os.makedirs(args.experiment_path, exist_ok=True) would be
    # the idiomatic equivalent (and would also create missing parents).
    try:
        os.mkdir(args.experiment_path)
    except FileExistsError:
        pass

    # Comet tracking is optional: enabled only when ~/.comet_key exists;
    # otherwise exp stays None, so later logging calls must guard on it.
    try:
        with open(f"{os.environ['HOME']}/.comet_key") as f:
            comet_key = f.read().strip()
        exp = Experiment(comet_key,
                         project_name='evo',
                         log_graph=False,
                         auto_metric_logging=False)
        exp.log_other('Notes', args.notes)
    except FileNotFoundError:
        exp = None

    # TODO: make data loading more modular...
    tasks = ['J3/J2']
    seqs = load_seqs(tasks)
    # Split sequence indices into train/val/test; 10% each held out for
    # test and validation, split by index rather than by target value.
    splits = make_splits(seqs.index,
                         seqs[tasks],
                         test_frac=0.1,
                         val_frac=0.1,
                         by_value=False)

    # Longest sequence (sequences are the index strings of `seqs`).
    max_seq_len = seqs.index.str.len().max()
    # NOTE(review): `.keys()` is redundant — `for split in splits:` iterates keys.
    for split in splits.keys():
        if len(splits[split].inputs):