Example #1
def _eval(model, dataloader, CONFIG):
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0

    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda()
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda()

    for test_data in dataloader:
        X, y = test_data
        X, y = X.cuda(), y.cuda()

        y_pred = model(X)

        y_pred_classified = y_pred.view(-1).clone()
        for i in range(len(y_pred)):
            y_pred_classified[i] = classify(y_pred[i])

        all_pred = torch.cat((all_pred, y_pred_classified.to(torch.int64)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))

        total += y.size(0)
        correct += accuracy(y_pred.cpu(), y.cpu().float()) * y.size(0)
    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())
    model.train()
    torch.set_grad_enabled(True)
    return acc, c_matrix, kappa, all_pred.cpu().numpy()
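_eval relies on helpers defined elsewhere in the repository: classify, accuracy and quadratic_weighted_kappa. As a rough illustration of what the first two could look like for a single-output ordinal model (e.g. a 0-4 grade predicted as a continuous value), here is a minimal sketch; the threshold values and the tolerance are assumptions for illustration, not the repository's actual implementation.

# Hypothetical helpers, sketched for illustration only -- the real ones live elsewhere in the repo.
import torch

def classify(prediction, thresholds=(0.5, 1.5, 2.5, 3.5)):
    # Map a continuous model output to an integer grade by counting crossed thresholds.
    value = prediction.item() if torch.is_tensor(prediction) else float(prediction)
    return sum(value >= t for t in thresholds)

def accuracy(predictions, targets, tolerance=0.5):
    # Fraction of predictions that land within the given tolerance of the target value.
    return (predictions.view(-1) - targets.view(-1)).abs().lt(tolerance).float().mean().item()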
Example #2
def evaluate(CONFIG):

    # create result folder
    if not os.path.isdir(CONFIG['SAVE_PATH']):
        os.makedirs(CONFIG['SAVE_PATH'])

    # create dataset
    test_dataset = generate_stem_dataset(CONFIG['DATA_PATH'],
                                         CONFIG['INPUT_SIZE'],
                                         CONFIG['DATA_AUGMENTATION'],
                                         cv=False,
                                         mode='evaluate')

    # create dataloader
    test_loader = DataLoader(test_dataset,
                             batch_size=CONFIG['BATCH_SIZE'],
                             num_workers=CONFIG['NUM_WORKERS'],
                             shuffle=False)
    # define model
    model_name = CONFIG['MODEL_NAME']
    model = EfficientNet.from_pretrained(model_name)
    feature = model._fc.in_features
    model._fc = nn.Linear(in_features=feature, out_features=1, bias=True)

    # multi-GPU setting
    torch.cuda.set_device(CONFIG['GPU_NUM'][0])
    model = torch.nn.DataParallel(
        model, device_ids=CONFIG['GPU_NUM']).to(device=torch.device('cuda'))

    # load pretrained weights
    if CONFIG['PRETRAINED_PATH']:
        state_dict = torch.load(CONFIG['PRETRAINED_PATH'])
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if 'module' not in k:
                k = 'module.' + k
            else:
                k = k.replace('features.module.', 'module.features.')
            new_state_dict[k] = v
        model.load_state_dict(new_state_dict)

    # evaluate
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0

    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda()
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda()
    logit_pred_y = []
    logit_targ_y = []
    for test_data in test_loader:
        X, y = test_data
        X, y = X.cuda(), y.cuda()

        y_pred = model(X)

        y_pred_classified = y_pred.view(-1).clone()
        for i in range(len(y_pred)):
            y_pred_classified[i] = classify(y_pred[i])

        all_pred = torch.cat((all_pred, y_pred_classified.to(torch.int64)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))

        total += y.size(0)
        correct += accuracy(y_pred.cpu(), y.cpu().float()) * y.size(0)

        logit_pred_y += list(y_pred.view(-1).cpu().numpy())
        logit_targ_y += list(y.cpu().float().numpy())

    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())

    ks_dataframe = pd.DataFrame({'pred': logit_pred_y, 'targ': logit_targ_y})
    ks_dataframe.to_csv(os.path.join(CONFIG['SAVE_PATH'],
                                     model_name + '_eval_results.csv'),
                        index=False,
                        sep=',')

    print('==============================')
    print('Test acc: {}'.format(acc))
    print('Confusion Matrix:\n{}'.format(c_matrix))
    print('quadratic kappa: {}'.format(kappa))
    print('==============================')
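evaluate reads a flat CONFIG dictionary. The keys below are exactly the ones the function touches; the values are illustrative placeholders (the DATA_AUGMENTATION format in particular depends on generate_stem_dataset), not the project's real settings.

# Sketch of the expected CONFIG layout; values are placeholders, not real settings.
CONFIG = {
    'SAVE_PATH': './eval_results',           # output folder, created if missing
    'DATA_PATH': './data',                   # passed through to generate_stem_dataset
    'INPUT_SIZE': 512,                       # input resolution for the dataset builder
    'DATA_AUGMENTATION': None,               # augmentation spec expected by generate_stem_dataset
    'BATCH_SIZE': 16,
    'NUM_WORKERS': 4,
    'MODEL_NAME': 'efficientnet-b0',         # identifier accepted by EfficientNet.from_pretrained
    'GPU_NUM': [0],                          # device ids handed to DataParallel
    'PRETRAINED_PATH': 'weights/model.pth',  # set to None or '' to skip weight loading
}

evaluate(CONFIG)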
Example #3
def run():
    stats = {}
    stats['runtime_second'] = time.time()

    startTime = time.time()

    # Initialize using the same seed (to get stable results on comparisons)
    np.random.seed(RP['seed'])

    rawData = db.getData()

    # filter infs and nans from data cols
    cols = rawData.columns.tolist()[1:-1]

    print(cols)

    for col in cols:
        rawData = rawData.drop(rawData[np.isinf(rawData[col])].index)
        rawData = rawData.drop(rawData[np.isnan(rawData[col])].index)

    # shuffle rows with the seeded permutation, then renumber the index
    rawData = rawData.reindex(np.random.permutation(rawData.index))
    rawData.reset_index(drop=True, inplace=True)

    # print(rawData)

    X_raw = rawData.iloc[:, 2:-1]
    y_raw = rawData.iloc[:, 1:2]

    scalerX = preprocessing.StandardScaler(copy=False)
    scalerX.fit(X_raw)
    scalery = preprocessing.StandardScaler(copy=False)
    scalery.fit(y_raw)

    if RP['zscore_norm']:
        X = pd.DataFrame(scalerX.transform(X_raw),
                         columns=X_raw.columns.values)
        y = pd.DataFrame(scalery.transform(y_raw),
                         columns=y_raw.columns.values)
    else:
        X = X_raw
        y = y_raw

    # print(X.head(), y.head())

    model = Sequential()

    # hidden
    model.add(
        Dense(300,
              W_regularizer=l2(0.0),
              activity_regularizer=activity_l2(0.0),
              input_shape=(X.shape[1], )))
    model.add(Activation('relu'))
    model.add(Dropout(0.300))
    model.add(
        Dense(300,
              W_regularizer=l2(0.0),
              activity_regularizer=activity_l2(0.0)))
    model.add(Activation('relu'))
    model.add(Dropout(0.200))
    model.add(Dense(1))

    model.compile(loss='mse', optimizer=OPTIMIZER)

    if RD['use_test_flags']:
        maskTrain = np.zeros(len(X), dtype=bool)
        maskTest = np.zeros(len(X), dtype=bool)
        for i in range(len(X)):
            maskTrain[i] = rawData[RD['testing']][i] == 0
            maskTest[i] = rawData[RD['testing']][i] == 1

        trainX = X.loc[maskTrain]
        testX = X.loc[maskTest]
        trainy = y.loc[maskTrain]
        testy = y.loc[maskTest]

    else:
        ratio = 0.8
        split = int(X.shape[0] * ratio)

        trainX, testX = X.iloc[:split], X.iloc[split:]
        trainy, testy = y.iloc[:split], y.iloc[split:]

    trainX.reset_index(drop=True, inplace=True)
    testX.reset_index(drop=True, inplace=True)
    trainy.reset_index(drop=True, inplace=True)
    testy.reset_index(drop=True, inplace=True)

    stats['training_row_count'] = len(trainX)
    stats['testing_row_count'] = len(testX)

    print(trainX.shape, testX.shape, trainy.shape, testy.shape)

    early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

    history = model.fit(trainX.values,
                        trainy.values,
                        nb_epoch=RP['epochs'],
                        batch_size=RP['batch'],
                        callbacks=[early],
                        validation_data=(testX.values, testy.values))

    preprocessMeta = {'scaler': scalery}

    # compute metrics for the model based on the task for both testing and training data
    print('\nGetting metrics for training data:')
    if RP['classify']:
        trainMetrics = metrics.classify(model, trainX.values, trainy.values,
                                        preprocessMeta)
    else:
        trainMetrics = metrics.predict(model, trainX.values, trainy.values,
                                       preprocessMeta)

    print('\nGetting metrics for test data:')
    if RP['classify']:
        testMetrics = metrics.classify(model, testX.values, testy.values,
                                       preprocessMeta)
    else:
        testMetrics = metrics.predict(model, testX.values, testy.values,
                                      preprocessMeta)

    print('Plot:')
    values = np.zeros((len(history.history['loss']), 2))
    for i in range(len(history.history['loss'])):
        values[i][0] = history.history['loss'][i]
        values[i][1] = history.history['val_loss'][i]
    utility.plotLoss(values)

    print('Dump csv pred')
    pred = model.predict(testX.values, batch_size=RP['batch'])

    if RP['zscore_norm']:
        predScaled = pd.DataFrame(scalery.inverse_transform(pred),
                                  columns=['pred'])
        testScaled = pd.DataFrame(scalery.inverse_transform(testy),
                                  columns=['true'])
    else:
        predScaled = pd.DataFrame(pred, columns=['pred'])
        testScaled = pd.DataFrame(testy, columns=['true'])

    predByTruth = pd.concat([predScaled, testScaled], axis=1)

    # predByTruth.plot(x='pred',y='true', kind='scatter')
    # plt.show()
    # predByTruth.to_csv('local/pred.csv')

    # statistics to send to journal
    stats['runtime_second'] = time.time() - stats['runtime_second']
    stats['memory_pm_mb'], stats['memory_vm_mb'] = utility.getMemoryUsage()
    stats['git_commit'] = utility.getGitCommitHash()
    stats['comment'] = RP['comment']
    stats['hostname'] = socket.gethostname()
    stats['experiment_config'] = yaml.dump(cc.exp, default_flow_style=False)

    stats['model'] = utility.modelToString(model)
    stats['loaded_model'] = RP['load_model']
    stats['parameter_count'] = model.count_params()
    stats['task'] = 'classification' if RP['classify'] else 'regression'

    stats['dataset_name'] = cc.exp['fetch']['table']
    stats['split_name'] = RD['testing']
    stats['label_name'] = ','.join(RD['labels'])

    stats['epoch_max'] = RP['epochs']
    stats['learning_rate'] = RP['learning_rate']
    stats['optimization_method'] = OPTIMIZER.__class__.__name__
    stats['batch_size'] = RP['batch']
    stats['seed'] = RP['seed']
    stats['objective'] = RP['objective']
    stats['learning_curve'] = {
        'val': open('{}/{}'.format(cc.cfg['plots']['dir'], utility.PLOT_NAME),
                    'rb').read(),
        'type': 'bin'
    }

    # metric statistics to send
    metricStats = {}

    if RP['classify']:
        metricStats['relevance_training'] = trainMetrics['acc_avg']
        metricStats['relevance_training_std'] = trainMetrics['acc_std']
        metricStats['relevance_testing'] = testMetrics['acc_avg']
        metricStats['relevance_testing_std'] = testMetrics['acc_std']
        metricStats['log_loss'] = testMetrics['log_loss_avg']
        metricStats['log_loss_std'] = testMetrics['log_loss_std']
        metricStats['auc'] = testMetrics['auc_avg']
        metricStats['auc_std'] = testMetrics['auc_std']
    else:
        metricStats['relevance_training'] = trainMetrics['r2_avg']
        metricStats['relevance_training_std'] = trainMetrics['r2_std']
        metricStats['relevance_testing'] = testMetrics['r2_avg']
        metricStats['relevance_testing_std'] = testMetrics['r2_std']
        metricStats['mse'] = testMetrics['mse_avg']
        metricStats['mse_std'] = testMetrics['mse_std']

    stats.update(metricStats)
    db.sendStatistics(**stats)
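The model definition uses the Keras 1 argument names (W_regularizer, activity_l2, nb_epoch). If the script is ported to Keras 2, the same hidden stack would look roughly like the sketch below; this is only an API-mapping illustration, not part of the original project.

# Keras 2 equivalent of the hidden layers above (sketch only).
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.regularizers import l2

model = Sequential()
model.add(Dense(300, kernel_regularizer=l2(0.0), activity_regularizer=l2(0.0),
                input_shape=(X.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(300, kernel_regularizer=l2(0.0), activity_regularizer=l2(0.0)))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mse', optimizer=OPTIMIZER)

# In Keras 2 the fit call also renames nb_epoch to epochs:
# model.fit(trainX.values, trainy.values, epochs=RP['epochs'], batch_size=RP['batch'],
#           callbacks=[early], validation_data=(testX.values, testy.values))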
Example #4
def run(grid=None):
    stats = {}
    stats['runtime_second'] = time.time()

    # initialize using the same seed (to get stable results on comparisons)
    np.random.seed(RP['seed'])

    # grab the commit at start
    stats['git_commit'] = utility.getGitCommitHash()

    # get the training and testing datasets along with some meta info

    if RP['edge_prediction']:
        trainIn, trainLabel, testIn, testLabel, preprocessMeta = data.preprocessEdgeData(db.getData())
    else:
        trainIn, trainLabel, testIn, testLabel, preprocessMeta = data.preprocessData(db.getData())
        # trainIn, trainLabel, testIn, testLabel, preprocessMeta = data.preprocessFastaOneHotData(db.getData())

    stats['training_row_count'] = len(trainLabel)
    stats['testing_row_count'] = len(testLabel)

    # load model from file or create and train one from scratch
    if RP['load_model']:
        model = utility.loadModel(RP['load_model'])
    else:
        if RP['edge_prediction']:
            model = configureEdgeModel(trainIn[0],trainIn[1])
        elif RP['discrete_label']:
            model = configureModel(trainIn, len(trainLabel[0]))
        else:
            model = configureModel(trainIn)
        stats['epoch_count'] = train(model, trainIn, trainLabel, (testIn, testLabel))

    # persistence first
    if cc.cfg['persistence']['model']:
        name = '{}_rg_{}'.format(stats['git_commit'],':'.join([str(x) for x in RG['ratios']]))
        # name = stats['git_commit']
        stats['persistent_model_name'] = name
        utility.saveModel(model, name)

    # compute metrics for the model based on the task for both testing and training data
    print('\nGetting metrics for training data:')
    if RP['classify']:
        if RP['discrete_label']:
            trainMetrics = metrics.discreteClassify(model, trainIn, trainLabel, preprocessMeta)
        else:
            trainMetrics = metrics.classify(model, trainIn, trainLabel, preprocessMeta)
    else:
        trainMetrics = metrics.predict(model, trainIn, trainLabel, preprocessMeta)

    print('\nGetting metrics for test data:')
    if RP['classify']:
        if RP['discrete_label']:
            testMetrics = metrics.discreteClassify(model, testIn, testLabel, preprocessMeta)
        else:
            testMetrics = metrics.classify(model, testIn, testLabel, preprocessMeta)
    else:
        testMetrics = metrics.predict(model, testIn, testLabel, preprocessMeta)


    # utilities and visualizations
    if cc.cfg['plots']['layer_activations']:
        visualization.layerActivations(model, testIn, testLabel)

    if cc.cfg['plots']['seq_output']:
        df = pd.DataFrame(cc.cfg['plots']['seq_output_seq_input'], columns=[RD['fasta'] if cc.cfg['plots']['seq_output_seq_input_name'] == 'fasta' else RD['smiles']])
        visualization.visualizeSequentialOutput(model, cc.cfg['plots']['seq_output_layer_idx'], df)

    if cc.cfg['plots']['print_pred']:
        visualization.printPrediction(model, cc.cfg['plots']['print_pred_smiles'])

    if cc.cfg['plots']['print_train_test_pred']:
        visualization.printTrainTestPred(model, cc.cfg['plots']['print_train_test_pred_cnt'], trainIn, trainLabel, testIn, testLabel, preprocessMeta)

    # statistics to send to journal
    stats['runtime_second'] = time.time() - stats['runtime_second']
    stats['memory_pm_mb'], stats['memory_vm_mb'] = utility.getMemoryUsage()
    stats['comment'] = RP['comment']
    stats['hostname'] = socket.gethostname()
    stats['experiment_config'] = yaml.dump(cc.exp,default_flow_style=False)

    stats['model'] = utility.modelToString(model)
    stats['loaded_model'] = RP['load_model']
    stats['parameter_count'] = model.count_params()
    stats['task'] = 'classification' if RP['classify'] else 'regression'

    stats['dataset_name'] = cc.exp['fetch']['table']
    stats['split_name'] = RD['testing']
    stats['label_name'] = ','.join(RD['labels'])

    stats['epoch_max'] = RP['epochs']
    stats['learning_rate'] = RP['learning_rate']
    stats['optimization_method'] = OPTIMIZER.__class__.__name__
    stats['batch_size'] = RP['batch']
    stats['seed'] = RP['seed']
    stats['objective'] = RP['objective']
    stats['learning_curve'] = {'val':open('{}/{}'.format(cc.cfg['plots']['dir'], utility.PLOT_NAME),'rb').read(),'type':'bin'}

    # metric statistics to send
    metricStats = {}

    if RP['classify']:
        metricStats['relevance_training'] = trainMetrics['acc_avg']
        metricStats['relevance_training_std'] = trainMetrics['acc_std']
        metricStats['relevance_testing'] = testMetrics['acc_avg']
        metricStats['relevance_testing_std'] = testMetrics['acc_std']
        metricStats['log_loss'] = testMetrics['log_loss_avg']
        metricStats['log_loss_std'] = testMetrics['log_loss_std']
        metricStats['auc'] = testMetrics['auc_avg']
        metricStats['auc_std'] = testMetrics['auc_std']
        metricStats['auc_micro'] = testMetrics['auc_avg']
        metricStats['auc_micro_std'] = testMetrics['auc_std']
    else:
        metricStats['relevance_training'] = trainMetrics['r2_avg']
        metricStats['relevance_training_std'] = trainMetrics['r2_std']
        metricStats['relevance_testing'] = testMetrics['r2_avg']
        metricStats['relevance_testing_std'] = testMetrics['r2_std']
        metricStats['mse'] = testMetrics['mse_avg']
        metricStats['mse_std'] = testMetrics['mse_std']
        metricStats['mae'] = testMetrics['mae_avg']
        metricStats['mae_std'] = testMetrics['mae_std']

    stats.update(metricStats)
    db.sendStatistics(**stats)

    utility.freeModel(model)
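run() branches almost entirely on flags read from the global RP dictionary (alongside RD, RG and cc). A minimal sketch of the switches it consults is given below; the values are placeholders chosen for illustration, and the real configuration comes from the experiment config (cc.exp).

# Illustrative RP layout; values are placeholders, not project defaults.
RP = {
    'seed': 12345,                      # np.random.seed for reproducible comparisons
    'edge_prediction': False,           # True -> preprocessEdgeData + configureEdgeModel
    'discrete_label': False,            # True -> one-hot labels and discreteClassify metrics
    'classify': True,                   # classification vs. regression metric block
    'load_model': '',                   # persisted model name, or '' to train from scratch
    'epochs': 100,
    'learning_rate': 0.001,
    'batch': 128,
    'objective': 'binary_crossentropy',
    'comment': 'baseline run',
}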