def _eval(model, dataloader, CONFIG):
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0
    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda()
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda()

    for test_data in dataloader:
        X, y = test_data
        X, y = X.cuda(), y.cuda()

        y_pred = model(X)

        y_pred_classified = y_pred.view(-1).clone()
        for i in range(len(y_pred)):
            y_pred_classified[i] = classify(y_pred[i])

        all_pred = torch.cat((all_pred, y_pred_classified.to(torch.int64)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))

        total += y.size(0)
        correct += accuracy(y_pred.cpu(), y.cpu().float()) * y.size(0)

    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())

    model.train()
    torch.set_grad_enabled(True)

    return acc, c_matrix, kappa, all_pred.cpu().numpy()
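# ---------------------------------------------------------------------------
# Hedged sketch: _eval() relies on classify(), accuracy() and
# quadratic_weighted_kappa(), which are defined elsewhere in the project.
# The stand-ins below (note the _sketch names) only illustrate one plausible
# behaviour -- threshold-based grading of the single regression output,
# exact-match accuracy, and scikit-learn's quadratic kappa. The thresholds
# and the 0..4 grade range are assumptions, not the project's actual values.
# ---------------------------------------------------------------------------
from sklearn.metrics import cohen_kappa_score, confusion_matrix


def _classify_sketch(pred, thresholds=(0.5, 1.5, 2.5, 3.5)):
    """Map a continuous regression output to an integer grade (assumed 0..4)."""
    value = float(pred)
    grade = 0
    for t in thresholds:
        if value >= t:
            grade += 1
    return grade


def _accuracy_sketch(y_pred, y_true):
    """Fraction of predictions whose thresholded grade equals the integer target."""
    pred_grades = torch.tensor([_classify_sketch(p) for p in y_pred.view(-1)])
    return (pred_grades == y_true.view(-1).to(torch.int64)).float().mean().item()


def _quadratic_weighted_kappa_sketch(y_true, y_pred):
    """Return (confusion matrix, quadratic weighted kappa) for integer labels."""
    c_matrix = confusion_matrix(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return c_matrix, kappa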
def evaluate(CONFIG):
    # create result folder
    if not os.path.isdir(CONFIG['SAVE_PATH']):
        os.makedirs(CONFIG['SAVE_PATH'])

    # create dataset
    test_dataset = generate_stem_dataset(CONFIG['DATA_PATH'],
                                         CONFIG['INPUT_SIZE'],
                                         CONFIG['DATA_AUGMENTATION'],
                                         cv=False,
                                         mode='evaluate')

    # create dataloader
    test_loader = DataLoader(test_dataset,
                             batch_size=CONFIG['BATCH_SIZE'],
                             num_workers=CONFIG['NUM_WORKERS'],
                             shuffle=False)

    # define model
    model_name = CONFIG['MODEL_NAME']
    model = EfficientNet.from_pretrained(model_name)
    feature = model._fc.in_features
    model._fc = nn.Linear(in_features=feature, out_features=1, bias=True)

    # multi-GPU setting
    torch.cuda.set_device(CONFIG['GPU_NUM'][0])
    model = torch.nn.DataParallel(
        model, device_ids=CONFIG['GPU_NUM']).to(device=torch.device('cuda'))

    # load pretrained weights
    if CONFIG['PRETRAINED_PATH']:
        state_dict = torch.load(CONFIG['PRETRAINED_PATH'])
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if 'module' not in k:
                k = 'module.' + k
            else:
                k = k.replace('features.module.', 'module.features.')
            new_state_dict[k] = v
        model.load_state_dict(new_state_dict)

    # evaluate
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0
    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda()
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda()
    logit_pred_y = []
    logit_targ_y = []

    for test_data in test_loader:
        X, y = test_data
        X, y = X.cuda(), y.cuda()

        y_pred = model(X)

        y_pred_classified = y_pred.view(-1).clone()
        for i in range(len(y_pred)):
            y_pred_classified[i] = classify(y_pred[i])

        all_pred = torch.cat((all_pred, y_pred_classified.to(torch.int64)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))

        total += y.size(0)
        correct += accuracy(y_pred.cpu(), y.cpu().float()) * y.size(0)

        logit_pred_y += list(y_pred.view(-1).cpu().numpy())
        logit_targ_y += list(y.cpu().float().numpy())

    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())

    ks_dataframe = pd.DataFrame({'pred': logit_pred_y, 'targ': logit_targ_y})
    ks_dataframe.to_csv(os.path.join(CONFIG['SAVE_PATH'],
                                     model_name + '_eval_results.csv'),
                        index=False, sep=',')

    print('==============================')
    print('Test acc: {}'.format(acc))
    print('Confusion Matrix:\n{}'.format(c_matrix))
    print('quadratic kappa: {}'.format(kappa))
    print('==============================')
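# ---------------------------------------------------------------------------
# Hedged usage sketch for evaluate(): the keys below are the ones the
# function actually reads; every value (paths, input size, model variant,
# GPU ids, checkpoint) is a hypothetical placeholder, not a setting from the
# original project.
# ---------------------------------------------------------------------------
def _example_evaluate():
    """Illustrative only: call evaluate() with a minimal, assumed CONFIG."""
    example_config = {
        'SAVE_PATH': './results',                  # where the prediction CSV is written
        'DATA_PATH': './data/eval',                # root of the evaluation images
        'INPUT_SIZE': 512,                         # square size fed to the network
        'DATA_AUGMENTATION': None,                 # evaluation typically runs without augmentation
        'BATCH_SIZE': 16,
        'NUM_WORKERS': 4,
        'MODEL_NAME': 'efficientnet-b0',           # any variant accepted by EfficientNet.from_pretrained
        'GPU_NUM': [0],                            # device ids passed to DataParallel
        'PRETRAINED_PATH': './weights/model.pth',  # fine-tuned checkpoint; a falsy value skips loading
    }
    evaluate(example_config)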
def run():
    stats = {}
    stats['runtime_second'] = time.time()
    startTime = time.time()

    # Initialize using the same seed (to get stable results on comparisons)
    np.random.seed(RP['seed'])

    rawData = db.getData()

    # filter infs and nans from data cols
    cols = rawData.columns.tolist()[1:-1]
    print(cols)
    for col in cols:
        rawData = rawData.drop(rawData[np.isinf(rawData[col])].index)
        rawData = rawData.drop(rawData[np.isnan(rawData[col])].index)

    rawData.reset_index(drop=True, inplace=True)
    # shuffle rows; reindex returns a new frame, so keep the result and restore a 0..n-1 index
    rawData = rawData.reindex(np.random.permutation(rawData.index)).reset_index(drop=True)
    # print(rawData)

    X_raw = rawData.iloc[:, 2:-1]
    y_raw = rawData.iloc[:, 1:2]

    scalerX = preprocessing.StandardScaler(copy=False)
    scalerX.fit(X_raw)
    scalery = preprocessing.StandardScaler(copy=False)
    scalery.fit(y_raw)

    if RP['zscore_norm']:
        X = pd.DataFrame(scalerX.transform(X_raw), columns=X_raw.columns.values)
        y = pd.DataFrame(scalery.transform(y_raw), columns=y_raw.columns.values)
    else:
        X = X_raw
        y = y_raw

    # print(X.head(), y.head())

    model = Sequential()

    # hidden
    model.add(Dense(300,
                    W_regularizer=l2(0.0),
                    activity_regularizer=activity_l2(0.0),
                    input_shape=(X.shape[1], )))
    model.add(Activation('relu'))
    model.add(Dropout(0.300))
    model.add(Dense(300,
                    W_regularizer=l2(0.0),
                    activity_regularizer=activity_l2(0.0)))
    model.add(Activation('relu'))
    model.add(Dropout(0.200))
    model.add(Dense(1))

    model.compile(loss='mse', optimizer=OPTIMIZER)

    if RD['use_test_flags']:
        maskTrain = np.zeros(len(X), dtype=bool)
        maskTest = np.zeros(len(X), dtype=bool)
        for i in range(len(X)):
            maskTrain[i] = rawData[RD['testing']][i] == 0
            maskTest[i] = rawData[RD['testing']][i] == 1
        trainX = X.loc[maskTrain]
        testX = X.loc[maskTest]
        trainy = y.loc[maskTrain]
        testy = y.loc[maskTest]
    else:
        ratio = 0.8
        split = int(X.shape[0] * ratio)
        trainX, testX = X.iloc[:split], X.iloc[split:]
        trainy, testy = y.iloc[:split], y.iloc[split:]

    trainX.reset_index(drop=True, inplace=True)
    testX.reset_index(drop=True, inplace=True)
    trainy.reset_index(drop=True, inplace=True)
    testy.reset_index(drop=True, inplace=True)

    stats['training_row_count'] = len(trainX)
    stats['testing_row_count'] = len(testX)

    print(trainX.shape, testX.shape, trainy.shape, testy.shape)

    early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
    history = model.fit(trainX.values, trainy.values,
                        nb_epoch=RP['epochs'],
                        batch_size=RP['batch'],
                        callbacks=[early],
                        validation_data=(testX.values, testy.values))

    preprocessMeta = {'scaler': scalery}

    # compute metrics for the model based on the task for both testing and training data
    print('\nGetting metrics for training data:')
    if RP['classify']:
        trainMetrics = metrics.classify(model, trainX.values, trainy.values, preprocessMeta)
    else:
        trainMetrics = metrics.predict(model, trainX.values, trainy.values, preprocessMeta)

    print('\nGetting metrics for test data:')
    if RP['classify']:
        testMetrics = metrics.classify(model, testX.values, testy.values, preprocessMeta)
    else:
        testMetrics = metrics.predict(model, testX.values, testy.values, preprocessMeta)

    print('Plot:')
    values = np.zeros((len(history.history['loss']), 2))
    for i in range(len(history.history['loss'])):
        values[i][0] = history.history['loss'][i]
        values[i][1] = history.history['val_loss'][i]
    utility.plotLoss(values)

    print('Dump csv pred')
    pred = model.predict(testX.values, batch_size=RP['batch'])
    if RP['zscore_norm']:
        predScaled = pd.DataFrame(scalery.inverse_transform(pred), columns=['pred'])
        testScaled = pd.DataFrame(scalery.inverse_transform(testy), columns=['true'])
    else:
        predScaled = pd.DataFrame(pred, columns=['pred'])
        testScaled = pd.DataFrame(testy, columns=['true'])

    predByTruth = pd.concat([predScaled, testScaled], axis=1)
    # predByTruth.plot(x='pred', y='true', kind='scatter')
    # plt.show()
    # predByTruth.to_csv('local/pred.csv')

    # statistics to send to journal
    stats['runtime_second'] = time.time() - stats['runtime_second']
    stats['memory_pm_mb'], stats['memory_vm_mb'] = utility.getMemoryUsage()
    stats['git_commit'] = utility.getGitCommitHash()
    stats['comment'] = RP['comment']
    stats['hostname'] = socket.gethostname()
    stats['experiment_config'] = yaml.dump(cc.exp, default_flow_style=False)
    stats['model'] = utility.modelToString(model)
    stats['loaded_model'] = RP['load_model']
    stats['parameter_count'] = model.count_params()
    stats['task'] = 'classification' if RP['classify'] else 'regression'
    stats['dataset_name'] = cc.exp['fetch']['table']
    stats['split_name'] = RD['testing']
    stats['label_name'] = ','.join(RD['labels'])
    stats['epoch_max'] = RP['epochs']
    stats['learning_rate'] = RP['learning_rate']
    stats['optimization_method'] = OPTIMIZER.__class__.__name__
    stats['batch_size'] = RP['batch']
    stats['seed'] = RP['seed']
    stats['objective'] = RP['objective']
    stats['learning_curve'] = {
        'val': open('{}/{}'.format(cc.cfg['plots']['dir'], utility.PLOT_NAME), 'rb').read(),
        'type': 'bin'
    }

    # metric statistics to send
    metricStats = {}
    if RP['classify']:
        metricStats['relevance_training'] = trainMetrics['acc_avg']
        metricStats['relevance_training_std'] = trainMetrics['acc_std']
        metricStats['relevance_testing'] = testMetrics['acc_avg']
        metricStats['relevance_testing_std'] = testMetrics['acc_std']
        metricStats['log_loss'] = testMetrics['log_loss_avg']
        metricStats['log_loss_std'] = testMetrics['log_loss_std']
        metricStats['auc'] = testMetrics['auc_avg']
        metricStats['auc_std'] = testMetrics['auc_std']
    else:
        metricStats['relevance_training'] = trainMetrics['r2_avg']
        metricStats['relevance_training_std'] = trainMetrics['r2_std']
        metricStats['relevance_testing'] = testMetrics['r2_avg']
        metricStats['relevance_testing_std'] = testMetrics['r2_std']
        metricStats['mse'] = testMetrics['mse_avg']
        metricStats['mse_std'] = testMetrics['mse_std']

    stats.update(metricStats)
    db.sendStatistics(**stats)
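# ---------------------------------------------------------------------------
# Hedged sketch: run() above reads the module-level RP and RD dictionaries
# and the OPTIMIZER instance, which are configured elsewhere in the project.
# The EXAMPLE_* objects below only list the keys this function touches; every
# value is an illustrative placeholder, not the project's actual configuration.
# ---------------------------------------------------------------------------
EXAMPLE_RP = {
    'seed': 12345,
    'zscore_norm': True,        # standardize X and y before training
    'epochs': 100,
    'batch': 128,
    'classify': False,          # this script is a regression experiment
    'comment': 'feed-forward baseline',
    'load_model': None,
    'learning_rate': 0.001,
    'objective': 'mse',
}
EXAMPLE_RD = {
    'use_test_flags': True,     # split on a 0/1 column named by RD['testing']
    'testing': 'is_testing',
    'labels': ['label'],
}
EXAMPLE_OPTIMIZER = keras.optimizers.Adam(lr=EXAMPLE_RP['learning_rate'])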
def run(grid=None):
    stats = {}
    stats['runtime_second'] = time.time()

    # initialize using the same seed (to get stable results on comparisons)
    np.random.seed(RP['seed'])

    # grab the commit at start
    stats['git_commit'] = utility.getGitCommitHash()

    # get the training and testing datasets along with some meta info
    if RP['edge_prediction']:
        trainIn, trainLabel, testIn, testLabel, preprocessMeta = data.preprocessEdgeData(db.getData())
    else:
        trainIn, trainLabel, testIn, testLabel, preprocessMeta = data.preprocessData(db.getData())
    # trainIn, trainLabel, testIn, testLabel, preprocessMeta = data.preprocessFastaOneHotData(db.getData())

    stats['training_row_count'] = len(trainLabel)
    stats['testing_row_count'] = len(testLabel)

    # load model from file or create and train one from scratch
    if RP['load_model']:
        model = utility.loadModel(RP['load_model'])
    else:
        if RP['edge_prediction']:
            model = configureEdgeModel(trainIn[0], trainIn[1])
        elif RP['discrete_label']:
            model = configureModel(trainIn, len(trainLabel[0]))
        else:
            model = configureModel(trainIn)
        stats['epoch_count'] = train(model, trainIn, trainLabel, (testIn, testLabel))

    # persistence first
    if cc.cfg['persistence']['model']:
        name = '{}_rg_{}'.format(stats['git_commit'],
                                 ':'.join([str(x) for x in RG['ratios']]))
        # name = stats['git_commit']
        stats['persistent_model_name'] = name
        utility.saveModel(model, name)

    # compute metrics for the model based on the task for both testing and training data
    print('\nGetting metrics for training data:')
    if RP['classify']:
        if RP['discrete_label']:
            trainMetrics = metrics.discreteClassify(model, trainIn, trainLabel, preprocessMeta)
        else:
            trainMetrics = metrics.classify(model, trainIn, trainLabel, preprocessMeta)
    else:
        trainMetrics = metrics.predict(model, trainIn, trainLabel, preprocessMeta)

    print('\nGetting metrics for test data:')
    if RP['classify']:
        if RP['discrete_label']:
            testMetrics = metrics.discreteClassify(model, testIn, testLabel, preprocessMeta)
        else:
            testMetrics = metrics.classify(model, testIn, testLabel, preprocessMeta)
    else:
        testMetrics = metrics.predict(model, testIn, testLabel, preprocessMeta)

    # utilities and visualizations
    if cc.cfg['plots']['layer_activations']:
        visualization.layerActivations(model, testIn, testLabel)

    if cc.cfg['plots']['seq_output']:
        df = pd.DataFrame(cc.cfg['plots']['seq_output_seq_input'],
                          columns=[RD['fasta'] if cc.cfg['plots']['seq_output_seq_input_name'] == 'fasta' else RD['smiles']])
        visualization.visualizeSequentialOutput(model, cc.cfg['plots']['seq_output_layer_idx'], df)

    if cc.cfg['plots']['print_pred']:
        visualization.printPrediction(model, cc.cfg['plots']['print_pred_smiles'])

    if cc.cfg['plots']['print_train_test_pred']:
        visualization.printTrainTestPred(model, cc.cfg['plots']['print_train_test_pred_cnt'],
                                         trainIn, trainLabel, testIn, testLabel, preprocessMeta)

    # statistics to send to journal
    stats['runtime_second'] = time.time() - stats['runtime_second']
    stats['memory_pm_mb'], stats['memory_vm_mb'] = utility.getMemoryUsage()
    stats['comment'] = RP['comment']
    stats['hostname'] = socket.gethostname()
    stats['experiment_config'] = yaml.dump(cc.exp, default_flow_style=False)
    stats['model'] = utility.modelToString(model)
    stats['loaded_model'] = RP['load_model']
    stats['parameter_count'] = model.count_params()
    stats['task'] = 'classification' if RP['classify'] else 'regression'
    stats['dataset_name'] = cc.exp['fetch']['table']
    stats['split_name'] = RD['testing']
    stats['label_name'] = ','.join(RD['labels'])
    stats['epoch_max'] = RP['epochs']
    stats['learning_rate'] = RP['learning_rate']
    stats['optimization_method'] = OPTIMIZER.__class__.__name__
    stats['batch_size'] = RP['batch']
    stats['seed'] = RP['seed']
    stats['objective'] = RP['objective']
    stats['learning_curve'] = {
        'val': open('{}/{}'.format(cc.cfg['plots']['dir'], utility.PLOT_NAME), 'rb').read(),
        'type': 'bin'
    }

    # metric statistics to send
    metricStats = {}
    if RP['classify']:
        metricStats['relevance_training'] = trainMetrics['acc_avg']
        metricStats['relevance_training_std'] = trainMetrics['acc_std']
        metricStats['relevance_testing'] = testMetrics['acc_avg']
        metricStats['relevance_testing_std'] = testMetrics['acc_std']
        metricStats['log_loss'] = testMetrics['log_loss_avg']
        metricStats['log_loss_std'] = testMetrics['log_loss_std']
        metricStats['auc'] = testMetrics['auc_avg']
        metricStats['auc_std'] = testMetrics['auc_std']
        metricStats['auc_micro'] = testMetrics['auc_avg']
        metricStats['auc_micro_std'] = testMetrics['auc_std']
    else:
        metricStats['relevance_training'] = trainMetrics['r2_avg']
        metricStats['relevance_training_std'] = trainMetrics['r2_std']
        metricStats['relevance_testing'] = testMetrics['r2_avg']
        metricStats['relevance_testing_std'] = testMetrics['r2_std']
        metricStats['mse'] = testMetrics['mse_avg']
        metricStats['mse_std'] = testMetrics['mse_std']
        metricStats['mae'] = testMetrics['mae_avg']
        metricStats['mae_std'] = testMetrics['mae_std']

    stats.update(metricStats)
    db.sendStatistics(**stats)
    utility.freeModel(model)
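# ---------------------------------------------------------------------------
# Hedged sketch: the metricStats mapping above assumes metrics.predict()
# returns a dict with *_avg and *_std keys. The stand-in below only
# illustrates that returned shape using scikit-learn point estimates with the
# std fields fixed to 0.0; the project's real helper presumably derives the
# std from repeated evaluations, which is not reproduced here, and it also
# takes (model, inputs, labels, preprocessMeta) rather than raw arrays.
# ---------------------------------------------------------------------------
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def _predict_metrics_sketch(y_true, y_pred):
    """Return the regression-metric dict shape consumed by run()."""
    return {
        'r2_avg': r2_score(y_true, y_pred), 'r2_std': 0.0,
        'mse_avg': mean_squared_error(y_true, y_pred), 'mse_std': 0.0,
        'mae_avg': mean_absolute_error(y_true, y_pred), 'mae_std': 0.0,
    }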