def fit_clfs(chid, n_estimators, n_jobs):
    """
    Args:
        chid: which assay to use
        n_estimators: number of trees in each random forest
        n_jobs: number of parallel jobs used for fitting

    Returns:
        clfs: dictionary of fitted classifiers
        aucs: dictionary of AUCs
        balance: fraction of actives in split 1 and split 2
        df1: data in split 1
        df2: data in split 2
    """
    # Read data and calculate ECFP fingerprints.
    assay_file = f'./assays/processed/{chid}.csv'
    print(f'Reading data from: {assay_file}')
    df = pd.read_csv(assay_file)
    df['ecfp'] = ecfp(df.smiles)

    # Stratified 50/50 split.
    df1, df2 = train_test_split(df, test_size=0.5, stratify=df['label'])
    X1 = np.array(list(df1['ecfp']))
    X2 = np.array(list(df2['ecfp']))
    y1 = np.array(list(df1['label']))
    y2 = np.array(list(df2['label']))
    del df1['ecfp']
    del df2['ecfp']
    balance = (np.mean(y1), np.mean(y2))

    # Train classifiers and store them in a dictionary.
    clfs = {}
    clfs['Split1'] = RandomForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs)
    clfs['Split1'].fit(X1, y1)
    clfs['Split1_alt'] = RandomForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs)
    clfs['Split1_alt'].fit(X1, y1)
    clfs['Split2'] = RandomForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs)
    clfs['Split2'].fit(X2, y2)

    # Calculate AUCs for the clfs, each evaluated on the opposite split.
    aucs = {}
    aucs['Split1'] = calc_auc(clfs['Split1'], X2, y2)
    aucs['Split1_alt'] = calc_auc(clfs['Split1_alt'], X2, y2)
    aucs['Split2'] = calc_auc(clfs['Split2'], X1, y1)
    print("AUCs:")
    for k, v in aucs.items():
        print(f'{k}: {v}')

    return clfs, aucs, balance, df1, df2
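# A minimal usage sketch for fit_clfs, assuming the pandas/numpy/sklearn imports plus
# the project's ecfp() and calc_auc() helpers are in scope, and that
# ./assays/processed/<chid>.csv exists with 'smiles' and 'label' columns. The assay id
# below is a placeholder, not a file shipped with the repo.
if __name__ == '__main__':
    clfs, aucs, balance, df1, df2 = fit_clfs('example_assay', n_estimators=100, n_jobs=4)
    print(f'Active fraction per split: {balance[0]:.3f} / {balance[1]:.3f}')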
def train_epoch(model, optimizer, criterion, x_train, x_train_external, y_train):
    model.train()
    auc_meter, loss_meter, it_count = 0, 0, 0
    batch_size = config.batch_size
    for i in range(0, len(x_train) - batch_size + 1, batch_size):
        inputs1 = torch.tensor(x_train[i:i + batch_size],
                               dtype=torch.float, device=device)
        inputs2 = torch.tensor(x_train_external[i:i + batch_size],
                               dtype=torch.float, device=device)
        target = torch.tensor(y_train[i:i + batch_size],
                              dtype=torch.float, device=device)

        # Zero the parameter gradients, then forward / backward / update.
        optimizer.zero_grad()
        output = model(inputs1, inputs2)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        loss_meter += loss.item()
        it_count += 1
        auc_meter += utils.calc_auc(target, torch.sigmoid(output))
    return loss_meter / it_count, auc_meter / it_count
def evalute(sess: tf.Session, test_data: TrainDataIter, model: Model) -> typing.Tuple:
    loss_sum = 0.0
    accuracy_sum = 0.0
    aux_loss_sum = 0.0
    cnt = 0
    store_arr = []
    for feature, target in test_data:
        cnt += 1
        user_ids, ad_ids, code_ids, ad_his, code_his, ad_mask, lengths_xx, target = prepare_data(
            feature, target, choose_len=0)
        prob, loss, acc, aux_loss = model.calculate(sess, [
            user_ids, ad_ids, code_ids, ad_his, code_his, ad_mask, target, lengths_xx
        ])
        loss_sum += loss
        accuracy_sum += acc
        aux_loss_sum += aux_loss

        # Collect (probability, label) pairs for the positive class.
        prob_1 = prob[:, 1].tolist()
        target_1 = target[:, 1].tolist()
        for p, t in zip(prob_1, target_1):
            store_arr.append([p, t])

    all_auc, r, p, f1 = calc_auc(store_arr)
    return all_auc, r, p, f1, loss_sum / cnt, accuracy_sum / cnt, aux_loss_sum / cnt
def evaluate(sess, test_data, model):
    test_loss_sum = 0.0
    test_accuracy_sum = 0.0
    test_aux_loss_sum = 0.0
    nums = 0
    stored_arr = []
    for src, tgt in test_data:
        nums += 1
        uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(
            src, tgt, return_neg=True)
        # `temp` is assumed to be defined at module scope (e.g. a temperature parameter).
        prob, loss, acc, aux_loss = model.calculate(sess, [
            uids, mids, cats, mid_his, cat_his, mid_mask, target, sl,
            noclk_mids, noclk_cats, temp
        ])
        test_loss_sum += loss
        test_accuracy_sum += acc
        test_aux_loss_sum += aux_loss

        # Collect (probability, label) pairs for the positive class.
        prob_1 = prob[:, 0].tolist()
        target_1 = target[:, 0].tolist()
        for p, t in zip(prob_1, target_1):
            stored_arr.append([p, t])

    test_auc = utils.calc_auc(stored_arr)
    test_loss_avg = test_loss_sum / nums
    test_accuracy_avg = test_accuracy_sum / nums
    test_aux_loss_avg = test_aux_loss_sum / nums
    return test_auc, test_loss_avg, test_accuracy_avg, test_aux_loss_avg
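# In the two evaluation functions above, calc_auc consumes a list of [score, label]
# pairs rather than a fitted classifier. Below is a minimal, hypothetical sketch of
# such a pairwise-ranking AUC (named calc_auc_from_pairs to avoid clashing with the
# project's own calc_auc); it is an assumption about the behaviour, not the actual
# implementation, and it ignores tie handling.
def calc_auc_from_pairs(raw_arr):
    # Sort by predicted score, highest first.
    arr = sorted(raw_arr, key=lambda record: record[0], reverse=True)
    pos = sum(1 for _, label in arr if label == 1.0)
    neg = len(arr) - pos
    if pos == 0 or neg == 0:
        return 0.5  # degenerate case: only one class present
    tp = fp = 0.0
    prev_tp = prev_fp = 0.0
    area = 0.0
    for _, label in arr:
        if label == 1.0:
            tp += 1
        else:
            fp += 1
        # Trapezoidal step on the un-normalised ROC curve.
        area += (fp - prev_fp) * (tp + prev_tp) / 2.0
        prev_tp, prev_fp = tp, fp
    return area / (pos * neg)

# Example: calc_auc_from_pairs([[0.9, 1], [0.2, 0], [0.7, 1]]) returns 1.0.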
def val_epoch(model, criterion, x_val, x_val_external, y_val):
    model.eval()
    auc_meter, loss_meter, it_count = 0, 0, 0
    batch_size = config.batch_size
    with torch.no_grad():
        for i in range(0, len(x_val) - batch_size + 1, batch_size):
            inputs1 = torch.tensor(x_val[i:i + batch_size],
                                   dtype=torch.float, device=device)
            inputs2 = torch.tensor(x_val_external[i:i + batch_size],
                                   dtype=torch.float, device=device)
            target = torch.tensor(y_val[i:i + batch_size],
                                  dtype=torch.float, device=device)
            output = model(inputs1, inputs2)
            loss = criterion(output, target)
            loss_meter += loss.item()
            it_count += 1
            auc_meter += utils.calc_auc(target, torch.sigmoid(output))
    return loss_meter / it_count, auc_meter / it_count
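# A minimal training-loop sketch tying train_epoch and val_epoch together. It assumes
# `model`, `optimizer`, `criterion`, `config.max_epoch`, and the arrays
# (x_train, x_train_external, y_train, x_val, x_val_external, y_val) are prepared
# elsewhere; the checkpoint filename is hypothetical.
best_val_auc = 0.0
for epoch in range(config.max_epoch):
    train_loss, train_auc = train_epoch(model, optimizer, criterion,
                                        x_train, x_train_external, y_train)
    val_loss, val_auc = val_epoch(model, criterion, x_val, x_val_external, y_val)
    print(f'epoch {epoch}: train loss {train_loss:.4f} / auc {train_auc:.4f} | '
          f'val loss {val_loss:.4f} / auc {val_auc:.4f}')
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        torch.save(model.state_dict(), 'best_model.pt')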
def _eval(sess, model, test_data, label):
    ano_scores = []
    for _, batch_test_data in DataInput(test_data, test_batch_size):
        _ano_score, _, _ = model.eval(sess, batch_test_data)
        # Extend
        ano_scores += list(_ano_score)
    ano_scores = np.array(ano_scores).reshape((-1, 1))

    # Threshold the anomaly scores at a fixed percentile for precision/recall/F1.
    prec, rec, f1 = calc_metric(label, ano_scores)
    # Calculate AUPRC.
    _auprc = calc_auc(label, ano_scores)

    # Track the best scores seen so far and checkpoint on a new best AUPRC.
    global best_f1
    if best_f1 < f1:
        best_f1 = f1
    global best_auprc
    if best_auprc < _auprc:
        best_auprc = _auprc
        model.save(sess, '{}/ckpt'.format(save_path))
    return prec, rec, f1, _auprc
test_set = pickle.load(f)
x_test, y_test = test_set
print('test set', x_test.shape)

with tf.Session() as sess:
    model = BiWGAN(input_dim, method, weight, degree)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    model.restore(sess, '{}/ckpt'.format(save_path))

    ano_scores = []
    for _, batch_test_data in DataInput(x_test, test_batch_size):
        _ano_score, _, _ = model.eval(sess, batch_test_data)
        # Extend
        ano_scores += list(_ano_score)
    ano_scores = np.array(ano_scores).reshape((-1, 1))

    # Scores above the 80th percentile are flagged as anomalous.
    prec, rec, f1 = calc_metric(y_test, ano_scores, percentile=80)
    # Calculate AUPRC.
    auprc = calc_auc(y_test, ano_scores)
    print('Prec:{:.4f} | Rec:{:.4f} | F1:{:.4f} | AUPRC:{:.4f}'.format(
        prec, rec, f1, auprc))
    # Draw PRC curve.
    # draw_prc(y_test, ano_scores)
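# One plausible reading of calc_metric(y_test, ano_scores, percentile=80): scores at or
# above the 80th percentile are flagged as anomalies and compared against the labels.
# This is only a sketch under that assumption, not the project's implementation.
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

def calc_metric_sketch(labels, scores, percentile=80):
    threshold = np.percentile(scores, percentile)
    preds = (np.ravel(scores) >= threshold).astype(int)
    labels = np.ravel(labels).astype(int)
    return (precision_score(labels, preds),
            recall_score(labels, preds),
            f1_score(labels, preds))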
        # (Excerpt from a larger cross-validation loop; the leading `if` branch is omitted.)
        fp, tp = parsers.eval_hof(
            [gp.compile(i, evo.pset) for i in hof], X[test], y[test])
    elif (cond[0] == 'rf') or (cond[0] == 'svm'):
        fp, tp = classifier.eval(X[train], X[test], y[train], y[test],
                                 clf=cond[0], seed=seed)
    tprs.append(tp)
    fprs.append(fp)

auc_scores[r, :] = utils.calc_auc(fprs, tprs, figure, plot_roc=True)
plt.savefig(r"./results/images/TrimmedDataset/" + method[n] + "-AUC"
            + "_reps" + str(r))
figure += 1
print('-' * 75)

utils.csv_save(method[n], auc_scores)
print(feat)
print('Done')
# plt.show()  # show figures