def test_mannwhithney(predfile1, predfile2, testfile, testfile2):
    y_true1, y_pred1, y_true_prec1, y_pred_prec1 = evaluate(testfile, predfile1)
    y_true2, y_pred2, y_true_prec2, y_pred_prec2 = evaluate(testfile2, predfile2)
    print('\n First model: ', predfile1)
    print('Ex: ', y_pred1[:10], ' Len: ', len(y_pred1))
    print('Second model: ', predfile2)
    print('Ex: ', y_pred2[:10], ' Len: ', len(y_pred2))
    # number of positions where the two test sets disagree
    print('Is testset the same? ',
          len([i for i in np.equal(np.array(y_true1), np.array(y_true2)) if not i]))
    mc_tb = mcnemar_table(y_target=np.array(y_true1),
                          y_model1=np.array(y_pred1),
                          y_model2=np.array(y_pred2))
    print('Contingency table: ', mc_tb)
    mcnemar_res = mcnemar(mc_tb)
    print('McNemar: p value: {:.20f}'.format(mcnemar_res.pvalue))
    chi2, p = mlx_mcnemar(ary=mc_tb, corrected=True)
    print('McNemar: chi:{:.4f} p value: {}'.format(chi2, p))
    mc_tb_prec = mcnemar_table(y_target=np.array(y_true_prec1),
                               y_model1=np.array(y_pred_prec1),
                               y_model2=np.array(y_pred_prec2))
    mcnemar_res_prec = mcnemar(mc_tb_prec)
    print('McNemar PRECISION: p value: {}'.format(mcnemar_res_prec.pvalue))
def statistics():
    # RUN combined_classify_to_csv() FIRST!

    # Get normpneum inceptionv3 model predictions
    csv_path = normpneum_bin_file_dir + '_incv3.csv'
    normpneum_res = pd.read_csv(csv_path, header=None).to_numpy()
    normpneum_incv3_class_preds = normpneum_res[:, 3]

    # Get normpneum resnetv2 model predictions
    csv_path = normpneum_bin_file_dir + '_resnetv2.csv'
    normpneum_res = pd.read_csv(csv_path, header=None).to_numpy()
    normpneum_resnetv2_class_preds = normpneum_res[:, 3]

    # Get the test labels
    normpneum_test = np.argmax(normpneum_test_labels, axis=-1)

    # Contingency Table
    tb = mcnemar_table(y_target=normpneum_test,
                       y_model1=normpneum_incv3_class_preds,
                       y_model2=normpneum_resnetv2_class_preds)
    print(tb)

    # McNemar's test
    chi2, p = mcnemar(ary=tb, corrected=True)
    print('chi-squared:', chi2)
    print('p-value:', p)

    accuracy_normpneum_incv3 = accuracy_score(normpneum_test,
                                              normpneum_incv3_class_preds)
    accuracy_normpneum_resnetv2 = accuracy_score(normpneum_test,
                                                 normpneum_resnetv2_class_preds)
    print(f"Test accuracy normpneum incv3: {accuracy_normpneum_incv3}")
    print(f"Test accuracy normpneum resnetv2: {accuracy_normpneum_resnetv2}")
def test_compare_to_mcnemar_on_2_models():
    # 100 samples: ground truth is all zeros; model 1 misclassifies 16 of them,
    # model 2 misclassifies 8 of them.
    y_true = np.array([0] * 100)
    ym1 = np.array([1] * 16 + [0] * 84)
    ym2 = np.array([1] * 6 + [0] * 14 + [1] * 2 + [0] * 78)

    q, p = cochrans_q(y_true, ym1, ym2)
    mcn_q, mcn_p = mcnemar(mcnemar_table(y_true, ym1, ym2),
                           corrected=False,
                           exact=False)
    assert q == mcn_q
    assert p == mcn_p
def run_mcnemar_test(y_test, model_1_class_predictions, model_2_class_predictions,
                     model_1_name, model_2_name):
    """
    Runs the McNemar test to determine if there is a statistically significant
    difference in the class predictions. Writes the results and associated
    contingency table locally.

    :param y_test: y_test series
    :param model_1_class_predictions: class predictions from model 1
    :param model_2_class_predictions: class predictions from model 2
    :param model_1_name: name of the first model
    :param model_2_name: name of the second model
    """
    results_table = mcnemar_table(y_target=y_test,
                                  y_model1=model_1_class_predictions,
                                  y_model2=model_2_class_predictions)
    chi2, p = mcnemar(ary=results_table, corrected=True)
    pd.DataFrame({'chi2': [chi2], 'p': [p]}).to_csv(
        os.path.join(f'{model_1_name}_{model_2_name}_mcnemar_test.csv'))
    board = checkerboard_plot(
        results_table,
        figsize=(6, 6),
        fmt='%d',
        col_labels=[f'{model_2_name} wrong', f'{model_2_name} right'],
        row_labels=[f'{model_1_name} wrong', f'{model_1_name} right'])
    plt.tight_layout()
    plt.savefig(os.path.join('modeling', 'comparison_files',
                             f'{model_1_name}_{model_2_name}_mcnemar_test.png'))
    plt.clf()
def test_input_binary_all_right():
    y_target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    y_model1 = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    y_model2 = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    tb = mcnemar_table(y_target=y_target,
                       y_model1=y_model1,
                       y_model2=y_model2)
    expect = np.array([[8, 0], [0, 0]])
    np.testing.assert_array_equal(tb, expect)
def summarize_feature_comparisons(
    base_clf: BaseEstimator, comparison_clfs: Dict[str, BaseEstimator], X_test, y_test
):
    from mlxtend.evaluate import mcnemar, cochrans_q, mcnemar_table

    summary_dict = collections.OrderedDict()
    mcnemar_tbs = dict()

    # create list of predicted values
    base_y_predict = base_clf.predict(X_test)
    y_predictions = [base_y_predict]
    for idx, (name, clf) in enumerate(comparison_clfs.items()):
        # get the probability
        y_predict_proba = clf.predict_proba(X_test)
        y_predict = clf.predict(X_test)

        # form mcnemar tables against base classifier
        # (mcnemar_table returns a numpy array, so store it directly)
        tb = mcnemar_table(y_test, base_y_predict, y_predict)
        mcnemar_tbs[f"base vs {name}"] = tb

        # store predictions per classifier
        y_predictions.append(y_predict)

    # first run cochrans Q test
    qstat, pval = cochrans_q(y_test, *y_predictions)
    summary_dict["cochrans_q"] = qstat
    summary_dict["cochrans_q_pval"] = pval

    # run mcnemars test against all the predictions
    for name, table in mcnemar_tbs.items():
        chi2stat, pval = mcnemar(table, exact=True)
        summary_dict[f"mcnemar_{name}_chi2stat"] = chi2stat
        summary_dict[f"mcnemar_{name}_pval"] = pval

    return summary_dict
def test_input_binary():
    y_target = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    y_model1 = np.array([0, 1, 0, 0, 0, 1, 1, 0, 0, 0])
    y_model2 = np.array([0, 0, 1, 1, 0, 1, 1, 0, 0, 0])
    tb = mcnemar_table(y_target=y_target,
                       y_model1=y_model1,
                       y_model2=y_model2)
    expect = np.array([[4, 1], [2, 3]])
    np.testing.assert_array_equal(tb, expect)
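# Note added for clarity on the expected table above: the diagonal holds the
# concordant counts (4 samples that both models classify correctly, 3 that both
# get wrong), while the two off-diagonal cells hold the discordant pairs (1 and 2).
# McNemar's statistic is computed from the discordant cells only, so the
# concordant counts never change the test result.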
def test_input_nonbinary():
    y_target = np.array([0, 0, 0, 0, 0, 2, 1, 1, 1, 1])
    y_model1 = np.array([0, 5, 0, 0, 0, 2, 1, 0, 0, 0])
    y_model2 = np.array([0, 0, 1, 3, 0, 2, 1, 0, 0, 0])
    tb = mcnemar_table(y_target=y_target,
                       y_model1=y_model1,
                       y_model2=y_model2)
    expect = np.array([[4, 1], [2, 3]])
    np.testing.assert_array_equal(tb, expect)
def mcNemar(target, model1, model2):
    y_target = np.array(target)
    # Class labels predicted by model 1
    y_model1 = np.array(model1)
    # Class labels predicted by model 2
    y_model2 = np.array(model2)
    tb = mcnemar_table(y_target=y_target,
                       y_model1=y_model1,
                       y_model2=y_model2)
    # print(tb)
    return tb
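# A brief usage sketch (the toy labels are assumed, not from the source): the
# contingency table returned by mcNemar() above is typically passed straight to
# mlxtend's mcnemar() to obtain the test statistic and p-value.
from mlxtend.evaluate import mcnemar

toy_true = [0, 0, 1, 1, 0, 1, 1, 0]
toy_m1 = [0, 1, 1, 1, 0, 1, 0, 0]
toy_m2 = [0, 0, 1, 0, 1, 1, 1, 0]
toy_tb = mcNemar(toy_true, toy_m1, toy_m2)
chi2, p = mcnemar(ary=toy_tb, corrected=True)
print('chi-squared:', chi2, 'p-value:', p)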
def svm_p_value(trainData, testData, input_pred):
    svc = LinearSVC(max_iter=10000, verbose=50, C=0.1)
    train_x = np.array(list(trainData['player_array']))
    train_y = np.array(list(trainData['win']))
    test_x = np.array(list(testData['player_array']))
    test_y = np.array(list(testData['win']))
    svc.fit(train_x, train_y)
    test_pred = svc.predict(test_x)
    tb = mcnemar_table(y_target=test_y, y_model1=input_pred, y_model2=test_pred)
    chi2, p = mcnemar(ary=tb, corrected=True)
    return p
def mlp_p_value(trainData, testData, input_pred):
    ann = MLPClassifier(verbose=True, max_iter=500, tol=0.0005, solver='adam',
                        alpha=0.0001, activation='logistic',
                        hidden_layer_sizes=(50, 40))
    train_x = np.array(list(trainData['player_array']))
    train_y = np.array(list(trainData['win']))
    test_x = np.array(list(testData['player_array']))
    test_y = np.array(list(testData['win']))
    ann.fit(train_x, train_y)
    test_pred = ann.predict(test_x)
    tb = mcnemar_table(y_target=test_y, y_model1=input_pred, y_model2=test_pred)
    chi2, p = mcnemar(ary=tb, corrected=True)
    return p
def log_p_value(trainData, testData, input_pred):
    log = LogR(max_iter=500, solver='newton-cg', C=0.1)
    train_x = np.array(list(trainData['player_array']))
    train_y = np.array(list(trainData['win']))
    test_x = np.array(list(testData['player_array']))
    test_y = np.array(list(testData['win']))
    log.fit(train_x, train_y)
    test_pred = log.predict(test_x)
    tb = mcnemar_table(y_target=test_y, y_model1=input_pred, y_model2=test_pred)
    chi2, p = mcnemar(ary=tb, corrected=True)
    return p
def mcnemar_test(target, model_1_pred, model_2_pred):
    """
    Calculates the p-value of the McNemar test.

    It builds a contingency table and uses that to calculate the p-value.

    :param target: a numpy array that has the actual target values
    :param model_1_pred: a numpy array that contains values based on prediction of model 1
    :param model_2_pred: a numpy array that contains values based on prediction of model 2
    :return p_value: the probability calculated under the chi-squared distribution
    """
    mc_table = mcnemar_table(y_target=target,
                             y_model1=model_1_pred,
                             y_model2=model_2_pred)
    n = mc_table[0, 1] + mc_table[1, 0]
    # If the sum b + c of discordant pairs is less than 25, we should use the
    # exact binomial distribution instead of the chi-squared approximation.
    # Check https://en.wikipedia.org/wiki/McNemar%27s_test
    binomial = n < 25
    _, p_value = mcnemar(ary=mc_table, exact=binomial)
    return p_value
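# A minimal sketch (assumed toy data, not from the source) exercising the
# exact-binomial branch above: with only three discordant pairs, b + c < 25,
# so mcnemar_test() falls back to the exact test.
import numpy as np

toy_target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
toy_pred_a = np.array([0, 0, 1, 0, 1, 1, 1, 0])
toy_pred_b = np.array([0, 1, 0, 0, 1, 1, 1, 1])
print('exact McNemar p-value:', mcnemar_test(toy_target, toy_pred_a, toy_pred_b))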
def main(file_1, file_2):
    a, b, c, d = 0, 0, 0, 0
    nb_lines = 0
    y_ground, y_1, y_2 = [], [], []
    with open(file_1) as f1, open(file_2) as f2:
        for line_1, line_2 in zip(f1, f2):
            ground_1, pred_1 = map(int, line_1.strip().split()[1:])
            ground_2, pred_2 = map(int, line_2.strip().split()[1:])
            if ground_1 != ground_2:
                logger.error('Files do not belong to the same dataset')
                sys.exit(1)
            y_ground.append(ground_1)
            y_1.append(pred_1)
            y_2.append(pred_2)
            if pred_1 == ground_1:
                if pred_2 == ground_1:
                    a += 1
                else:
                    b += 1
            else:
                if pred_2 == ground_1:
                    c += 1
                else:
                    d += 1
            nb_lines += 1
    logger.info('Loaded {} lines..'.format(nb_lines))
    logger.info('| {} | {} |'.format(a, b))
    logger.info('| {} | {} |'.format(c, d))
    y_ground = np.array(y_ground)
    y_1 = np.array(y_1)
    y_2 = np.array(y_2)
    tb = mcnemar_table(y_target=y_ground, y_model1=y_1, y_model2=y_2)
    logger.info('\n {}'.format(tb))
    chi2, p = mcnemar(ary=tb, corrected=True)
    logger.info('chi-squared: {}'.format(chi2))
    logger.info('p-value: {}'.format(p))
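# A small usage sketch (hypothetical file names and labels, assuming the
# module-level logger and imports used by main() above are in place): each
# input line holds an identifier followed by the gold label and the predicted
# label, which is what line.strip().split()[1:] parses.
if __name__ == '__main__':
    with open('preds_model_a.txt', 'w') as fa, open('preds_model_b.txt', 'w') as fb:
        fa.write('ex1 1 1\nex2 0 1\nex3 1 0\nex4 0 0\n')
        fb.write('ex1 1 1\nex2 0 0\nex3 1 1\nex4 0 1\n')
    main('preds_model_a.txt', 'preds_model_b.txt')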
def stat_test(df, classifier1, classifier2):
    x = df['Cleaned'].values
    y = df['Class'].values

    # split dataset into training and test sets, with 80:20 split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1000, stratify=y)

    # vectorizer for first classifier
    # vectorizer = CountVectorizer()
    vectorizer = TfidfVectorizer()
    vectorizer.fit(x_train)
    X_test = vectorizer.transform(x_test)
    y_pred_1 = classifier1.predict(X_test)

    # vectorizer for second classifier
    # vectorizer = CountVectorizer()
    vectorizer = TfidfVectorizer()
    vectorizer.fit(x_train)
    X_test = vectorizer.transform(x_test)
    y_pred_2 = classifier2.predict(X_test)

    contingency_table = mcnemar_table(y_target=y_test,
                                      y_model1=y_pred_1,
                                      y_model2=y_pred_2)
    print(contingency_table)
    chi2, p_val = mcnemar(ary=contingency_table, corrected=True)
    print('chi-squared:', chi2)
    print('p-value:', p_val)
def compute_stat_sig(systems_data, measure):
    significance = defaultdict(list)
    for system in ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota"]:
        for other_system in ["sota", "human"]:
            if system == other_system:
                continue
            sys_data = [x[measure] for x in systems_data[system]]
            other_sys_data = [x[measure] for x in systems_data[other_system]]
            true_data = [1] * len(sys_data)
            tb_b = mcnemar_table(y_target=np.array(true_data),
                                 y_model1=np.array(sys_data),
                                 y_model2=np.array(other_sys_data))
            chi2, p_value = mcnemar(ary=tb_b, corrected=True)
            print(tb_b)
            print(f"mcnemar {system},{other_system}: chi2: {chi2}, p-value {p_value}")
            if 0 <= p_value <= 0.05:
                significance[system].append(other_system[0])
        significance[system] = ",".join(significance[system])
    return significance
                   df_result['bi_F1'].to_numpy(), df_result['unibi_F1'].to_numpy())
print("ANOVA F1 : %0.5f, %0.5f" % result)

# Cochran's Q analysis
y_uni = sr_uni.to_numpy()
y_bi = sr_bi.to_numpy()
y_unibi = sr_unibi.to_numpy()
q, p_value = cochrans_q(y, y_uni, y_bi, y_unibi)
print("COCHRAN Q-Test: q: %0.5f, p_value: %0.5f" % (q, p_value))

l_grams = ['uni', 'bi', 'unibi']
l_rslt = [y_uni, y_bi, y_unibi]
l_pair = list(zip(l_grams, l_rslt))
l_mcnemar_rslt = []
for i, t0 in enumerate(l_pair):
    for j, t1 in enumerate(l_pair[i + 1:]):
        k0 = t0[0]
        k1 = t1[0]
        v0 = t0[1]
        v1 = t1[1]
        tb = mcnemar_table(y_target=y, y_model1=v0, y_model2=v1)
        chi2, p = mcnemar(ary=tb, corrected=True)
        l_mcnemar_rslt.append("{chi2:.5f}".format(chi2=chi2))
        l_mcnemar_rslt.append("{p:.5f}".format(p=p))
        print("McNemar %s v %s: chi2 : %0.5f, p_value: %0.5f" % (k0, k1, chi2, p))
print(" ".join(l_mcnemar_rslt))
def train(): data_x1, data_x2, data_y = load_tensors() sizedata = len(data_x1) print("Data of size:", sizedata) print("Data 2 of size:", len(data_x2)) # Split dataset into 5 sub-datasets splitted_x1 = list(split(data_x1, 5)) splitted_x2 = list(split(data_x2, 5)) splitted_y = list(split(data_y, 5)) print("Available GPU :", torch.cuda.is_available()) torch.cuda.set_device(0) k = ARGS.kFold # Prepare array of scores precision_list = [] recall_list = [] # valloss_list = [] AUC_list = [] for ind_i in range(0, k): # Prepare X_train Y_train X_test Y_test X_test1 = splitted_x1[ind_i] X_test2 = splitted_x2[ind_i] Y_test = splitted_y[ind_i] # Deep copy, otherwise iteration problem copysplitX1 = list(splitted_x1) copysplitX2 = list(splitted_x2) copysplitY = list(splitted_y) del copysplitX1[ind_i] del copysplitX2[ind_i] del copysplitY[ind_i] X_train1 = copysplitX1 # CUI + CCS X_train2 = copysplitX2 # CUI only Y_train = copysplitY modelCUI = Network(0).cuda() modelCCS = Network(1).cuda() # XAVIER Init modelCUI.apply(init_weights) modelCCS.apply(init_weights) with torch.cuda.device(0): # Hyperparameters : epochs = ARGS.nEpochs batchsize = ARGS.batchSize learning_rate = ARGS.lr log_interval = 2 criterion = nn.BCEWithLogitsLoss() # criterion = nn.BCELoss() # criterion = nn.CrossEntropyLoss() optimizer1 = optim.SGD(modelCUI.parameters(), lr=learning_rate) optimizer2 = optim.SGD(modelCCS.parameters(), lr=learning_rate) # optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Train loader numplist = np.array(X_train2) arrX = np.concatenate(numplist).tolist() tensor_x = torch.Tensor(arrX).cuda() numplist = np.array(Y_train) arrY = np.concatenate(numplist).tolist() tensor_y = torch.Tensor(arrY).cuda() print("Shape X:", np.shape(arrX), "Shape Y:", np.shape(arrY)) dataset = dt.TensorDataset(tensor_x, tensor_y) # create your dataset train_loader1 = dt.DataLoader(dataset, batch_size=batchsize, shuffle=True) numplist = np.array(X_train1) arrX = np.concatenate(numplist).tolist() tensor_x = torch.Tensor(arrX).cuda() dataset = dt.TensorDataset(tensor_x, tensor_y) train_loader2 = dt.DataLoader(dataset, batch_size=batchsize, shuffle=True) # Test loader tensor_x = torch.Tensor( np.array(X_test2).tolist()).cuda() # transform to torch tensor tensor_y = torch.Tensor(np.array(Y_test).tolist()).cuda() dataset = dt.TensorDataset(tensor_x, tensor_y) # create your dataset test_loader1 = dt.DataLoader(dataset, batch_size=batchsize, shuffle=False) tensor_x = torch.Tensor(np.array(X_test1).tolist()).cuda() dataset = dt.TensorDataset(tensor_x, tensor_y) test_loader2 = dt.DataLoader(dataset, batch_size=batchsize, shuffle=False) # Training model CUI print("Training CUI model...") for epoch in range(epochs): for batch_idx, (data, target) in enumerate(train_loader1): data, target = Variable(data), Variable(target) optimizer1.zero_grad() net_out = modelCUI(data) loss = criterion(net_out, target) loss.backward() optimizer1.step() # if batch_idx % log_interval == 0: # print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: '.format( # epoch, batch_idx * len(data), len(train_loader1.dataset), # 100. 
* batch_idx / len(train_loader1))) # print(loss.data) print("Training CUI+CCS model...") # Training model CUI+CCS for epoch in range(epochs): for batch_idx, (data, target) in enumerate(train_loader2): data, target = Variable(data), Variable(target) optimizer2.zero_grad() net_out = modelCCS(data) loss = criterion(net_out, target) loss.backward() optimizer2.step() # if batch_idx % log_interval == 0: # print('Train Epoch: {} [{}/{} ({:.0f}%)]\t Loss: '.format( # epoch, batch_idx * len(data), len(train_loader2.dataset), # 100. * batch_idx / len(train_loader2))) # print(loss.data) # Testing and save score total = 0 correct = 0 modelCUI.eval() modelCCS.eval() P = list() R = list() test_loader_list = list([test_loader1, test_loader2]) model_list = list([modelCUI, modelCCS]) nemarlist = list([np.array([]), np.array([])]) # Precisions for model, test_loader in zip(model_list, test_loader_list): for i in range(1, 4): for data in test_loader: x, labels = data outputs = model(Variable(x)).detach( ) # output is a tensor of size [BATCHSIZE][ARGS.numberOfOutputCodes] _, predicted = torch.topk(outputs.data, i) for y_predlist, y in zip(predicted, labels): for y_pred in y_predlist: total += 1 if y[y_pred] == 1: correct += 1 precision = correct / total P.append(precision) correct = 0 total = 0 for model, test_loader, mcnemar_idx in zip(model_list, test_loader_list, list([0, 1])): # Number of diagnostic for each sample (mean of 12 codes, max of 30 codes, R@10 - R@20 - R@30 seems appropriate) total_true_list = list() for data in test_loader: x, labels = data for y in labels: total_true = 0 for val in y: if val == 1: total_true += 1 total_true_list.append(total_true) # Recalls for i in range(10, 40, 10): total_true_list_cpy = list(total_true_list) for data in test_loader: x, labels = data outputs = model(Variable(x)).detach() _, predicted = torch.topk(outputs.data, i) for y_predlist, y in zip(predicted, labels): total += total_true_list_cpy.pop(0) for y_pred in y_predlist: if y[y_pred] == 1: correct += 1 if i == 30: nemarlist[mcnemar_idx] = np.append( nemarlist[mcnemar_idx], 1) else: if i == 30: if correct < total: nemarlist[mcnemar_idx] = np.append( nemarlist[mcnemar_idx], 0) else: nemarlist[mcnemar_idx] = np.append( nemarlist[mcnemar_idx], 1) # Else, there's no more diagnoses to be found, so we will not consider it as wrong recall = correct / total R.append(recall) correct = 0 total = 0 precision_list.append(P) recall_list.append(R) # AUROC YTRUE = None YPROBA = None for data in test_loader: x, labels = data x, labels = Variable(x), Variable(labels) outputs = model(x).detach().cpu().numpy() labels = labels.detach().cpu().numpy() for batch_true, batch_prob in zip(labels, outputs): YTRUE = np.concatenate( (YTRUE, [batch_true]), axis=0) if YTRUE is not None else [batch_true] YPROBA = np.concatenate( (YPROBA, [batch_prob]), axis=0) if YPROBA is not None else [batch_prob] ROC_avg_score = roc(YTRUE, YPROBA, average='micro', multi_class='ovr') AUC_list.append(ROC_avg_score) # McNemar test nemar_true = np.ones(nemarlist[0].size) nemar_m1 = nemarlist[0] nemar_m2 = nemarlist[1] tb = mcnemar_table(y_target=nemar_true, y_model1=nemar_m1, y_model2=nemar_m2) # print("Matrix: ", tb) chi2, p = mcnemar(ary=tb, corrected=True) # print('chi-squared:', chi2) # print('p-value:', p) filesave = open("McNemar_report.txt", "a") filesave.write("\nMatrix: ") filesave.write(str(tb)) filesave.write("\np-value and chi-squared:") filesave.write(str(p)) filesave.write(" ") filesave.write(str(chi2)) filesave.close() # Output score of each 
fold + average print("Scores for each fold:") print("Precision:", precision_list) print("Recall:", recall_list) print("AUROC:", AUC_list)
predictions2 = "./results/2_single_COBRE/single/predictions.npz"
# ---------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------
pred1_file = np.load(predictions1)
prediction_1 = pred1_file["y_predictions"]
pred1 = prediction_1[0]
for i in range(1, len(prediction_1)):
    pred1 = np.hstack((pred1, prediction_1[i]))

y_true_list = pred1_file["y_true"]
y_true = y_true_list[0]
for i in range(1, len(y_true_list)):
    y_true = np.hstack((y_true, y_true_list[i]))

pred2_file = np.load(predictions2)
prediction_2 = pred2_file["y_predictions"]
pred2 = prediction_2[0]
for i in range(1, len(prediction_2)):
    pred2 = np.hstack((pred2, prediction_2[i]))

tb = mcnemar_table(y_target=y_true, y_model1=pred1, y_model2=pred2)
chi2, p = mcnemar(ary=tb, corrected=True)
print('chi-squared:', chi2)
print('p-value:', p)
# Predict Output
y_pred_SVM = SVM.predict(data_test[:, :-1])

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score: ",
      accuracy_score(y_pred_SVM, data_test[:, 4]) * 100)

# Comparing Classifiers - McNemar test
# pip install mlxtend
from mlxtend.evaluate import mcnemar_table

tb = mcnemar_table(y_target=data_test[:, 4],
                   y_model1=y_pred_SVM,
                   y_model2=y_pred_GNB)
print(tb)
chi_GNBC_SVM = ((abs(tb[0, 1] - tb[1, 0]) - 1)**2) / (tb[0, 1] + tb[1, 0])
print(chi_GNBC_SVM)

# Comparing Classifiers - Approximate normal test
# SVM test
from sklearn.metrics import confusion_matrix

Conf_matrix_SVM = confusion_matrix(data_test[:, 4], y_pred_SVM)
print(Conf_matrix_SVM)
X = (Conf_matrix_SVM[0, 1] + Conf_matrix_SVM[1, 0])
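# A small cross-check sketch (the 2x2 table below is an assumed toy example, not
# from the source): the hand-rolled continuity-corrected statistic used above,
# ((|b - c| - 1)^2) / (b + c), should agree with mlxtend's mcnemar() when
# corrected=True is passed for the same table.
import numpy as np
from mlxtend.evaluate import mcnemar

_toy_tb = np.array([[50, 8], [3, 9]])
_chi_manual = ((abs(_toy_tb[0, 1] - _toy_tb[1, 0]) - 1) ** 2) / (_toy_tb[0, 1] + _toy_tb[1, 0])
_chi_mlx, _p = mcnemar(ary=_toy_tb, corrected=True)
assert np.isclose(_chi_manual, _chi_mlx)
print('continuity-corrected chi-squared:', _chi_mlx, 'p-value:', _p)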
def score(data_folder, out_folder, task, score_folder): data_folder = Path(data_folder) out_folder = Path(out_folder) datasets = ["ldc", "viggo", "webnlg", "e2e"] systems = ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota", "human"] stats = {} first = [] second = [] for dataset in datasets: print(f"processing {dataset}") systems_data = {} for system in systems: systems_data[system] = json.load( open(data_folder / dataset / f"{system}.json")) print(f"dataset: {dataset}") all_scored = defaultdict(list) score_folder = Path(score_folder) score_file = score_folder / task / (f"{dataset}.csv") total_texts = 5 try: df = pd.read_csv(score_file) except: print(f"{score_file} not available.") continue scores = df.to_dict(orient="records") try: input_df = pd.read_csv(out_folder / task / (f"mturk_{dataset}.csv")) except: print(f"ignoring {dataset}") continue input_data = input_df.to_dict(orient="records") if task == "fidelity_annotations": for item in scores: for i in range(total_texts): text = item[f"Input.text{i + 1}"] index = item["Input.index"] accurate = f"Answer.text{i + 1}_accurate.text{i + 1}_accurate" key = f"{index}_{text}" try: all_scored[key].append({"accurate": item[accurate]}) except: import ipdb ipdb.set_trace() fidelity_scores = [] all_ser_scores = [] all_sfc_scores = [] true_scores_sfc = [] true_scores_ser = [] sfc_data = defaultdict(list) ser_data = defaultdict(list) for x in all_scored: try: one = all_scored[x][0]["accurate"] two = all_scored[x][1]["accurate"] first.append(one) second.append(two) except: pass for item in input_data: for i in range(total_texts): text_i = item[f"text{i + 1}"] system = item[f"system{i + 1}"] index = item["index"] key = f"{index}_{text_i}" if key in all_scored: obj = systems_data[system][index] score = np.mean( [int(x["accurate"]) for x in all_scored[key]]) # these have to be reconciled if disagreeing: take ceil or floor sample_type = f'{"A_D" if obj["sfc_correct"] else "E_D"}' if dataset != "ldc": sample_type += f',{"A_H" if obj["ser_correct"] else "E_H"}' fidelity_scores.append({ "ind": index, "system": system, "value": math.ceil(score), "sample_type": sample_type, "text": text_i, "data": item["data"], "original_text": obj["original_" + dataset_fields[dataset]["text"].strip()], "sfc_correct": obj["sfc_correct"], "ser_correct": obj["ser_correct"] if "ser_correct" in obj else "", }) # Reconciled cases are those where the expert annotators disagreed. 
They discussed these and # reached the following agreements reconciled = { "Example 1": 0, "Example 2": 1, } if text_i in reconciled: true_scores_sfc.append(reconciled[text_i]) true_scores_ser.append(reconciled[text_i]) else: add_closest_score(score, true_scores_sfc, obj["sfc_correct"]) if dataset != "ldc": add_closest_score(score, true_scores_ser, obj["ser_correct"]) all_sfc_scores.append(obj["sfc_correct"]) sfc_data[system].append(obj["sfc_correct"]) if dataset != "ldc": all_ser_scores.append(obj["ser_correct"]) ser_data[system].append(obj["ser_correct"]) if dataset != "ldc": c_report = classification_report(true_scores_ser, all_ser_scores) stats[f"{dataset}_ser_report"] = classification_report( true_scores_ser, all_ser_scores, output_dict=True) print("SER") print(c_report) c_report = classification_report(true_scores_sfc, all_sfc_scores) stats[f"{dataset}_sfc_report"] = classification_report( true_scores_sfc, all_sfc_scores, output_dict=True) print("SFC") print(c_report) mturk_df = pd.DataFrame(fidelity_scores) agg_stats = mturk_df.groupby(["system"]).agg(["mean", "count"]) print(agg_stats) stats[f"{dataset}_score"] = agg_stats.to_dict()[("value", "mean")] stats[f"{dataset}_count"] = agg_stats.to_dict()[("value", "count")] print( mturk_df.groupby(["system", "sample_type"]).agg(["mean", "count"])) if dataset != "ldc": tb_b = mcnemar_table( y_target=np.array(true_scores_sfc), y_model1=np.array(all_sfc_scores), y_model2=np.array(all_ser_scores), ) print(tb_b) chi2, p = mcnemar(ary=tb_b, corrected=True) print(f"mcnemar chi2: {chi2}, p-value {p}") for measure in ["sfc_correct", "ser_correct"]: if measure == "ser_correct" and dataset == "ldc": continue stats[f"{dataset}_significance_{measure}"] = compute_stat_sig( systems_data, system, measure) elif task == "fluency": for item in scores: for i in range(total_texts): field = f"Input.text{i + 1}" answer_field = f"Answer.fluency{i + 1}" all_scored[item[field]].append(item[answer_field]) for x in all_scored: all_scored[x] = { "average": np.mean(all_scored[x]), "count": len(all_scored[x]) } fluency_scores = defaultdict(list) for item in input_data: for i in range(total_texts): if item[f"text{i + 1}"] in all_scored: score = all_scored[item[f"text{i + 1}"]]["average"] system = item[f"system{i + 1}"] fluency_scores[system].append(score) fluency_df_values = [] for system in fluency_scores: fluency_df_values.extend([{ "system": system, "value": fluency_scores[system][i] } for i in range(len(fluency_scores[system]))]) mturk_df = pd.DataFrame(fluency_df_values) agg_stats = mturk_df.groupby(["system" ]).agg(["mean", "count", "median"]) print(agg_stats) stats[dataset] = agg_stats.to_dict()[("value", "mean")] test_stats = sp.posthoc_wilcoxon(mturk_df, val_col="value", group_col="system", sort=True, zero_method="zsplit") print(test_stats) significance = defaultdict(list) for system in [ "systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota" ]: for other_system in ["sota", "human"]: p_value = test_stats.loc[system, other_system] if p_value <= 0.05 and p_value >= 0: significance[system].append(other_system[0]) significance[system] = ",".join(significance[system]) stats[f"{dataset}_significance"] = significance print(cohen_kappa_score(first, second)) json.dump(stats, open(data_folder / f"{task}.json", "w"), indent=2)
def main(): #open needed files test_data = pd.read_csv('data/test_data.csv', encoding='ISO-8859-1') train_data = pd.read_csv('data/train_data.csv', encoding='ISO-8859-1') train_bigram = pd.read_pickle('saved_pickles_models/bigram.pkl') train_id2word = pd.read_pickle('saved_pickles_models/id2word.pkl') train_corpus = pd.read_pickle('saved_pickles_models/corpus.pkl') model = pd.read_pickle('saved_pickles_models/lda_model2.model') scaler = StandardScaler() test_data_list = [] feature_vectors = [] test_vectors = [] #get distributions from every tweet in train_data print('Getting distribution...') for i in range(len(train_data)): train_top_topics = model.get_document_topics(train_corpus[i], minimum_probability=0.0) train_topic_vector = [train_top_topics[i][1] for i in range(10)] feature_vectors.append(train_topic_vector) x = np.array(feature_vectors) y = np.array(train_data.relevant) kf = KFold(5, shuffle=True, random_state=42) log_res_train_f1, log_res_sgd_train_f1, mod_huber_train_f1 = [], [], [] print('Starting classification algorithm calculations on training data...') for train_ind, val_ind in kf.split(x, y): x_train, y_train = x[train_ind], y[train_ind] x_val, y_val = x[val_ind], y[val_ind] x_train_scale = scaler.fit_transform(x_train) x_val_scale = scaler.transform(x_val) #logistic regression log_reg_train = LogisticRegression(class_weight='balanced', solver='newton-cg', fit_intercept=True).fit( x_train_scale, y_train) log_reg_train_y_pred = log_reg_train.predict(x_val_scale) log_res_train_f1.append( f1_score(y_val, log_reg_train_y_pred, average='binary')) #loss=log sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, loss='log', class_weight='balanced').fit( x_train_scale, y_train) sgd_y_pred = sgd.predict(x_val_scale) log_res_sgd_train_f1.append( f1_score(y_val, sgd_y_pred, average='binary')) #modified huber sgd_huber = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, alpha=20, loss='modified_huber', class_weight='balanced').fit( x_train_scale, y_train) sgd_huber_y_pred = sgd_huber.predict(x_val_scale) mod_huber_train_f1.append( f1_score(y_val, sgd_huber_y_pred, average='binary')) print('Done with training data. 
Starting on testing data...\n') #gather all test tweets and apply the clean_data() and get_bigram() functions print('Cleaning testing data...') for row in test_data['tweets']: cleaned_status = clean_status(row) test_data_list.append(cleaned_status) bigrams = get_bigram(test_data_list) test_bigram = [bigrams[entry] for entry in test_data_list] test_corpus = [train_id2word.doc2bow(tweets) for tweets in test_bigram] #test model on testing data print('Starting classification algorithm calculations on testing data...') for i in range(len((test_data))): top_topics = model.get_document_topics(test_corpus[i], minimum_probability=0.0) topic_vector = [top_topics[i][1] for i in range(10)] test_vectors.append(topic_vector) x_test = np.array(test_vectors) y_test = np.array(test_data.relevant) x_fit = scaler.fit_transform(x_test) #logistic regression log_reg_test = LogisticRegression(class_weight='balanced', solver='newton-cg', fit_intercept=True).fit(x_fit, y_test) y_pred_log_res_test = log_reg_test.predict(x_test) #modified huber sgd_huber_test = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, alpha=20, loss='modified_huber', class_weight='balanced', shuffle=True).fit( x_fit, y_test) y_pred_huber_test = sgd_huber_test.predict(x_fit) #print results for both cases print('Calculating Summary...') y_target = y_test y_model1 = y_pred_log_res_test y_model2 = y_pred_huber_test m_table = mcnemar_table(y_target=y_test, y_model1=y_model1, y_model2=y_model2) chi2, p = mcnemar(ary=m_table, corrected=True) print('\n') print('Results from using training data distribution:') print( f'Logistic Regression Val f1: {np.mean(log_res_train_f1):.3f} +- {np.std(log_res_train_f1):.3f}' ) print( f'Logisitic Regression SGD Val f1: {np.mean(log_res_sgd_train_f1):.3f} +- {np.std(log_res_sgd_train_f1):.3f}' ) print( f'SVM Huber Val f1: {np.mean(mod_huber_train_f1):.3f} +- {np.std(mod_huber_train_f1):.3f}' ) print('\n') print('Results from using unseen test data:') print('Logistic regression Val f1: ' + str(f1_score(y_test, y_pred_log_res_test, average='binary'))) print('Logistic regression SGD f1: ' + str(f1_score(y_test, y_pred_huber_test, average='binary'))) print('\n') print('Summary: ') print('ncmamor table: ', m_table) print('chi-squared: ', chi2) print('p-value: ', p) #Save feature vector and huber classifier for later use print('\n') print('Saving feature vector...') save_vector = open('saved_pickles_models/feature_vector.pkl', 'wb') pickle.dump(feature_vectors, save_vector) save_vector.close() print('\n') print('Saving the huber classifier...') save_huber = open('saved_pickles_models/huber_classifier.pkl', 'wb') pickle.dump(sgd_huber, save_huber) save_huber.close() print('done')
def main(
    mlflow_server: str,
    significance: float,
):
    # We start by setting the tracking uri to make sure the mlflow server is reachable
    mlflow.set_tracking_uri(mlflow_server)

    # We need to instantiate the MlflowClient class for certain operations
    mlflow_client = MlflowClient()

    # We create and set an experiment to group all runs
    mlflow.set_experiment("Model Comparison")

    # We create classification data and split it into training and testing sets
    X, y = make_classification(
        n_samples=10000,
        n_classes=2,
        n_features=20,
        n_informative=9,
        random_state=random_seed,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        test_size=0.2)

    # We first train a Logistic Regression model, log it in mlflow and then move it to the Production stage
    with mlflow.start_run():
        lr_model = LogisticRegression()
        lr_model.fit(X_train, y_train)
        y_pred = lr_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(lr_model,
                                 artifact_path="model",
                                 registered_model_name="Logistic Regression")
        mlflow_client.transition_model_version_stage(name="Logistic Regression",
                                                     version=1,
                                                     stage="Production")

    # We then train a Random Forest model, log it in mlflow and then move it to the Staging stage
    with mlflow.start_run():
        rf_model = RandomForestClassifier()
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(rf_model,
                                 artifact_path="model",
                                 registered_model_name="Random Forest")
        mlflow_client.transition_model_version_stage(name="Random Forest",
                                                     version=1,
                                                     stage="Staging")

    del lr_model
    del rf_model

    # We finally load both models from MLFlow
    # and compare them using the McNemar test
    # We get the download uris of both models and then we load them
    lr_model_download_uri = mlflow_client.get_model_version_download_uri(
        name="Logistic Regression",
        version=1,
    )
    rf_model_download_uri = mlflow_client.get_model_version_download_uri(
        name="Random Forest",
        version=1,
    )
    lr_model = mlflow.sklearn.load_model(lr_model_download_uri)
    rf_model = mlflow.sklearn.load_model(rf_model_download_uri)

    y_pred_lr = lr_model.predict(X_test)
    y_pred_rf = rf_model.predict(X_test)

    contingency_table = mcnemar_table(y_test, y_pred_lr, y_pred_rf)
    _, p_value = mcnemar(contingency_table, corrected=True)

    if p_value < significance:
        # In this case we reject the null hypothesis that the two models are similar.
        # We then archive the logistic regression model
        # and move the random forest model to the Production stage
        print(f"p-value {p_value} smaller than significance level {significance}")
        accuracy_lr = accuracy_score(y_test, y_pred_lr)
        accuracy_rf = accuracy_score(y_test, y_pred_rf)
        if accuracy_lr < accuracy_rf:
            print(f"Random Forest model's accuracy, {accuracy_rf}, is greater than "
                  f"the Logistic Regression model's accuracy, {accuracy_lr}")
            print("Archiving logistic regression model and moving random forest model to production")
            mlflow_client.transition_model_version_stage(
                name="Logistic Regression",
                version=1,
                stage="Archived",
            )
            mlflow_client.transition_model_version_stage(
                name="Random Forest",
                version=1,
                stage="Production",
            )
        else:
            print(f"Random Forest model's accuracy, {accuracy_rf}, is less than or equal to "
                  f"the Logistic Regression model's accuracy, {accuracy_lr}")
            print("Keeping logistic regression model in production")
    else:
        print(f"p-value {p_value} greater than significance level {significance}")
        print("Keeping logistic regression model in production")
        i = 1
        counter_1 += 1
        condel_binary.append(i)
        # print(1)
    elif i < 0.522:
        i = 0
        counter_0 += 1
        condel_binary.append(i)
        # print(0)

print('matthews_corr_coef (condel): ',
      matthews_corrcoef(true_class_binary, condel_binary))

#################################################################
'''SIFT'''
sift_and_model = mcnemar_table(y_target=np.array(true_class_binary),
                               y_model1=np.array(model_binary),
                               y_model2=np.array(sift_binary))
print('model & sift: ', '\n', sift_and_model)
chi2, p = mcnemar(ary=sift_and_model, corrected=True)
print(' chi_squared: ', chi2)
print(' p-value: ', p)
brd = checkerboard_plot(sift_and_model,
                        figsize=(2, 2),
                        fmt='%d',
                        col_labels=['model 2 wrong', 'model 2 right'],
                        row_labels=['model 1 wrong', 'model 1 right'])
plt.show()

'''PPH2'''
pph2_and_model = mcnemar_table(y_target=np.array(true_class_binary),
l_pval.append("%0.5f" % result[1])

result = ttest_rel(df_result['bow_uni_R'].to_numpy(),
                   df_result['tfidf_uni_R'].to_numpy())
print("Paired Test R: %0.5f, %0.5f" % result)
l_stat.append("%0.5f" % result[0])
l_pval.append("%0.5f" % result[1])

result = ttest_rel(df_result['bow_uni_F1'].to_numpy(),
                   df_result['tfidf_uni_F1'].to_numpy())
print("Paired Test F1: %0.5f, %0.5f" % result)
l_stat.append("%0.5f" % result[0])
l_pval.append("%0.5f" % result[1])

# McNemar Test
y_bow_uni = sr_bow_uni.to_numpy()
y_tfidf_uni = sr_tfidf_uni.to_numpy()
tb = mcnemar_table(y_target=y, y_model1=y_bow_uni, y_model2=y_tfidf_uni)
chi2, p = mcnemar(ary=tb, corrected=True)
print(tb)
print("Mcnemar: chi2: %0.5f %0.5f" % (chi2, p))

# print("T_STAT: ", ' '.join(l_stat))
# print("P_VAL: ", ' '.join(l_pval))
# df_out = pd.DataFrame()
# df_out['Y_TRUE'] = df['label']
# df_out['Y_BOW_UNI'] = sr_bow_uni
# df_out['Y_TFIDF_UNI'] = sr_tfidf_uni
# df_out.to_excel("RESULT.xlsx")
    if len(g[1].strip()) == 1:
        y_true.append(0)
    else:
        y_true.append(1)

# Print evaluation report, confusion matrix and f2 scores
print("Evaluation full featured model:")
print(classification_report(y_true, y_pred2, labels=[1, 0]))
print("Confusion matrix:")
print(confusion_matrix(y_true, y_pred2, labels=[1, 0]))
print("MCC", matthews_corrcoef(y_true, y_pred2))
print("F2 - None", fbeta_score(y_true, y_pred2, average=None, beta=2))
print("F2 - weighted", fbeta_score(y_true, y_pred2, average='weighted', beta=2))
print("F2 - micro", fbeta_score(y_true, y_pred2, average='micro', beta=2))
print("F2 - macro", fbeta_score(y_true, y_pred2, average='macro', beta=2))

# McNemar test
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_pred2 = np.array(y_pred2)
tb = mcnemar_table(y_target=y_true, y_model1=y_pred, y_model2=y_pred2)
print("McNemar contingency table")
print(tb)
chi2, p = mcnemar(ary=tb, corrected=True)
print('chi-squared:', chi2)
print('p-value:', p)