def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)
    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
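# For reference, LRAP can be computed directly from its definition: for each
# relevant label, take the fraction of relevant labels among all labels ranked
# at or above it, then average per sample. A minimal NumPy sketch (an
# illustrative stand-in for the `_my_lrap` test helper used above, which is
# defined elsewhere in scikit-learn's test suite):
import numpy as np

def lrap_reference(y_true, y_score):
    sample_scores = []
    for yt, ys in zip(np.asarray(y_true, dtype=bool),
                      np.asarray(y_score, dtype=float)):
        if not yt.any():
            sample_scores.append(1.0)  # empty label sets score 1 by convention
            continue
        # precision at each relevant label's rank; the >= comparison handles
        # ties the same way as rankdata(..., 'max')
        precs = [yt[ys >= ys[j]].sum() / (ys >= ys[j]).sum()
                 for j in np.flatnonzero(yt)]
        sample_scores.append(np.mean(precs))
    return float(np.mean(sample_scores))

# agrees with scikit-learn on the documentation example: 5/12 = 0.4166...
print(lrap_reference([[1, 0, 0], [0, 0, 1]],
                     [[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]]))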
def label_ranking_average_precision_score(self, predictor, batch_size=50):
    from sklearn.metrics import label_ranking_average_precision_score

    # Compute the predictions
    p = []
    for xq_batch, xa_batch, _ in super(QaPairsTest, self).sampling(batch_size):
        delta = predictor(xq_batch, xa_batch)
        p += delta[0].tolist()
    p = np.array(p)

    # Filter out the samples that cannot be evaluated:
    # 1. without any positive example, no score can be computed
    # 2. without any negative example, the score is meaningless
    map_record = []
    skip1 = 0
    skip2 = 0
    for question, entry in self.questions.items():
        idx = np.array(entry['idx'])
        if self.y_np[idx].max() == 0:
            skip1 += 1
            continue
        if self.y_np[idx].min() != 0:
            skip2 += 1
            # continue
        score = p[idx].reshape(idx.shape).tolist()
        map = label_ranking_average_precision_score(
            np.array([entry['label']]), np.array([score]))
        map_record.append(map)
    logging.info('Skip1 %d Skip2 %d' % (skip1, skip2))
    return np.array(map_record).mean()
def forward(self, bottom, top):
    """Compute the label ranking average precision."""
    y_score = bottom[0].data
    y_true = bottom[1].data
    label_rank_avg_prec = metrics.label_ranking_average_precision_score(
        y_true, y_score)
    top[0].data[...] = label_rank_avg_prec
def label_ranking_average_precision_score2(self, model, batch_size=50):
    def label_ranking_average_precision_score(label, score):
        assert len(label) == len(score)
        data = zip(label, score)
        data = sorted(data, key=lambda x: x[1], reverse=True)
        count = 0.0
        values = []
        for i in range(len(data)):
            if data[i][0]:
                count += 1
                values.append(count / (i + 1))
        assert len(values)
        return sum(values) / count, values[0]

    p = model.predict(
        {'q_input': self.xq_np, 'a_input': self.xa_np},
        batch_size=batch_size
    )
    map_record = []
    for question, entry in self.questions.items():
        idx = np.array(entry['idx'])
        if self.y_np[idx].max() == 0:
            continue
        score = p[idx].reshape(idx.shape).tolist()
        map, _ = label_ranking_average_precision_score(entry['label'], score)
        map_record.append(map)
        self.saveResult(question, map, score)
    map = np.array(map_record).mean()
    self.saveResult('__TOTAL_MAP__', map)
    return map
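# For intuition: the nested helper above is just average precision over a
# single ranked candidate list, with precision-at-1 as a by-product. On one
# row with no tied scores it matches scikit-learn's LRAP:
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

label = [0, 1, 0, 1]          # relevance of four candidate answers
score = [0.1, 0.9, 0.3, 0.5]  # model scores, no ties
# sorted by score the labels read [1, 1, 0, 0], so AP = (1/1 + 2/2) / 2 = 1.0
print(label_ranking_average_precision_score(np.array([label]),
                                            np.array([score])))  # 1.0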
def print_report(name_classificator, testing_problems, testing_tags,
                 predicted_problems, predicted_tags):
    predicted_problems, predicted_tags = make_right_order(
        testing_problems, predicted_problems, predicted_tags)
    mlb = MultiLabelBinarizer().fit(testing_tags + predicted_tags)
    testing_tags = mlb.transform(testing_tags)
    predicted_tags = mlb.transform(predicted_tags)
    print(name_classificator)
    print(classification_report(testing_tags, predicted_tags,
                                target_names=mlb.classes_))
    print('label ranking average precision score =',
          label_ranking_average_precision_score(testing_tags, predicted_tags))
    print('\n', ('#' * 100), '\n')
def evaluate_network(network, X_test, y_test, classes_names, length=1000,
                     batch_size=64):
    resp = network.predict_proba(X_test[:length], batch_size=batch_size,
                                 verbose=False)
    resc = network.predict_classes(X_test[:length], batch_size=batch_size,
                                   verbose=False)
    a1 = []  # predicted classes
    a2 = []  # true classes
    cpt = 0
    cpt_on = []
    cpt_real = []
    cpt_should = 0
    should = []
    cpt_shouldnt = 0
    shouldnt = []
    for idx, i in enumerate(resc):
        a1.append(i)
        a2.append(np.array(y_test[idx]).argmax())
        if i.tolist() == [0, 0, 0, 0]:
            cpt += 1
            cpt_on.append(resp[idx].argmax())
            cpt_real.append(np.array(y_test[idx]).argmax())
            if cpt_on[-1] == cpt_real[-1]:
                cpt_should += 1
                should.append(resp[idx].argmax())
            else:
                cpt_shouldnt += 1
                shouldnt.append(resp[idx].argmax())
                # print(resp[idx])
    print("No decision: %d / %d [%.02f%%]"
          % (cpt, len(resc), (cpt / float(len(resc))) * 100), end="")
    print(cpt_should, cpt_shouldnt)
    # This value is LRAP, not classification accuracy, so label it as such
    print("LRAP: %.06f"
          % metrics.label_ranking_average_precision_score(y_test[:length], resp))
    cpt_on = np.array(cpt_on)
    # scikit-learn expects (y_true, y_pred); a2 holds the true classes
    print(metrics.classification_report(a2, a1, target_names=classes_names))
    print("Confusion matrix:")
    cm = confusion_matrix(a2, a1)
    print(cm)
    sns.set_style("ticks")
    sns.mpl.rc("figure", figsize=(8, 4))
    np.set_printoptions(precision=2)
    cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
    fig = plt.figure()
    plt.imshow(cm_normalized, interpolation="nearest", cmap=plt.cm.Blues)
    plt.title("Normalized confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(len(classes_names))
    plt.xticks(tick_marks, classes_names, rotation=45)
    plt.yticks(tick_marks, classes_names)
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tick_params(which="both", direction="in", length=0)
    plt.show()
def ranking_precision(self):
    """
    Label ranking average precision (LRAP) is the average over each
    ground truth label assigned to each sample, of the ratio of true
    vs. total labels with lower score. This metric will yield better
    scores if you are able to give better rank to the labels associated
    with each sample. The obtained score is always strictly greater
    than 0, and the best value is 1.
    """
    # Cache the score under a different name: assigning to
    # `self.ranking_precision` would shadow this method on the instance
    # and make a second call fail with "'float' object is not callable".
    self.ranking_precision_score = metrics.label_ranking_average_precision_score(
        self.ground_truth, self.predictions_raw)
    return self.ranking_precision_score
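# A quick sanity check of the "(0, 1]" range mentioned in the docstring: a
# perfect ranking scores 1.0, and even a fully inverted ranking stays above 0:
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0, 0]])
print(label_ranking_average_precision_score(
    y_true, np.array([[0.9, 0.3, 0.2, 0.1]])))  # 1.0
print(label_ranking_average_precision_score(
    y_true, np.array([[0.1, 0.2, 0.3, 0.9]])))  # 0.25 (the true label is
                                                # ranked 4th of 4)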
def multi_label_evaluate(y, y_prob, threshold):
    statistics = Statistics()
    y_pred = (y_prob >= threshold).astype(int)
    y_pred_50 = (y_prob >= 0.5).astype(int)

    # Note: the ranking metrics are computed here on thresholded 0/1
    # predictions, which collapses the ranking into ties; passing y_prob
    # directly would preserve the full ordering.
    ranking_loss = label_ranking_loss(y, y_pred)
    lraps = label_ranking_average_precision_score(y, y_pred)
    ranking_loss_50 = label_ranking_loss(y, y_pred_50)
    lraps_50 = label_ranking_average_precision_score(y, y_pred_50)
    f1_macro = f1_score(y, y_pred, average='macro')
    f1_macro_50 = f1_score(y, y_pred_50, average='macro')

    statistics.update_statistics("Multi-Label", "Ranking Loss", ranking_loss)
    statistics.update_statistics("Multi-Label", "Ranking Precision", lraps)
    statistics.update_statistics("Multi-Label", "Ranking Loss (t=0.5)",
                                 ranking_loss_50)
    statistics.update_statistics("Multi-Label", "Ranking Precision (t=0.5)",
                                 lraps_50)
    statistics.update_statistics("Multi-Label", "Macro F1", f1_macro)
    statistics.update_statistics("Multi-Label", "Macro F1 (t=0.5)", f1_macro_50)

    try:
        auc_macro = roc_auc_score(y, y_pred, average='macro')
        auc_macro_50 = roc_auc_score(y, y_pred_50, average='macro')
        auc_pr_macro = roc_auc_score(y, y_prob, average='macro')
        statistics.update_statistics("Multi-Label", "Macro AUC", auc_macro)
        statistics.update_statistics("Multi-Label", "Macro AUC (t=0.5)",
                                     auc_macro_50)
        statistics.update_statistics("Multi-Label", "Macro AUC (Pr)",
                                     auc_pr_macro)
    except ValueError:
        statistics.update_statistics("Multi-Label", "Macro AUC", np.NaN)
        statistics.update_statistics("Multi-Label", "Macro AUC (t=0.5)", np.NaN)
        statistics.update_statistics("Multi-Label", "Macro AUC (Pr)", np.NaN)

    return statistics
def _generate_classification_reports(y_true, y_pred, target_names=None):
    # Calculate additional stats
    total_accuracy = accuracy_score(y_true, y_pred)
    cov_error = coverage_error(y_true, y_pred)
    lrap = label_ranking_average_precision_score(y_true, y_pred)

    report = metrics.multilabel_prediction_report(y_true, y_pred)
    report += '\n\n'
    report += metrics.multilabel_classification_report(
        y_true, y_pred, target_names=target_names)
    report += '\n\n'
    report += 'coverage error: %.3f' % cov_error
    report += '\n'
    report += 'LRAP: %.3f' % lrap
    report += '\n'
    report += 'total accuracy: %.3f' % total_accuracy
    return report
def _local_evaluate(n_plain_t, n_plain_p):
    c = 0
    for idx, i in enumerate(n_plain_p):
        isit = False
        for idx2, x in enumerate(i):
            if x == 1 and x == n_plain_t[idx][idx2]:
                isit = True
        if isit:
            c += 1
    acc = float(c) / len(n_plain_p)
    rps = metrics.label_ranking_average_precision_score(n_plain_t, n_plain_p)
    print(
        "\x1b[33mAccuracy: %.02f%%\x1b[0m [%d/%d], \x1b[33mRPS: %.03f\x1b[0m"
        % (acc * 100, c, len(n_plain_p), rps),
        end="",
    )
    return acc, rps
def test_lrap_sample_weighting_zero_labels():
    # Degenerate sample labeling (e.g., zero labels for a sample) is a valid
    # special case for lrap (the sample is considered to achieve perfect
    # precision), but this case is not tested in test_common.
    # For these test samples, the APs are 0.5, 0.75, and 1.0 (default for
    # zero labels).
    y_true = np.array([[1, 0, 0, 0], [1, 0, 0, 1], [0, 0, 0, 0]],
                      dtype=bool)  # np.bool is deprecated; use the builtin
    y_score = np.array([[0.3, 0.4, 0.2, 0.1],
                        [0.1, 0.2, 0.3, 0.4],
                        [0.4, 0.3, 0.2, 0.1]])
    samplewise_lraps = np.array([0.5, 0.75, 1.0])
    sample_weight = np.array([1.0, 1.0, 0.0])

    assert_almost_equal(
        label_ranking_average_precision_score(y_true, y_score,
                                              sample_weight=sample_weight),
        np.sum(sample_weight * samplewise_lraps) / np.sum(sample_weight))
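# Where the hand-computed per-sample values come from: sample 1's relevant
# label (score 0.3) is out-ranked only by 0.4, giving 1/2; sample 2 averages
# 2/4 (label 0: both relevant labels sit at or above score 0.1) and 1/1
# (label 3), giving 0.75; sample 3 has no labels, hence 1.0 by convention,
# which the zero weight then removes. The weighted result is
# (1 * 0.5 + 1 * 0.75 + 0 * 1.0) / 2 = 0.625.
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

for row_true, row_score in [([1, 0, 0, 0], [0.3, 0.4, 0.2, 0.1]),
                            ([1, 0, 0, 1], [0.1, 0.2, 0.3, 0.4])]:
    print(label_ranking_average_precision_score(np.array([row_true]),
                                                np.array([row_score])))
# prints 0.5, then 0.75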
def evaluate(predictions, labels, threshold=0.4, multi_label=True):
    '''
    True Positive  : Label : 1, Prediction : 1
    False Positive : Label : 0, Prediction : 1
    False Negative : Label : 1, Prediction : 0
    True Negative  : Label : 0, Prediction : 0
    Precision : TP/(TP + FP)
    Recall : TP/(TP + FN)
    F Score : 2.P.R/(P + R)
    Ranking Loss : The average number of label pairs that are incorrectly
        ordered given the predictions
    Hamming Loss : The fraction of labels that are incorrectly predicted
        (Hamming distance between predictions and labels)
    '''
    assert predictions.shape == labels.shape, \
        "Shapes: %s, %s" % (predictions.shape, labels.shape,)
    metrics = dict()
    if not multi_label:
        metrics['bae'] = BAE(labels, predictions)
        labels, predictions = np.argmax(labels, axis=1), \
            np.argmax(predictions, axis=1)
        metrics['accuracy'] = accuracy_score(labels, predictions)
        metrics['micro_precision'], metrics['micro_recall'], \
            metrics['micro_f1'], _ = \
            precision_recall_fscore_support(labels, predictions,
                                            average='micro')
        metrics['macro_precision'], metrics['macro_recall'], \
            metrics['macro_f1'], metrics['coverage'], \
            metrics['average_precision'], metrics['ranking_loss'], \
            metrics['pak'], metrics['hamming_loss'] = 0, 0, 0, 0, 0, 0, 0, 0
    else:
        # the ranking metrics are computed on the raw scores, before
        # thresholding, which is what they expect
        metrics['coverage'] = coverage_error(labels, predictions)
        metrics['average_precision'] = \
            label_ranking_average_precision_score(labels, predictions)
        metrics['ranking_loss'] = label_ranking_loss(labels, predictions)
        for i in range(predictions.shape[0]):
            predictions[i, :][predictions[i, :] >= threshold] = 1
            predictions[i, :][predictions[i, :] < threshold] = 0
        metrics['bae'] = 0
        metrics['patk'] = patk(predictions, labels)
        metrics['micro_precision'], metrics['micro_recall'], \
            metrics['micro_f1'], metrics['macro_precision'], \
            metrics['macro_recall'], metrics['macro_f1'] = \
            bipartition_scores(labels, predictions)
    return metrics
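# The per-row thresholding loop above can be collapsed into one vectorized
# expression; a small equivalent sketch:
import numpy as np

predictions = np.array([[0.2, 0.7], [0.5, 0.1]])
threshold = 0.4
predictions = (predictions >= threshold).astype(predictions.dtype)
print(predictions)  # [[0. 1.] [1. 0.]]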
if not check_outputs(outputs, False):
    warnings.warn(
        "Warning, the outputs appear to have wrong values!!!!!")
allPreds = np.append(allPreds, outputs, axis=0)
allLabels = np.append(allLabels, labels, axis=0)
total += labels.shape[0]
numBatches += 1

try:
    score_accuracy = accuracy_score(
        allLabels,
        indices_to_one_hot(allPreds.argmax(axis=1), len(train_mnist.classes)))
    score_lwlrap = label_ranking_average_precision_score(allLabels, allPreds)
    score_mse = math.sqrt(mean_squared_error(allLabels, allPreds))
    score_pr_auc = average_precision_score(allLabels, allPreds)
    score_roc_auc = roc_auc_score(allLabels, allPreds)
except ValueError as e:
    print("Something went wrong with the evaluation")
    print(e)

print("Accuracy = %f" % (score_accuracy))
print("Label ranking average precision for train = %f" % (score_lwlrap))
print("AUC_ROC score for train = %f" % (score_roc_auc))
print("AUC_PR score for train = %f" % (score_pr_auc))
print("MSE score for train = %f" % (score_mse))
def train_eval_dummy(dummy_strategy, train_df, eval_df, output_dirp):
    """
    Train a dummy model and evaluate it on the test set.
    :param train_df:
    :param eval_df:
    :param output_dirp:
    :return:
    """
    print(train_df.head())

    # Write train
    Path(output_dirp).mkdir(parents=True, exist_ok=True)
    train_fp = Path(output_dirp) / "trainset.tsv"
    train_df.to_csv(train_fp, sep="\t", index=False)

    # reload train for testing
    train_df = pd.read_csv(train_fp, sep="\t",
                           converters={"labels": literal_eval})

    # write and reload eval set for testing
    eval_fp = Path(output_dirp) / "testset.tsv"
    eval_df.to_csv(eval_fp, sep="\t", index=False)
    eval_df = pd.read_csv(eval_fp, sep="\t",
                          converters={"labels": literal_eval})

    # Dataframe to sklearn matrix
    X_train = np.random.rand(train_df["text"].shape[0], )  # random dummy
    # data with the same shape as train
    X_eval = np.random.rand(eval_df["text"].shape[0], )  # random dummy
    # data with the same shape as eval
    y_train = np.array(train_df["labels"].to_list())  # train labels as a
    # multihot np.array
    y_eval = np.array(eval_df["labels"].to_list())  # eval labels as a
    # multihot np.array

    # Define model
    if dummy_strategy == "constant":
        c = Counter(np.apply_along_axis(
            lambda x: str(x.tolist()), 1, y_train).tolist())
        most_freq_nn = next(x[0] for x in c.most_common() if "1" in x[0])
        most_freq_nn = np.array(literal_eval(most_freq_nn))
        model = DummyClassifier(strategy=dummy_strategy,
                                constant=most_freq_nn,
                                random_state=settings.RANDOM_STATE)
    else:
        model = DummyClassifier(strategy=dummy_strategy,
                                random_state=settings.RANDOM_STATE)

    # Train the model
    print(f"Training dummy model with strategy: {dummy_strategy}")
    model.fit(X_train, y_train)

    # Evaluate the model on the eval set
    y_pred = model.predict(X_eval)
    print(y_pred)
    result = {
        "LRAP": label_ranking_average_precision_score(y_eval, y_pred),
        "eval_loss": label_ranking_loss(y_eval, y_pred),
    }

    # Write model result and outputs
    eval_df["y_pred"] = y_pred.tolist()
    predictions_fp = Path(output_dirp) / "testset_with_predictions.tsv"
    eval_df.to_csv(predictions_fp, sep="\t", index=False)
    with open(Path(output_dirp) / "result.json", "wt") as result_out:
        json.dump(result, result_out)

    return result
allPreds = np.empty((0, 80), float)
allLabels = np.empty((0, 80), int)
for batch in trainloader:
    images = batch['spectrogram'].to(device)
    labels = batch['labels'].cpu().numpy()
    outputs = model(images).cpu().numpy()
    allPreds = np.append(allPreds, outputs, axis=0)
    allLabels = np.append(allLabels, labels, axis=0)
    total += labels.shape[0]
    numBatches += 1

try:
    score_lwlrap = label_ranking_average_precision_score(
        binarize(allLabels, threshold=0.5), allPreds)
    score_roc_auc = roc_auc_score(binarize(allLabels, threshold=0.5), allPreds)
    score_pr_auc = average_precision_score(binarize(allLabels, threshold=0.5),
                                           allPreds)
    # was sqrt(average_precision_score(...)), presumably a copy-paste slip
    score_mse = math.sqrt(mean_squared_error(allLabels, allPreds))
except ValueError as e:
    print("Something went wrong with the evaluation")
    print(e)

# the scores are already computed over all accumulated batches, so they
# should not be divided by numBatches
print("Label ranking average precision for train = %f" % score_lwlrap)
print("AUC_ROC score for train = %f" % score_roc_auc)
print("AUC_PR score for train = %f" % score_pr_auc)
print("MSE score for train = %f" % score_mse)
## ------------------------------------------------------------------------
def ranking_average_precision(y, y_pred_probs):
    # NOTE: `X` and `up` are taken from the enclosing scope; as written,
    # `up` is used before it is assigned here, so the call below assumes it
    # was produced by an earlier cmeans run.
    cntr, uc, u0c, dc, jm, pcd, fpc = fuzz.cluster.cmeans(
        X.T, 19, 2, error=0.005, maxiter=1000, init=None)
    u0p, up, dp, jm, pp, fpc = fuzz.cluster.cmeans_predict(
        up, cntr, 4, error=0.0005, maxiter=1000)
    ranking_score = label_ranking_average_precision_score(y, u0p.T)
    return ranking_score
def train_predict_all_output_files(self):
    to_lower_case = None  # Used to manage the lower case parameter in the
    # various configurations
    j = 1  # Iterator
    data_set_file_list = None  # Temporary variable used to build the name
    # of each configuration
    with_lower_case = "_with_to_lower_case."  # Variable used to build the
    # name of each configuration using conversion to lower case
    without_lower_case = "_without_to_lower_case."  # Variable used to build
    # the name of each configuration not using conversion to lower case
    temp = None  # Temporary variable used to build the name of each
    # configuration
    print(os.listdir(self._current_dir))  # Debug
    print(len(os.listdir(self._current_dir)))  # Debug
    for file in os.listdir(self._current_dir):
        # if j == 2:
        #     break
        print(file)  # Debug
        if file.endswith(".json") and \
                file != "cleaned_pre_processing_experiment_results.json":
            # print(os.path.join(self._current_dir, file))  # Debug
            start_time = time.time()  # We get the time expressed in
            # seconds since the epoch
            self._data_set_file = os.path.join(self._current_dir, file)
            np.random.seed(0)  # We set the seed
            self._build_data_set()  # We build the data set
            self._data_set_file = file
            for to_lower_case in [False, True]:
                # We iterate to manage the potential conversion to lower case
                temp = self._data_set_file
                data_set_file_list = self._data_set_file.split(".")
                if to_lower_case:
                    self._data_set_file = data_set_file_list[0] + \
                        with_lower_case + data_set_file_list[1]
                else:
                    self._data_set_file = data_set_file_list[0] + \
                        without_lower_case + data_set_file_list[1]
                print_log("##### File name: {} #####"
                          .format(self._data_set_file))  # Debug
                print_log("--- {} seconds ---"
                          .format(time.time() - start_time))
                i = 1
                for train_indices, val_indices in self._tscv.split(
                        self._train_set):
                    print_log("********* Evaluation on fold {} *********"
                              .format(i))  # Debug
                    print_log("We count the occurrence of each term")  # Debug
                    count_vectorizer = CountVectorizer(
                        lowercase=to_lower_case, token_pattern=u'(?u)\S+')
                    X_counts = count_vectorizer.fit_transform(
                        self._train_set.iloc[train_indices]['text'].values)
                    print_log("Use of the TF-IDF model")  # Debug
                    tfidf_transformer = TfidfTransformer(use_idf=False,
                                                         smooth_idf=False)
                    print_log(X_counts.shape)  # Debug
                    print_log("Computation of the weights of the TF-IDF model")
                    X_train = tfidf_transformer.fit_transform(X_counts)
                    y_train = self._train_set \
                        .iloc[train_indices]['class'].values
                    print_log(X_train.shape)
                    print_log("--- {} seconds ---"
                              .format(time.time() - start_time))
                    print_log("Training of the models")  # Debug
                    self._model.fit(X_train, y_train)
                    print_log("--- {} seconds ---"
                              .format(time.time() - start_time))
                    print_log("We count the occurrence of " +
                              "each term in the val. set")  # Debug
                    X_val_counts = count_vectorizer.transform(
                        self._train_set.iloc[val_indices]['text'].values)
                    print_log("Computation of the weights of " +
                              "the TF-IDF model for the " +
                              "validation set")  # Debug
                    X_val = tfidf_transformer.transform(X_val_counts)
                    y_val = self._train_set \
                        .iloc[val_indices]['class'].values
                    print_log("Making predictions")  # Debug
                    if i == 1:
                        self._configurations_accuracies[
                            self._data_set_file] = []
                        self._configurations_mrr_values[
                            self._data_set_file] = []
                    self._configurations_accuracies[self._data_set_file].append(
                        np.mean(self._model.predict(X_val) == y_val))
                    found_function = False
                    try:
                        if callable(getattr(self._model, "predict_proba")):
                            # print_log(self._model.classes_)
                            # print_log(self._model.predict_proba(X_val))
                            lb = LabelBinarizer()
                            _ = lb.fit_transform(self._model.classes_)
                            # print_log(lb.classes_)
                            # print_log(lb.transform(["X"]))
                            y_val_bin = lb.transform(y_val)
                            # With exactly one relevant label per sample,
                            # LRAP is the mean reciprocal rank (MRR)
                            self._configurations_mrr_values[self._data_set_file].append(
                                label_ranking_average_precision_score(
                                    y_val_bin,
                                    self._model.predict_proba(X_val)))
                            found_function = True
                    except AttributeError:
                        pass
                    try:
                        if not found_function and callable(
                                getattr(self._model, "decision_function")):
                            # print_log(self._model.classes_)
                            # print_log(self._model.decision_function(X_val))
                            lb = LabelBinarizer()
                            _ = lb.fit_transform(self._model.classes_)
                            y_val_bin = lb.transform(y_val)
                            self._configurations_mrr_values[self._data_set_file].append(
                                label_ranking_average_precision_score(
                                    y_val_bin,
                                    self._model.decision_function(X_val)))
                            found_function = True
                    except AttributeError:
                        pass
                    print_log("Mean Reciprocal Rank:")
                    print_log(self._configurations_mrr_values[
                        self._data_set_file][-1])
                    print_log("--- {} seconds ---"
                              .format(time.time() - start_time))
                    i += 1
                self._data_set_file = temp
            print_log("*** File {} done ***".format(j))  # Debug
            j += 1
    self._results_to_save_to_a_file["avg_accuracy"] = {}
    self._results_to_save_to_a_file["avg_mrr"] = {}
    avg_accuracy = None
    avg_mrr = None
    # Below, we print the average accuracies
    for key, value in self._configurations_accuracies.items():
        print_log("Accuracy of {}".format(key))  # Debug
        print_log("Each fold")
        print_log(value)
        print_log("Average")
        avg_accuracy = sum(value) / len(value)
        self._results_to_save_to_a_file["avg_accuracy"][key] = avg_accuracy
        print_log(avg_accuracy)  # Debug
        print_log("MRR of {}".format(key))  # Debug
        print_log("Each fold")
        mrr = self._configurations_mrr_values[key]
        print_log(mrr)
        print_log("Average")
        avg_mrr = sum(mrr) / len(mrr)
        self._results_to_save_to_a_file["avg_mrr"][key] = avg_mrr
        print_log(avg_mrr)
    self._results_to_save_to_a_file["accuracy_per_fold"] = \
        self._configurations_accuracies
    self._results_to_save_to_a_file["mrr_per_fold"] = \
        self._configurations_mrr_values
    print_log("--- {} seconds ---".format(time.time() - start_time))
    self._train_set = None
    self._test_set = None
    self._configurations_accuracies = {}
    self._configurations_mrr_values = {}
print('labels: \n', y_lables)
print('probabilities of each label: \n', y_probas)
print('(Nsamples, Nlables):', np.shape(y_lables))
print('Coverage Error: ', ce)

# If y_lables is changed as below, the first row now has two 1s, one ranked
# 2nd and one ranked 3rd; since coverage takes the worst-ranked true label,
# we need 3
y_lables = np.array([[1, 1, 0], [0, 0, 1]])
ce = sm.coverage_error(y_lables, y_probas)
print('labels: \n', y_lables)
print('Coverage Error with : ', ce)  # (3 + 3) / 2

print('{0:-^70}'.format('LRAP'))
# Compute LRAP: it uses the same intermediate rank counts as coverage error,
# but combines them through their reciprocals (a harmonic-style average)
y_lables = np.array([[1, 0, 0], [0, 0, 1]])
# LRAP = 1/2 * (1/3 + 1/2) = 5/12 = 0.4166....
lrap = sm.label_ranking_average_precision_score(y_lables, y_probas)
print('labels: \n', y_lables)
print('probabilities of each label: \n', y_probas)
print('Label ranking average precision: ', lrap)

print('{0:-^70}'.format('Ranking Loss'))
# For the first sample, k can only be 0 because only y[0, 0] = 1, and l can
# only be 2 because only f[0, 0] < f[0, 2] (0.75 < 1), so the first sample
# contributes 1 wrongly ordered pair
# For the second sample, k can only be 2 because only y[1, 2] = 1, and l can
# be 0 or 1 because both of their probabilities are greater than f[1, 2],
# so this row contributes 2
# Each row has one relevant label (|y_i| = 1) out of Nlabels = 3, so the
# per-row normalizer is |y_i| * (Nlabels - |y_i|) = 2
# The final result is Ranking Loss = (1/2 + 2/2) / 2 = 3/4 = 0.75
rl = sm.label_ranking_loss(y_lables, y_probas)
print('labels: \n', y_lables)
print('probabilities of each label: \n', y_probas)
print('Ranking loss: ', rl)
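# The commented arithmetic above can be verified directly; this assumes the
# y_probas defined earlier in the script is the scikit-learn documentation
# example [[0.75, 0.5, 1], [1, 0.2, 0.1]], which reproduces every quoted
# value:
import numpy as np
import sklearn.metrics as sm

y_lables = np.array([[1, 0, 0], [0, 0, 1]])
y_probas = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
print(sm.coverage_error(y_lables, y_probas))                         # 2.5
print(sm.label_ranking_average_precision_score(y_lables, y_probas))  # 0.4166...
print(sm.label_ranking_loss(y_lables, y_probas))                     # 0.75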
print(t, ':')
print('  correct: ', end='')
for idx in np.where(y_val[t] > 0.5)[0].tolist():
    sys.stdout.write('[' + ntagslist_sorted[idx] + '] ')
print()
print('  predicted: ', end='')
for idx in np.where(predictions[t] > threshold)[0].tolist():
    sys.stdout.write('[' + ntagslist_sorted[idx] + '] ')
print()

# Scikit-learn has some applicable performance [metrics]
# (http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics)
# we can try:
print('Precision: {0:.3f} (threshold: {1:.2f})'.format(
    metrics.precision_score(y_val.flatten(),
                            predictions.flatten() > threshold), threshold))
print('Recall: {0:.3f} (threshold: {1:.2f})'.format(
    metrics.recall_score(y_val.flatten(),
                         predictions.flatten() > threshold), threshold))
print('F1 score: {0:.3f} (threshold: {1:.2f})'.format(
    metrics.f1_score(y_val.flatten(),
                     predictions.flatten() > threshold), threshold))
average_precision = metrics.average_precision_score(y_val.flatten(),
                                                    predictions.flatten())
print('Average precision: {0:.3f}'.format(average_precision))
print('Coverage: {0:.3f}'.format(metrics.coverage_error(y_val, predictions)))
print('LRAP: {0:.3f}'.format(
    metrics.label_ranking_average_precision_score(y_val, predictions)))
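# Note the two "precision" numbers above measure different things:
# average_precision_score on the flattened arrays treats every
# (sample, label) pair as one global ranking, so it is sensitive to score
# calibration across samples, while LRAP ranks labels within each sample.
# A tiny illustration where every per-sample ranking is perfect but the
# global one is not:
import numpy as np
from sklearn import metrics

y = np.array([[1, 0], [0, 1]])
s = np.array([[0.6, 0.4], [0.9, 0.95]])
print(metrics.label_ranking_average_precision_score(y, s))        # 1.0
print(metrics.average_precision_score(y.flatten(), s.flatten()))  # 0.833...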
def multilabel_metrics(pred_list, verbose, extra_vars, split):
    """
    Multilabel classification metrics. See the multilabel ranking metrics
    in the sklearn library for more info:
    http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics

    :param pred_list: dictionary of hypothesis sentences
    :param verbose: if greater than 0 the metric measures are printed out
    :param extra_vars: extra variables
                       extra_vars['word2idx'] - dictionary mapping from words to indices
                       extra_vars['references'] - list of GT labels
    :param split: split on which we are evaluating
    :return: Dictionary of multilabel metrics
    """
    from sklearn import metrics as sklearn_metrics

    word2idx = extra_vars[split]['word2idx']

    # check if an additional dictionary matching raw to basic and general
    # labels is provided; in that case a more general evaluation will be
    # considered
    raw2basic = extra_vars[split].get('raw2basic', None)
    if raw2basic is not None:
        logging.info('Applying general evaluation with raw2basic dictionary.')

    if raw2basic is None:
        n_classes = len(word2idx)
    else:
        basic_values = set(raw2basic.values())
        n_classes = len(basic_values)
    n_samples = len(pred_list)

    # Create prediction matrix
    y_pred = np.zeros((n_samples, n_classes))
    for i_s, sample in list(enumerate(pred_list)):
        for word in sample:
            if raw2basic is None:
                y_pred[i_s, word2idx[word]] = 1
            else:
                word = word.strip()
                y_pred[i_s, raw2basic[word]] = 1

    # Prepare GT
    gt_list = extra_vars[split]['references']

    if raw2basic is None:
        y_gt = np.array(gt_list)
    else:
        idx2word = {v: k for k, v in iteritems(word2idx)}
        y_gt = np.zeros((n_samples, n_classes))
        for i_s, sample in list(enumerate(gt_list)):
            for raw_idx, is_active in list(enumerate(sample)):
                if is_active:
                    word = idx2word[raw_idx].strip()
                    y_gt[i_s, raw2basic[word]] = 1

    # Compute Coverage Error
    coverr = sklearn_metrics.coverage_error(y_gt, y_pred)
    # Compute Label Ranking AvgPrec
    avgprec = sklearn_metrics.label_ranking_average_precision_score(y_gt,
                                                                    y_pred)
    # Compute Label Ranking Loss
    rankloss = sklearn_metrics.label_ranking_loss(y_gt, y_pred)
    # Compute Precision, Recall and F1 score
    precision, recall, f1, _ = sklearn_metrics.precision_recall_fscore_support(
        y_gt, y_pred, average='micro')

    if verbose > 0:
        logging.info('"coverage_error" (best: avg labels per sample = %f): %f'
                     % (float(np.sum(y_gt)) / float(n_samples), coverr))
        logging.info('Label Ranking "average_precision" (best: 1.0): %f'
                     % avgprec)
        logging.info('Label "ranking_loss" (best: 0.0): %f' % rankloss)
        logging.info('precision: %f' % precision)
        logging.info('recall: %f' % recall)
        logging.info('f1: %f' % f1)

    return {'coverage_error': coverr,
            'average_precision': avgprec,
            'ranking_loss': rankloss,
            'precision': precision,
            'recall': recall,
            'f1': f1}
loss = criterion(output, tags)
loss.backward()
optimizer.step()

output_prob = output
if args.loss_type == "cross_entropy":
    output_prob = F.softmax(output, dim=1)
elif args.loss_type == "bce":
    output_prob = torch.sigmoid(output)

predict_vector = np.argmax(to_np(output_prob), axis=1)
tags_np = to_np(tags)
output_prob_np = to_np(output_prob)
ranking_ap_score = label_ranking_average_precision_score(
    tags_np, output_prob_np)
ranking_loss = label_ranking_loss(tags_np, output_prob_np)
label_vector = np.argmax(to_np(tags), axis=1)
bool_vector = predict_vector == label_vector
accuracy = bool_vector.sum() / len(bool_vector)

if batch_idx % args.log_interval == 0:
    print(
        'Train [{}] Batch {} / {}: Batch Loss {:2.4f} / Batch Acc {:2.4f}'
        ' / Rank AP {:2.4f} / Rank Loss {:2.4f}'
        .format(
            datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
            batch_idx,
            len(dataloader),
            loss.item(),
            accuracy,
y_val = np.array(encoded_labels_df_val)

# Define model
linsvm = LinearSVC(loss='hinge')
# multi_class='ovr',
# verbose=True,
# max_iter=1000)
model = OneVsRestClassifier(linsvm, n_jobs=-1)

start = time.process_time()
model.fit(X_train, Y_train)
elapsed_fit = time.process_time() - start
print("Time to fit model (min):", elapsed_fit / 60)

start_predict = time.process_time()
### change
y_pred = model.decision_function(x_val)
elapsed_predict = time.process_time() - start_predict
print("Time to predict (min):", elapsed_predict / 60)

# Evaluate
### change
y_true = y_val
LRAP = label_ranking_average_precision_score(y_true, y_pred)
print("LRAP:", LRAP)
print(y_pred[0:3])
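# Using decision_function margins with LRAP is fine: the metric depends only
# on the ordering of the scores within each row, so any strictly monotonic
# rescaling (e.g. a sigmoid) leaves it unchanged. A quick check:
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y = np.array([[1, 0, 1], [0, 1, 0]])
margins = np.array([[2.3, -0.7, 0.4], [-1.2, 0.8, -0.3]])
probs = 1.0 / (1.0 + np.exp(-margins))  # monotonic map of the margins
assert label_ranking_average_precision_score(y, margins) == \
       label_ranking_average_precision_score(y, probs)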
def stacking(cfg, files):
    print(list(files.keys()))
    ave_oof, ave_pred = average(cfg, files, True)

    tr_oof_files = [
        np.load(f'../output/{name}oof.npy')[:, :, np.newaxis]
        for name in files.keys()
    ] + [ave_oof[:, :, np.newaxis]]
    tr_oof = np.concatenate(tr_oof_files, axis=-1)

    test_files = [
        np.load(f'../output/{name}pred.npy')[:, :, np.newaxis]
        for name in files.keys()
    ] + [ave_pred[:, :, np.newaxis]]
    test_pred = np.concatenate(test_files, axis=-1)

    df = pd.read_csv(f'../input/train_curated.csv')
    y = split_and_label(df['labels'].values)

    mskfold = MultilabelStratifiedKFold(cfg.n_folds, shuffle=False,
                                        random_state=66666)
    folds = list(mskfold.split(y, y))
    predictions = np.zeros_like(test_pred)[:, :, 0]
    oof = np.zeros_like(y)
    for fold, (tr_idx, val_idx) in enumerate(folds):
        print('fold ', fold)
        if True:  # init
            K.clear_session()
            model = stacker(cfg, tr_oof.shape[2])
            best_epoch = 0
            best_score = -1
        for epoch in range(1000):
            if epoch - best_epoch > 15:
                break
            tr_x, tr_y = tr_oof[tr_idx], y[tr_idx]
            val_x, val_y = tr_oof[val_idx], y[val_idx]
            val_pred = model.predict(val_x)
            score = label_ranking_average_precision_score(val_y, val_pred)
            if score > best_score:
                best_score = score
                best_epoch = epoch
                oof[val_idx] = val_pred
                model.save_weights(f"../model/stacker{cfg.name}{fold}.h5")
            model.fit(x=tr_x, y=tr_y, batch_size=cfg.bs, verbose=0)
            print(f'{epoch} score {score} , best {best_score}...')
        model.load_weights(f"../model/stacker{cfg.name}{fold}.h5")
        predictions += model.predict(test_pred)

    print('lrap: ', label_ranking_average_precision_score(y, oof))
    predictions /= cfg.n_folds
    print(label_ranking_average_precision_score(y, oof))
    test = pd.read_csv('../input/sample_submission.csv')
    test.loc[:, test.columns[1:].tolist()] = predictions
    test.to_csv('submission.csv', index=False)
def label_ranking_average_precision(targets, preds, target_threshold=0):
    targets, preds = to_numpy(targets), to_numpy(preds)
    if target_threshold is not None:
        targets = targets > target_threshold
    return metrics.label_ranking_average_precision_score(targets, preds)
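# Hypothetical usage of the wrapper above, assuming `to_numpy()` simply moves
# tensors to the CPU and calls .numpy(); soft targets are binarized at the
# threshold before scoring:
import torch

targets = torch.tensor([[0.9, 0.0], [0.0, 0.7]])
preds = torch.tensor([[0.8, 0.1], [0.2, 0.6]])
# in each row the highest-scored label is the (thresholded) positive one
print(label_ranking_average_precision(targets, preds))  # 1.0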
def eval_running_model(dataloader, test=False):
    loss_fct = CrossEntropyLoss()
    model.eval()
    eval_loss, eval_hit_times = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    r10 = r2 = r1 = r5 = 0
    mrr = []
    if test:
        results_out = []
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        context_token_ids_list_batch, context_input_masks_list_batch, \
            response_token_ids_list_batch, response_input_masks_list_batch, \
            labels_batch = batch

        with torch.no_grad():
            logits = model(context_token_ids_list_batch,
                           context_input_masks_list_batch,
                           response_token_ids_list_batch,
                           response_input_masks_list_batch)
            loss = loss_fct(logits, torch.argmax(labels_batch, 1))

        r2_indices = torch.topk(logits, 2)[1]    # R 2 @ 100
        r5_indices = torch.topk(logits, 5)[1]    # R 5 @ 100
        r10_indices = torch.topk(logits, 10)[1]  # R 10 @ 100
        r1 += (logits.argmax(-1) == 0).sum().item()
        r2 += ((r2_indices == 0).sum(-1)).sum().item()
        r5 += ((r5_indices == 0).sum(-1)).sum().item()
        r10 += ((r10_indices == 0).sum(-1)).sum().item()

        # mrr: with a single relevant candidate (index 0), LRAP equals the
        # reciprocal rank of that candidate
        logits = logits.data.cpu().numpy()
        for logit in logits:
            if test:
                results_out.append(logit.tolist())
            y_true = np.zeros(len(logit))
            y_true[0] = 1
            mrr.append(label_ranking_average_precision_score([y_true],
                                                             [logit]))

        eval_loss += loss.item()
        nb_eval_examples += labels_batch.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = r1 / nb_eval_examples
    if not test:
        result = {
            'train_loss': tr_loss / nb_tr_steps,
            'eval_loss': eval_loss,
            'R1': r1 / nb_eval_examples,
            'R2': r2 / nb_eval_examples,
            'R5': r5 / nb_eval_examples,
            'R10': r10 / nb_eval_examples,
            'MR': np.mean(1 / (np.array(mrr))),  # 1/RR is the rank, so this
                                                 # is the mean rank
            'MRR': np.mean(mrr),
            'epoch': epoch,
            'global_step': global_step,
        }
    else:
        result = {
            'eval_loss': eval_loss,
            'R1': r1 / nb_eval_examples,
            'R2': r2 / nb_eval_examples,
            'R5': r5 / nb_eval_examples,
            'R10': r10 / nb_eval_examples,
            'MR': np.mean(1 / (np.array(mrr))),
            'MRR': np.mean(mrr),
        }
    if test:
        if args.generate:
            export_scores_json(list(results_out))
        if (args.testset == 'devtest') and not args.generate:
            export_results(result)
    return result
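# Why LRAP can serve as MRR here: each y_true row has exactly one relevant
# entry (index 0), and with a single relevant label LRAP reduces to the
# reciprocal rank of that label:
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([1, 0, 0, 0])
logit = np.array([0.2, 0.9, 0.5, 0.1])  # the true candidate is ranked 3rd
print(label_ranking_average_precision_score([y_true], [logit]))  # 1/3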
cuda_device=device)

eval_file = os.path.join(eval_path, frame_type + '.tsv')
eval_df = pd.read_csv(eval_file, sep='\t',
                      converters={'labels': literal_eval})
labels = list(eval_df.columns)[3:-1]
predictions, raw_outputs = model.predict(list(eval_df['text']))

full_y_true = np.array(eval_df[labels].astype(int))
full_y_pred = np.array(predictions)
print(full_y_true.shape)
print(full_y_pred.shape)
cat_f1 = f1_score(full_y_true, full_y_pred, average='macro')
cat_lrap = label_ranking_average_precision_score(full_y_true, full_y_pred)
category_scores.append((frame_type, cat_f1, cat_lrap))

for i, label in enumerate(labels):
    y_true = np.array(eval_df[label].astype(int))
    y_pred = np.array([predictions[j][i] for j in range(len(predictions))])
    score = f1_score(y_true, y_pred, average='macro')
    print(frame_type, label, score)
    all_scores.append((frame_type, label, score))

df_cat = pd.DataFrame(category_scores,
                      columns=['Frame Type', 'Macro F1', 'LRAP'])
df_cat.to_csv(
    f'/shared/2/projects/framing/models/classify/09-24-20_overall_eval.tsv',
    sep='\t')
def ranking_precision(self):
    # Cache the score under a different name: assigning to
    # `self.ranking_precision` would replace this method on the instance.
    self.ranking_precision_score = metrics.label_ranking_average_precision_score(
        self.ground_truth, self.predictions_raw)
    return 'Ranking Precision (0, 1]: ' + str(self.ranking_precision_score)
cntr = 1.0
for iCntr in range(len(qList)):
    qCntr = qList[iCntr]
    if args.removeQuery:
        qLabel = qLabelArr[iCntr]
    else:
        qLabel = labelArr[qCntr]
    y_true = np.zeros((dist.shape[1]))
    indices = np.where(labelArr == qLabel)
    y_true[np.where(labelArr == qLabel)] = 1
    y_true = y_true[ind[iCntr, :]]
    maxVal = np.max(dist[iCntr, :])
    if args.removeQuery:
        currAP = label_ranking_average_precision_score(
            [y_true], [maxVal - dist[iCntr, :]])
    else:
        currAP = label_ranking_average_precision_score(
            [y_true[1:]], [maxVal - dist[iCntr, 1:]])
    running_ap += currAP
    cntr += 1
    print('AP for Query:%d Text:%s Occ:%d is %.4f'
          % (qCntr, vocabIdx[qLabel], vocabCntr[qLabel], currAP))
    with open(logs_dir + 'retrieval_accuracy.txt', 'a+') as fLog:
        fLog.write('AP for Query:%d Text:%s Occ:%d is %.4f \n'
                   % (qCntr, vocabIdx[qLabel], vocabCntr[qLabel], currAP))
if args.printFlag:
    with open(os.path.join(ret_dir, 'CombAcc.txt'), 'a+') as fLog:
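# The `maxVal - dist` trick above turns distances into similarities without
# changing the induced ranking; any order-reversing transform of the
# distances gives the same LRAP:
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

dist = np.array([0.1, 0.7, 0.4])
y_true = np.array([1, 0, 0])  # the nearest item is the relevant one
for sims in (dist.max() - dist, -dist, 1.0 / (1.0 + dist)):
    print(label_ranking_average_precision_score([y_true], [sims]))  # 1.0 each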
def eval_turn(Config, model, data_loader, val_version, epoch_num, log_file):
    model.train(False)

    val_corrects1 = 0
    val_corrects2 = 0
    val_corrects3 = 0
    val_size = data_loader.__len__()
    item_count = data_loader.total_item_len
    t0 = time.time()
    get_l1_loss = nn.L1Loss()
    get_ce_loss = nn.CrossEntropyLoss()
    get_ce_sig_loss = nn.BCELoss()

    val_batch_size = data_loader.batch_size
    val_epoch_step = data_loader.__len__()
    num_cls = data_loader.num_cls

    val_loss_recorder = LossRecord(val_batch_size)
    val_celoss_recorder = LossRecord(val_batch_size)
    print('evaluating %s ...' % val_version, flush=True)
    eval_t = locals()
    sum_fbeta = 0
    y_pred, Y_test = [], []
    sum_fbeta = 0
    best_fbeta = 0
    ave_test_accu_final = 0
    test_file = open("./result_log/val.log", "a+")
    with torch.no_grad():
        for batch_cnt_val, data_val in enumerate(data_loader):
            # inputs = Variable(data_val[0].cuda())
            # labels = Variable(torch.LongTensor(np.array(data_val[1])).long().cuda())
            inputs, labels, labels_swap, swap_law, img_names = data_val
            labels_npy = np.array(labels)
            labels_tensor = Variable(
                torch.FloatTensor(np.array(labels)).cuda())
            labels_ = labels_npy.astype(np.uint8)
            inputs = Variable(inputs.cuda())
            outputs = model(inputs)

            loss = 0
            # ce_loss = get_ce_loss(outputs[0], labels).item()
            ce_loss = get_ce_sig_loss(outputs[0], labels_tensor).item()
            loss += ce_loss

            val_loss_recorder.update(loss)
            val_celoss_recorder.update(ce_loss)

            if Config.use_dcl and Config.cls_2xmul:
                outputs_pred = outputs[0] + outputs[1][:, 0:num_cls] + \
                    outputs[1][:, num_cls:2 * num_cls]
            else:
                outputs_pred = outputs[0]
            # cal_sigmoid = nn.Sigmoid()
            # outputs_pred_s = cal_sigmoid(outputs_pred)
            # MAP is a label ranking metric, so no normalization is needed
            # predict_multensor = torch.ge(outputs_pred, 0.5)  # set entries
            # >= 0.5 to one and the rest to zero, i.e. thresholding
            predict_mul_ = outputs_pred.cpu().numpy()
            temp_fbeta = label_ranking_average_precision_score(
                labels_, predict_mul_)
            predict_multensor = torch.ge(outputs_pred, 0.5)  # set entries
            # >= 0.5 to one and the rest to zero, i.e. thresholding
            predict_mul = predict_multensor.cpu().numpy()
            sum_fbeta = sum_fbeta + temp_fbeta
            ave_num = batch_cnt_val + 1

            y_pred.extend(predict_mul[:])
            Y_test.extend(labels_[:])

        ave_acc = sum_fbeta / ave_num
        y_pred_ = np.array(y_pred)
        Y_test_ = np.array(Y_test)

        log_file.write(val_version + '\t' +
                       str(val_loss_recorder.get_val()) + '\t' +
                       str(val_celoss_recorder.get_val()) + '\t' +
                       str(ave_acc) + '\n')

        t1 = time.time()
        since = t1 - t0
        print('--' * 30, flush=True)
        print('% 3d %s %s %s-loss: %.4f ||%s-ave@acc: %.4f ||time: %d'
              % (epoch_num, val_version, dt(), val_version,
                 val_loss_recorder.get_val(init=True), val_version,
                 ave_acc, since), flush=True)
        print('--' * 30, flush=True)

    eval_t['metrics_' + str(0.5)] = evaluate_test(predictions=y_pred_,
                                                  labels=Y_test_)
    metrics = eval_t['metrics_' + str(0.5)]
    output = ("=> Test : Coverage = {}\n Average Precision = {}\n"
              " Micro Precision = {}\n Micro Recall = {}\n"
              " Micro F Score = {}\n").format(
        metrics['coverage'], ave_acc, metrics['micro_precision'],
        metrics['micro_recall'], metrics['micro_f1'])
    output += ("=> Test : Macro Precision = {}\n Macro Recall = {}\n"
               " Macro F Score = {}\n ranking_loss = {}\n"
               " hamming_loss = {}\n\n").format(
        metrics['macro_precision'], metrics['macro_recall'],
        metrics['macro_f1'], metrics['ranking_loss'],
        metrics['hamming_loss'])
    # output += "\n=> Test : ma-False_positive_rate(FPR) = {}, mi-False_positive_rate(FPR) = {}\n".format(metrics['ma-FPR'], metrics['mi-FPR'])
    print(output)
    test_file.write('epoch:%d\n' % epoch_num)
    test_file.write(output)
    test_file.close()

    return ave_acc
labels = np.array(labels)
X_w = np.array(np.split(couples, len(seq)))
X_t = np.array(np.split(tag_couples, len(seq)))
if test_labels == 0:
    # Divide the number of examples to rank so that the GPU does not run
    # out of memory
    splitter = get_min_divisor(len(labels))
    test_y = np.reshape(np.empty_like(labels, dtype='float32'),
                        (labels.shape[0], 1))
    block = labels.shape[0] // splitter  # integer division for indexing
    for j in range(splitter):
        test_loss, test_y_block = model.test_on_batch(
            [X_w[:, j * block:(j + 1) * block, :],
             X_t[:, j * block:(j + 1) * block, :]],
            labels[j * block:(j + 1) * block])
        test_y[j * block:(j + 1) * block] = test_y_block
else:
    test_loss, test_y = model.test_on_batch([X_w, X_t], labels)
lraps.append(label_ranking_average_precision_score(
    np.reshape(np.array(labels), test_y.shape).T, test_y.T))
mrr, recall, prec = test_utils.print_accuracy_results(
    np.array(labels), np.reshape(test_y, np.array(labels).shape))
mrrs.append(mrr)
recalls.append(recall)
precs.append(prec)
losses.append(test_loss)
test_losses.append(test_loss)
if len(losses) % 100 == 0:
    progbar.update(i, values=[("loss", np.sum(losses))])
    losses = []
samples_seen += len(labels)
print("\nSkipped=" + str(skipped))
print("\nlrap=" + "{0:.5f}".format(np.mean(np.array(lraps))) +
      " :loss=" + str(np.mean(test_losses)) +
      " :Samples seen=" + str(test_docs) + "\n")
print("mrr=" + "{0:.5f}".format(np.mean(mrrs)) + "\n")
def build_model(config: TrainBertParams):  # TODO: add typing
    """
    Runs the main loop to build the trained Bert model for multi-label
    emotion classification
    """
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.bert_path, do_lower_case=True)

    # TODO: consider adding stratify for multi-label
    dfx, _, _ = load_data_file(config.emotions_train_file)
    df_train, df_valid = model_selection.train_test_split(
        dfx.sample(n=config.training_sample, random_state=1),
        test_size=config.test_size,
        random_state=42)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(text=df_train.text.values,
                                target=df_train[EMOTIONS].values,
                                tokenizer=tokenizer,
                                max_len=MAX_LEN)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.train_batch_size, num_workers=4)

    valid_dataset = BERTDataset(text=df_valid.text.values,
                                target=df_valid[EMOTIONS].values,
                                tokenizer=tokenizer,
                                max_len=MAX_LEN)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.valid_batch_size, num_workers=1)

    device = torch.device(config.device)
    model = BERTBaseUncased(config.bert_path, config.bert_base_uncase_params)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = config.no_decay_components
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.train_batch_size * config.epochs)
    optimizer = AdamW(optimizer_parameters, lr=_ADAM_LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    current_date = datetime.now().date().isoformat()
    output_model_file = os.path.join(config.output_model_path,
                                     f'emo_bert_model_{current_date}.bin')

    best_accuracy = 0
    for epoch in tqdm(range(config.epochs), desc='epochs'):
        train_model(train_data_loader, model, optimizer, device, scheduler,
                    config.bert_base_uncase_params.num_labels, epoch)
        outputs, targets = evaluate(valid_data_loader, model, device)
        targets = np.array(np.array(targets) >= _TARGETS_THRESHOLD).astype(int)
        outputs = np.array(outputs)
        # LRAP on the validation set serves as the model-selection metric
        accuracy = label_ranking_average_precision_score(targets, outputs)
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), output_model_file)
            best_accuracy = accuracy
    TB_WRITER.close()
def _evaluate_samples(self, y_true, y_pred, metrics='all'):
    y_pred_binary = y_pred > 0.0

    # define the available metrics as lazy lambda functions
    # so we can execute only the ones actually requested
    all_metrics = {
        'Precision (doc avg)':
            lambda: precision_score(y_true, y_pred_binary, average='samples'),
        'Recall (doc avg)':
            lambda: recall_score(y_true, y_pred_binary, average='samples'),
        'F1 score (doc avg)':
            lambda: f1_score(y_true, y_pred_binary, average='samples'),
        'Precision (subj avg)':
            lambda: precision_score(y_true, y_pred_binary, average='macro'),
        'Recall (subj avg)':
            lambda: recall_score(y_true, y_pred_binary, average='macro'),
        'F1 score (subj avg)':
            lambda: f1_score(y_true, y_pred_binary, average='macro'),
        'Precision (weighted subj avg)':
            lambda: precision_score(y_true, y_pred_binary, average='weighted'),
        'Recall (weighted subj avg)':
            lambda: recall_score(y_true, y_pred_binary, average='weighted'),
        'F1 score (weighted subj avg)':
            lambda: f1_score(y_true, y_pred_binary, average='weighted'),
        'Precision (microavg)':
            lambda: precision_score(y_true, y_pred_binary, average='micro'),
        'Recall (microavg)':
            lambda: recall_score(y_true, y_pred_binary, average='micro'),
        'F1 score (microavg)':
            lambda: f1_score(y_true, y_pred_binary, average='micro'),
        'F1@5':
            lambda: f1_score(y_true, filter_pred_top_k(y_pred, 5) > 0.0,
                             average='samples'),
        'NDCG': lambda: ndcg_score(y_true, y_pred),
        'NDCG@5': lambda: ndcg_score(y_true, y_pred, limit=5),
        'NDCG@10': lambda: ndcg_score(y_true, y_pred, limit=10),
        'Precision@1': lambda: precision_at_k_score(y_true, y_pred, limit=1),
        'Precision@3': lambda: precision_at_k_score(y_true, y_pred, limit=3),
        'Precision@5': lambda: precision_at_k_score(y_true, y_pred, limit=5),
        'LRAP':
            lambda: label_ranking_average_precision_score(y_true, y_pred),
        'True positives': lambda: true_positives(y_true, y_pred_binary),
        'False positives': lambda: false_positives(y_true, y_pred_binary),
        'False negatives': lambda: false_negatives(y_true, y_pred_binary),
    }

    if metrics == 'all':
        metrics = all_metrics.keys()

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        return {metric: all_metrics[metric]() for metric in metrics}
def score(y_true, y_pred, item_lst, six_db=False, A=1, B=1, C=1, top_k=150,
          mode='a', file_name='results.txt', save_path=''):
    idx_lst = [1]
    if six_db:
        item_lst = ['AraCyc', 'EcoCyc', 'HumanCyc', 'LeishCyc',
                    'TrypanoCyc', 'YeastCyc']
        idx_lst = [idx for idx in np.arange(len(item_lst))]
    print('\t>> Scores are saved to {0:s}...'.format(str(file_name)))
    for i, idx in enumerate(idx_lst):
        y = y_true
        y_hat = y_pred
        if six_db:
            y = y_true[idx]
            y_hat = y_pred[idx]
            y = y.reshape((1, y.shape[0]))
            y_hat = np.reshape(y_hat, (1, len(y_hat)))
            save_data(data='*** Scores for {0:s}...\n'.format(str(item_lst[i])),
                      file_name=file_name, save_path=save_path, mode=mode,
                      w_string=True, print_tag=False)
        else:
            save_data(data='*** Scores for {0:s}...\n'.format(item_lst[i]),
                      file_name=file_name, save_path=save_path, mode='w',
                      w_string=True, print_tag=False)
        ce_samples = coverage_error(y, y_hat)
        save_data(data='\t\t1)- Coverage error score: {0:.4f}\n'
                  .format(ce_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        lrl_samples = label_ranking_loss(y, y_hat)
        save_data(data='\t\t2)- Ranking loss score: {0:.4f}\n'
                  .format(lrl_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        lrap_samples = label_ranking_average_precision_score(y, y_hat)
        save_data(data='\t\t3)- Label ranking average precision score: '
                       '{0:.4f}\n'.format(lrap_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        if not np.array_equal(y_pred, y_pred.astype(bool)):
            top_k = y_true.shape[1] if top_k > y_true.shape[1] else top_k
            psp_samples = psp(y_prob=y_hat, y_true=y, A=A, B=B, C=C,
                              top_k=top_k)
            save_data(data='\t\t4)- Propensity Scored Precision at {0}: '
                           '{1:.4f}\n'.format(top_k, psp_samples),
                      file_name=file_name, save_path=save_path, mode=mode,
                      w_string=True, print_tag=False)
            ndcg_samples = psndcg(y_prob=y_hat, y_true=y, A=A, B=B, C=C,
                                  top_k=top_k)
            save_data(data='\t\t5)- Propensity Scored nDCG at {0}: '
                           '{1:.4f}\n'.format(top_k, ndcg_samples),
                      file_name=file_name, save_path=save_path, mode=mode,
                      w_string=True, print_tag=False)
            continue
        hl_samples = hamming_loss(y, y_hat)
        save_data(data='\t\t4)- Hamming-Loss score: {0:.4f}\n'
                  .format(hl_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        pr_samples_average = precision_score(y, y_hat, average='samples')
        pr_samples_micro = precision_score(y, y_hat, average='micro')
        pr_samples_macro = precision_score(y, y_hat, average='macro')
        save_data(data='\t\t5)- Precision...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Average sample precision: {0:.4f}\n'
                  .format(pr_samples_average),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Micro precision: {0:.4f}\n'
                  .format(pr_samples_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Macro precision: {0:.4f}\n'
                  .format(pr_samples_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        rc_samples_average = recall_score(y, y_hat, average='samples')
        rc_samples_micro = recall_score(y, y_hat, average='micro')
        rc_samples_macro = recall_score(y, y_hat, average='macro')
        save_data(data='\t\t6)- Recall...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Average sample recall: {0:.4f}\n'
                  .format(rc_samples_average),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Micro recall: {0:.4f}\n'
                  .format(rc_samples_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Macro recall: {0:.4f}\n'
                  .format(rc_samples_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        f1_samples_average = f1_score(y, y_hat, average='samples')
        f1_samples_micro = f1_score(y, y_hat, average='micro')
        f1_samples_macro = f1_score(y, y_hat, average='macro')
        save_data(data='\t\t7)- F1-score...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Average sample f1-score: {0:.4f}\n'
                  .format(f1_samples_average),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Micro f1-score: {0:.4f}\n'
                  .format(f1_samples_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Macro f1-score: {0:.4f}\n'
                  .format(f1_samples_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        js_score_samples = jaccard_score(y, y_hat, average='samples')
        js_score_micro = jaccard_score(y, y_hat, average='micro')
        js_score_macro = jaccard_score(y, y_hat, average='macro')
        js_score_weighted = jaccard_score(y, y_hat, average='weighted')
        save_data(data='\t\t8)- Jaccard score...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (samples): {0:.4f}\n'
                  .format(js_score_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (micro): {0:.4f}\n'
                  .format(js_score_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (macro): {0:.4f}\n'
                  .format(js_score_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (weighted): {0:.4f}\n'
                  .format(js_score_weighted),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        tn, fp, fn, tp = confusion_matrix(y.flatten(), y_hat.flatten()).ravel()
        save_data(data='\t\t9)- Confusion matrix...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> True positive: {0}\n'.format(tp),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> True negative: {0}\n'.format(tn),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> False positive: {0}\n'.format(fp),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> False negative: {0}\n'.format(fn),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
plrloss = []
plraprecision = []
nscoreone = 0
nscoretwo = 0
correctness = 0
for i in range(0, 10):
    p = classifier2.predictor()
    p.learnPredictor()
    n_predicted = p.predict()
    correct = p.mlb.transform(util2.getCorrectGenres(p.testExamples))
    ny_score = np.array(n_predicted)
    y_true = np.array(correct)
    nscoreone += label_ranking_loss(y_true, ny_score)
    nscoretwo += label_ranking_average_precision_score(y_true, ny_score)
    correctness += util2.printCorrectness(p.mlb, p.testExamples,
                                          n_predicted, correct)
# Python 3 print functions (the original used Python 2 print statements)
print("LABEL RANKING LOSS: " + str(float(nscoreone) / 10))
print("LABEL RANKING AVERAGE PRECISION: " + str(float(nscoretwo) / 10))
print("CORRECTNESS: " + str(float(correctness) / 10))
# util2.printAccuracyByGenre(p.mlb, p.testExamples, n_predicted, correct)
# util2.printOutput(p.mlb, p.testExamples, n_predicted, correct)
# print "=========="
# print "PERCENT RESULTS"
# print "LABEL RANKING LOSS:"
# pscoreone = label_ranking_loss(y_true, py_score)
# make predictions on test data
predicted = clf.predict_proba(Y)
# note: this concatenate-then-reshape assumes the per-class probability
# columns end up paired with the right samples; np.column_stack(...) would
# make that pairing explicit
scores = np.reshape(
    np.concatenate((predicted[0][:, 1], predicted[1][:, 1],
                    predicted[2][:, 1])),
    (TEST_SIZE, OUTPUTS))

# compute error for multilabel ranking

# coverage error
# the best value is equal to the avg number of labels in test_label per
# sample, i.e. 2.0550000000000002
cover_err = coverage_error(test_label, scores)
# cov_errs.append(cover_err)
print("RandomForest - Coverage error: " + str(cover_err))

# label ranking average precision score
# best value is 1
lrap_score = label_ranking_average_precision_score(test_label, scores)
# lrap_scores.append(lrap_score)
print("RandomForest - Label ranking avg precision score: " + str(lrap_score))

# compute label ranking loss
# best value is 0
ranking_loss = label_ranking_loss(test_label, scores)
# ranking_losses.append(ranking_loss)
print("RandomForest - Ranking loss: " + str(ranking_loss))

# avg_cov_err = np.mean(cover_err)
# print "RandomForest CV avg coverage error - Estimators " + str(estimators) + " " + str(avg_cov_err)
# avg_lrap_score = np.mean(lrap_scores)
# print "RandomForest CV avg lrap score - Estimators " + str(estimators) + " " + str(avg_lrap_score)
# avg_rank_loss = np.mean(ranking_losses)
def main(): # matplotlib.style.use('ggplot') logging.basicConfig(filename='classify.log', filemode='w', \ level=logging.DEBUG) current_dir = os.path.dirname(os.path.abspath( \ inspect.getfile(inspect.currentframe()))) data_set_file = "./mozilla_firefox/pre_processing_experiments/output_without_cleaning_without_stemming_without_lemmatizing_without_stop_words_removal_without_punctuation_removal_without_numbers_removal.json" # The path of the file which # "./eclipse_jdt/pre_processing_experiments/output_without_cleaning_without_stemming_without_lemmatizing_without_stop_words_removal_without_punctuation_removal_without_numbers_removal.json" # "./mozilla_firefox/pre_processing_experiments/output_without_cleaning_without_stemming_without_lemmatizing_without_stop_words_removal_without_punctuation_removal_without_numbers_removal.json" # contains the pre-processed output # Below, the path of the file which contains a dictionary related # to the mappings of the developers developers_dict_file = "../developers_dict.json" # Below, the path of the file which contains a list of the # relevant distinct developers developers_list_file = "../developers_list.json" np.random.seed(0) # We set the seed start_time = time.time() # We get the time expressed in seconds # since the epoch # First we load the data of the three aforementioned files json_data = load_data_set(data_set_file) developers_dict_data = None # developers_dict_data = load_developers_mappings(developers_dict_file) developers_list_data = None # developers_list_data = load_distinct_developers_list(developers_list_file) # sm = SMOTE(random_state=42) # Then, we build a data frame using the loaded data set, the # loaded developers mappings, the loaded distinct developers. df = build_data_frame(json_data, developers_dict_data, developers_list_data) s = " ".join([tr.lower() for tr in df['text'].tolist()]) wordcloud = WordCloud(max_font_size=40).generate(s) fig = plt.figure() # Display the generated image: # the matplotlib way: plt.imshow(wordcloud) plt.axis("off") save_wordcloud_file = os.path.join(current_dir, "wordcloud.png") if save_wordcloud_file: plt.savefig(save_wordcloud_file, bbox_inches="tight") plt.close(fig) else: plt.show() # TO DO: Fix the lines below later # print("Histogram of the the frequencies of each class") # pd.value_counts(df['class'], sort=False).hist() # pylab.show() # Below, there is a dictionary to store the names, the classifiers # used, the parameters sent to the constructor of the classifiers # and the fitted classifiers models = { \ "RDF": [RandomForestClassifier, { "n_estimators": 50, "n_jobs": -1 }, None], \ # "ExtraTreesClassifier": [ExtraTreesClassifier, { # "n_estimators": 50, # "n_jobs": -1 # }, None], \ "NB": [MultinomialNB, {}, None], \ # "SVM": [SVC, { \ # "kernel": "linear", \ # "probability": True # }, None], \ "Perceptron": [Perceptron, { \ "n_jobs": -1, \ "class_weight": "balanced" }, None], \ "PassiveAggressiveClassifier": [PassiveAggressiveClassifier, { \ "n_jobs": -1, \ "class_weight": "balanced" }, None], \ # "RidgeClassifier": [RidgeClassifier, { \ # "solver": "sag", \ # "normalize": True # }, None], \ "RidgeClassifier (with wrapper)": [OneVsRestClassifier, { \ "n_jobs": -1, \ "estimator": RidgeClassifier(solver="sag", \ normalize=True \ # class_weight="balanced" # Bug: should be fixed ) }, None], \ "Linear SVM": [LinearSVC, { \ "random_state": 0, \ "class_weight": "balanced" \ }, None], \ "Linear SVM (with wrapper)": [OneVsRestClassifier, { \ "n_jobs": -1, \ "estimator": LinearSVC(random_state=0, 
                                   class_weight="balanced")
        }, None],
        "CalibratedClassifierCV (Linear SVM with wrapper)": [CalibratedClassifierCV, {
            "base_estimator": OneVsRestClassifier(
                n_jobs=-1,
                estimator=LinearSVC(random_state=0, class_weight="balanced"))
        }, None],
        "Logistic Regression": [LogisticRegression, {
            "n_jobs": -1,
            "class_weight": "balanced"
            # "multi_class": "multinomial",
            # "solver": "newton-cg"
        }, None],
        "Stochastic Gradient Descent": [SGDClassifier, {
            "n_jobs": -1,
            "n_iter": 50,
            "shuffle": True,
            "class_weight": "balanced"
        }, None],
        "Nearest Centroid": [NearestCentroid, {}, None],
        # "RadiusNeighborsClassifier": [RadiusNeighborsClassifier, {}, None],
        # "LinearDiscriminantAnalysis": [LinearDiscriminantAnalysis, {}, None],
        # "QuadraticDiscriminantAnalysis": [QuadraticDiscriminantAnalysis, {}, None],
        # "K Nearest Neighbors": [KNeighborsClassifier, {}, None],
        # "DecisionTreeClassifier": [DecisionTreeClassifier, {}, None],
        # "Bagging Linear SVM": [BaggingClassifier, {
        #     "base_estimator": LinearSVC(random_state=0,
        #                                 class_weight="balanced"),
        #     "max_samples": 0.5,
        #     "max_features": 0.5,
        #     "random_state": 0,
        #     "n_jobs": -1,
        #     "n_estimators": 100
        # }, None],
        # "Bagging K Nearest Neighbors": [BaggingClassifier, {
        #     "base_estimator": KNeighborsClassifier()
        # }, None],
        # "AdaBoostClassifier": [AdaBoostClassifier, {
        #     "base_estimator": SGDClassifier(loss="log", n_jobs=-1,
        #                                     n_iter=50, shuffle=True,
        #                                     class_weight="balanced"),
        #     "n_estimators": 50
        # }, None],
        # "GradientBoostingClassifier": [GradientBoostingClassifier, {
        #     "n_estimators": 10
        # }, None],
        # "VotingClassifier": [VotingClassifier, {
        #     "estimators": [
        #         ("pac", PassiveAggressiveClassifier(n_jobs=-1)),
        #         ("rc", RidgeClassifier(solver="sag")),
        #         ("lsvc", LinearSVC()),
        #         ("lr", LogisticRegression(n_jobs=-1)),
        #         ("sgdc", SGDClassifier(n_jobs=-1, n_iter=50))
        #     ],
        #     "voting": "hard",
        #     "n_jobs": -1
        # }, None],
        # "VotingClassifier2": [VotingClassifier, {
        #     "estimators": [
        #         ("lr", LogisticRegression(n_jobs=-1)),
        #         ("sgdc", SGDClassifier(loss="modified_huber",
        #                                n_jobs=-1, n_iter=50)),
        #         ("RandomForestClassifier", RandomForestClassifier(
        #             n_estimators=50, n_jobs=-1))
        #     ],
        #     "voting": "soft",
        #     "n_jobs": -1
        # }, None],
        # "StackingClassifier": [StackingClassifier, {
        #     "classifiers": [
        #         SGDClassifier(loss="modified_huber",
        #                       n_jobs=-1, n_iter=50),
        #         CalibratedClassifierCV(base_estimator=OneVsRestClassifier(
        #             n_jobs=-1, estimator=LinearSVC(random_state=0))),
        #         CalibratedClassifierCV(base_estimator=OneVsRestClassifier(
        #             n_jobs=-1, estimator=RidgeClassifier(solver="sag")))
        #     ],
        #     "use_probas": True,
        #     "average_probas": False,
        #     "meta_classifier": LogisticRegression(n_jobs=-1)
        # }, None],
        # "SoftmaxRegression": [SoftmaxRegression, {}, None],
        # "MultiLayerPerceptron": [MultiLayerPerceptron, {}, None]
    }

    # Below, a dictionary to store the accuracy of each classifier
    models_predictions = {}
    chi2_feature_selection = SelectKBest(chi2, k="all")

    print_log("Splitting the data set")
    # Debug
    # df = df[-30000:]
    # Note: the second split point is effectively len(df), so the validation
    # set receives everything after the first 90% and the test set is empty
    train_set, val_set, test_set = np.split(df,
        [int(.9 * len(df)), int(.9999999999999999999999 * len(df))])
    # train_set, val_set, test_set = np.split(df.sample(frac=1),
    #     [int(.6 * len(df)), int(.8 * len(df))])

    print_log("Shape of the initial Data Frame")  # Debug
    print_log(df.shape)  # Debug
    print_log(df['class'].value_counts(normalize=True))
    print_log("Shape of the training set")  # Debug
    print_log(train_set.shape)  # Debug
    print_log(train_set['class'].value_counts(normalize=True))
    print_log("Shape of the validation set")  # Debug
    print_log(val_set.shape)  # Debug
    print_log(val_set['class'].value_counts(normalize=True))
    print_log("Shape of the test set")  # Debug
    print_log(test_set.shape)  # Debug
    print_log(test_set['class'].value_counts(normalize=True))

    print_log("We count the occurrence of each term")  # Debug
    count_vectorizer = CountVectorizer(
        lowercase=False,
        token_pattern=r"(?u)\S+"
        # ngram_range=(1, 2),
        # max_df=0.5,
        # max_features=100000
    )

    print_log("Size of the vocabulary")
    X_train_counts = count_vectorizer.fit_transform(df['text'].values)
    print_log(X_train_counts.shape)
    X_train_counts = count_vectorizer.fit_transform(train_set['text'].values)
    print_log(X_train_counts.shape)

    print_log("Use of the TF-IDF model")  # Debug
    tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=False)
    print_log("Computation of the weights of the TF-IDF model")  # Debug
    X_train = tfidf_transformer.fit_transform(X_train_counts)
    y_train = train_set['class'].values

    # standard_scaler = StandardScaler(with_mean=False)
    # X_train = standard_scaler.fit_transform(X_train, y_train)
    # print("Shape of the training set before over sampling")  # Debug
    # print(X_train.shape)  # Debug
    # X_train, y_train = resample(X_train, y_train, random_state=0)

    X_train = chi2_feature_selection.fit_transform(X_train, y_train)

    # X_train_dense = None
    # y_train_dense = None
    # dense_transformer = DenseTransformer()
    # le = LabelEncoder()
    # X_train, y_train = sm.fit_sample(X_train.toarray(), y_train)
    # if hasattr(X_train, 'dtype') and np.issubdtype(X_train.dtype, np.float):
    #     # preserve float family dtype
    #     X_train = sp.csr_matrix(X_train)
    # else:
    #     # convert counts or binary occurrences to floats
    #     X_train = sp.csr_matrix(X_train, dtype=np.float64)
    # print("Shape of the training set after over sampling")  # Debug
    # print(X_train.shape)
    # print(pd.Series(y_train).value_counts(normalize=True))

    print_log("Training of the models")  # Debug
    for key, value in models.items():
        print_log(key)
        # if key == "SoftmaxRegression" or key == "MultiLayerPerceptron":
        #     X_train_dense = dense_transformer.fit_transform(X_train)
        #     y_train_dense = le.fit_transform(y_train)
        #     models[key][-1] = models[key][0](minibatches=1) \
        #         .fit(X_train_dense, y_train_dense)
        # elif key == "LinearDiscriminantAnalysis":
        #     models[key][-1] = models[key][0](**models[key][1]) \
        #         .fit(X_train.toarray(), y_train)
        # else:
        models[key][-1] = models[key][0](**models[key][1]) \
            .fit(X_train, y_train)

    print_log("--- {} seconds ---".format(time.time() - start_time))
    print_log("We count the occurrence of each term in the val. " +
" + \ "set") # Debug X_val_counts = count_vectorizer \ .transform(val_set['text'].values) print_log("Computation of the weights of the TF-IDF model " + \ "for the validation set") # Debug X_val = tfidf_transformer.transform(X_val_counts) # X_val = standard_scaler.transform(X_val) X_val = chi2_feature_selection.transform(X_val) y_val = val_set['class'].values # X_val_dense = None # y_val_dense = None print_log("Making predictions") # Debug for key, value in models.items(): print_log(key) # if key == "SoftmaxRegression" or key == "MultiLayerPerceptron": # X_val_dense = dense_transformer.transform(X_val) # y_val_dense = le.transform(y_val) # models_predictions[key] = np.mean(value[-1] \ # .predict(X_val_dense) == y_val_dense) # else: # if key == "LinearDiscriminantAnalysis": # models_predictions[key] = np.mean(value[-1] \ # .predict(X_val.toarray()) == y_val) # else: models[key].append(value[-1] \ .predict(X_val)) models_predictions[key] = np.mean(models[key][-1] == y_val) print_log("--- {} seconds ---".format(time.time() - start_time)) # Below, we print the accuracy of each classifier for key, value in models_predictions.items(): print_log("Accuracy of {}".format(key)) # Debug print_log(value) # Debug print_log("Predicted labels") print_log(models[key][-1]) print_log("True labels") print_log(y_val) try: if callable(getattr(models[key][-2], "predict_proba")): # print_log(models[key][-2].classes_) # print_log(models[key][-2].predict_proba(X_val)) lb = LabelBinarizer() _ = lb.fit_transform(models[key][-2].classes_) # print_log(lb.classes_) # print_log(y_classes_bin) # print_log(lb.transform(["exclude"])) y_val_bin = lb.transform(y_val) print_log("Mean Reciprocal Rank:") print_log(label_ranking_average_precision_score( \ y_val_bin, models[key][-2].predict_proba(X_val))) except AttributeError: pass print_log("Detailed report:") print_log(classification_report(y_val, models[key][-1])) print_log("Confusion matrix of Linear SVM (with wrapper):") cm = confusion_matrix(y_val, models["Linear SVM"][-1], labels=df['class'].unique()) print_log(df['class'].unique()) print_log(cm) df_cm = pd.DataFrame(cm, index=df['class'].unique(), columns=df['class'].unique()) fig = plt.figure(figsize=(20.0, 12.5)) sn.set(font_scale=0.5) sn.heatmap(df_cm, annot=True, annot_kws={"size": 8}) save_file = os.path.join(current_dir, "confusion_matrix.png") if save_file: plt.savefig(save_file, bbox_inches="tight") plt.close(fig) else: plt.show() print_log("--- {} seconds ---".format(time.time() - start_time)) # We dump the data frame df.to_csv("pre_processed_data.csv")
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

# In our use case, 1 means that the image is relevant (same label as the
# query image) and 0 means that the image is irrelevant
y_true = np.array([[1, 1, 0, 0]])

# For each train image we compute a relevance score

# Example 1: the two relevant items have the highest scores, so the
# scoring function returns 1.0
y_score = np.array([[28, 10, 1, 0.5]])
label_ranking_average_precision_score(y_true, y_score)  # returns 1.0

# Example 2: the second relevant item is tied with an irrelevant one
y_score = np.array([[28, 10, 10, 0.5]])
label_ranking_average_precision_score(y_true, y_score)  # returns 0.83333333333333326

# Example 3: the first relevant item is tied with an irrelevant one
y_score = np.array([[28, 10, 28, 0.5]])
label_ranking_average_precision_score(y_true, y_score)  # returns 0.58333333333333326

# Example 4: both irrelevant items outrank both relevant ones
y_score = np.array([[10, 10, 28, 28]])
label_ranking_average_precision_score(y_true, y_score)  # returns 0.5
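# A hand-rolled cross-check (not part of the original snippet, Python 3) of
# how the sklearn score treats the ties above: for each relevant item, both
# its rank and its hit count are computed with >= comparisons, so tied
# scores share the same rank.
def lrap_one_sample(y_true_row, y_score_row):
    # average, over the relevant items, of
    # (relevant items ranked at or above this one) / (items ranked at or above this one)
    precisions = []
    for label, score in zip(y_true_row, y_score_row):
        if not label:
            continue
        rank = sum(1 for s in y_score_row if s >= score)
        hits = sum(1 for l, s in zip(y_true_row, y_score_row) if l and s >= score)
        precisions.append(hits / rank)
    return sum(precisions) / len(precisions)

print(lrap_one_sample([1, 1, 0, 0], [28, 10, 10, 0.5]))  # 0.8333..., matches Example 2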
for j in range(splitter):
    # integer division (//) keeps the slice bounds integral; plain / yields
    # a float index under Python 3 and breaks the slicing
    block = labels.shape[0] // splitter
    test_loss, test_y_block = model.test_on_batch(
        [X_w[:, j * block:(j + 1) * block, :],
         X_t[:, j * block:(j + 1) * block, :]],
        labels[j * block:(j + 1) * block])
    test_y[j * block:(j + 1) * block] = test_y_block
else:  # pairs with a condition above this excerpt
    test_loss, test_y = model.test_on_batch([X_w, X_t], labels)

lraps.append(label_ranking_average_precision_score(
    np.reshape(np.array(labels), test_y.shape).T, test_y.T))
mrr, recall, prec = test_utils.print_accuracy_results(
    np.array(labels), np.reshape(test_y, np.array(labels).shape))
mrrs.append(mrr)
recalls.append(recall)
precs.append(prec)
losses.append(test_loss)
test_losses.append(test_loss)
if len(losses) % 100 == 0:
    progbar.update(i, values=[("loss", np.sum(losses))])
    losses = []
samples_seen += len(labels)

print("\nSkipped=" + str(skipped))
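# Side note (illustrative, not from the original code): the sklearn scorer
# expects (n_samples, n_labels) arrays, which is why the snippet above
# reshapes and transposes before scoring; and because the metric depends
# only on the ordering of the scores, any strictly increasing transform of
# the scores leaves it unchanged.
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 1, 0]])
scores = np.array([[0.9, 0.1, 0.8, 0.2]])
assert (label_ranking_average_precision_score(y_true, scores)
        == label_ranking_average_precision_score(y_true, 10 * scores - 3))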
def eval_turn(Config, model, data_loader, val_version, epoch_num, log_file):
    model.train(False)
    val_corrects1 = 0
    val_corrects2 = 0
    val_corrects3 = 0
    val_size = data_loader.__len__()
    item_count = data_loader.total_item_len
    t0 = time.time()
    get_l1_loss = nn.L1Loss()
    get_ce_loss = nn.CrossEntropyLoss()
    val_batch_size = data_loader.batch_size
    val_epoch_step = data_loader.__len__()
    num_cls = data_loader.num_cls
    val_loss_recorder = LossRecord(val_batch_size)
    val_celoss_recorder = LossRecord(val_batch_size)
    print('evaluating %s ...' % val_version, flush=True)
    sum_fbeta = 0
    with torch.no_grad():
        for batch_cnt_val, data_val in enumerate(data_loader):
            # inputs = Variable(data_val[0].cuda())
            # labels = Variable(torch.LongTensor(np.array(data_val[1])).long().cuda())
            inputs, labels, labels_swap, swap_law, img_names = data_val
            labels_npy = np.array(labels)
            labels_tensor = Variable(torch.FloatTensor(labels_npy).cuda())
            labels_ = labels_npy.astype(np.uint8)
            inputs = Variable(inputs.cuda())
            outputs = model(inputs)

            loss = 0
            # ce_loss = get_ce_loss(outputs[0], labels).item()
            ce_loss = get_sigmoid_ce(outputs[0], labels_tensor).item()
            loss += ce_loss
            val_loss_recorder.update(loss)
            val_celoss_recorder.update(ce_loss)

            if Config.use_dcl and Config.cls_2xmul:
                outputs_pred = outputs[0] + outputs[1][:, 0:num_cls] \
                    + outputs[1][:, num_cls:2 * num_cls]
            else:
                outputs_pred = outputs[0]

            # MAP here is label-ranking based, so the scores do not need normalization
            # predict_multensor = torch.ge(outputs_pred, 0.5)  # set entries >= 0.5 to one and the rest to zero, i.e. thresholding
            predict_mul = outputs_pred.cpu().numpy()
            temp_fbeta = label_ranking_average_precision_score(labels_, predict_mul)

            # dy's modification: micro precision
            # cor_sum = 0
            # num_sum = 0
            # for j in range(10):
            #     query_col = labels_[j, :]
            #     label_col = predict_mul[j, :]
            #     index = np.where(label_col > 0.5)
            #     index_ = index[0]
            #     number_ = index_.size
            #     query_binary = query_col[index]
            #     query_label = label_col[index]
            #     batch_corrects1 = np.count_nonzero(query_binary == query_label)
            #     cor_sum = cor_sum + batch_corrects1
            #     num_sum = num_sum + number_
            # temp_fbeta = cor_sum / num_sum

            sum_fbeta = sum_fbeta + temp_fbeta
            ave_num = batch_cnt_val + 1

            # top3_val, top3_pos = torch.topk(outputs_pred, 3)
            # print('{:s} eval_batch: {:-6d} / {:d} loss: {:8.4f}'.format(val_version, batch_cnt_val, val_epoch_step, loss), flush=True)
            # batch_corrects1 = torch.sum((top3_pos[:, 0] == labels)).data.item()
            # val_corrects1 += batch_corrects1
            # batch_corrects2 = torch.sum((top3_pos[:, 1] == labels)).data.item()
            # val_corrects2 += (batch_corrects2 + batch_corrects1)
            # batch_corrects3 = torch.sum((top3_pos[:, 2] == labels)).data.item()
            # val_corrects3 += (batch_corrects3 + batch_corrects2 + batch_corrects1)

    # val_acc1 = val_corrects1 / item_count
    # val_acc2 = val_corrects2 / item_count
    # val_acc3 = val_corrects3 / item_count
    # log_file.write(val_version + '\t' + str(val_loss_recorder.get_val()) + '\t' + str(val_celoss_recorder.get_val()) + '\t' + str(val_acc1) + '\t' + str(val_acc3) + '\n')
    # t1 = time.time()
    # since = t1 - t0
    # print('--' * 30, flush=True)
    # print('% 3d %s %s %s-loss: %.4f ||%s-acc@1: %.4f %s-acc@2: %.4f %s-acc@3: %.4f ||time: %d' % (epoch_num, val_version, dt(), val_version, val_loss_recorder.get_val(init=True), val_version, val_acc1, val_version, val_acc2, val_version, val_acc3, since), flush=True)
    # print('--' * 30, flush=True)
    # return val_acc1, val_acc2, val_acc3

    ave_acc = sum_fbeta / ave_num
    log_file.write(val_version + '\t' + str(val_loss_recorder.get_val()) + '\t'
                   + str(val_celoss_recorder.get_val()) + '\t'
                   + str(ave_acc) + '\n')
    t1 = time.time()
    since = t1 - t0
    print('--' * 30, flush=True)
    print('% 3d %s %s %s-loss: %.4f ||%s-ave@acc: %.4f ||time: %d'
          % (epoch_num, val_version, dt(), val_version,
             val_loss_recorder.get_val(init=True), val_version, ave_acc, since),
          flush=True)
    print('--' * 30, flush=True)
    return ave_acc
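# Observation (illustrative, not from the original code): averaging per-batch
# LRAP values, as eval_turn does, weights every batch equally even when the
# last batch is smaller, whereas scoring all accumulated predictions at once
# weights every sample equally. The toy arrays below are invented to show
# the gap between the two.
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y = np.array([[1, 0], [0, 1], [1, 0]])
s = np.array([[0.8, 0.2], [0.4, 0.6], [0.3, 0.7]])
per_batch = np.mean([label_ranking_average_precision_score(y[:2], s[:2]),
                     label_ranking_average_precision_score(y[2:], s[2:])])
overall = label_ranking_average_precision_score(y, s)
print(per_batch, overall)  # 0.75 vs 0.8333...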
# else:
output_prob = torch.sigmoid(output)
if output_probs is None:
    output_probs = to_np(output_prob)
else:
    output_probs = np.concatenate([output_probs, to_np(output_prob)], axis=0)

if total_output_probs is None:
    total_output_probs = output_probs * model_weight
else:
    total_output_probs += (output_probs * model_weight)

predict_vector = np.argmax(total_output_probs, axis=1)
ranking_ap_score = label_ranking_average_precision_score(total_tags, total_output_probs)
ranking_loss = label_ranking_loss(total_tags, total_output_probs)
label_vector = np.argmax(total_tags, axis=1)
bool_vector = predict_vector == label_vector
accuracy = bool_vector.sum() / len(bool_vector)
ens_ranking_ap_score = ranking_ap_score
ens_ranking_loss = ranking_loss
print('Ens Val [{}] Acc {:2.4f} / Rank AP {:2.4f} / Rank Loss {:2.4f}'.format(
    datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
    accuracy, ens_ranking_ap_score, ens_ranking_loss))

m_name_list = m_names.split(",")
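# A minimal sketch (not the original pipeline) of the weighted soft-voting
# idea above: per-model sigmoid probabilities are scaled by a weight and
# summed before the ranking metrics are computed; model_outputs and weights
# are made-up stand-ins for the real per-model logits and model_weight values.
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

model_outputs = [np.array([[2.0, -1.0], [0.5, 1.5]]),   # logits from model A
                 np.array([[1.0, 0.0], [-0.5, 2.0]])]   # logits from model B
weights = [0.7, 0.3]
total = sum(w * sigmoid(o) for w, o in zip(weights, model_outputs))

y_true = np.array([[1, 0], [0, 1]])
print(label_ranking_average_precision_score(y_true, total))  # 1.0 here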
def retrieve_closest_images(test_element, test_label, n_samples=10):
    # test_element = x_test[test_element]
    # print(test_element)
    test_label = y_test[test_label]
    learned_codes = encoder.predict(x_train)
    learned_codes = learned_codes.reshape(
        learned_codes.shape[0],
        learned_codes.shape[1] * learned_codes.shape[2] * learned_codes.shape[3])
    test_code = encoder.predict(np.array([test_element]))
    test_code = test_code.reshape(
        test_code.shape[1] * test_code.shape[2] * test_code.shape[3])

    distances = []
    for code in learned_codes:
        distance = np.linalg.norm(code - test_code)
        distances.append(distance)
    nb_elements = learned_codes.shape[0]
    distances = np.array(distances)
    learned_code_index = np.arange(nb_elements)

    # Binary relevance: 1 if the train image shares the query's label, else 0
    labels = np.copy(y_train).astype('float32')
    labels[labels != test_label] = -1
    labels[labels == test_label] = 1
    labels[labels == -1] = 0
    labels = labels.reshape(distances.shape)

    distance_with_labels = np.stack((distances, labels, learned_code_index), axis=-1)
    sorted_distance_with_labels = distance_with_labels[distance_with_labels[:, 0].argsort()]

    # Turn distances into relevance scores (higher = closer); since LRAP only
    # depends on the ordering, any decreasing transform of the distance works,
    # 28 apparently being an upper bound on the distances here
    sorted_distances = 28 - sorted_distance_with_labels[:, 0]
    sorted_labels = sorted_distance_with_labels[:, 1]
    sorted_indexes = sorted_distance_with_labels[:, 2]
    kept_indexes = sorted_indexes[:n_samples]

    score = label_ranking_average_precision_score(
        np.array([sorted_labels[:n_samples]]),
        np.array([sorted_distances[:n_samples]]))
    print("Average precision ranking score for tested element is {}".format(score))

    # original_image = x_test[70]
    original_image = test_element
    retrieved_images_labels = []
    # cv2.imshow('original_image', original_image)
    # retrieved_images = []
    retrieved_images = x_train[int(kept_indexes[0]), :]
    for i in range(1, n_samples):
        retrieved_images = np.hstack((retrieved_images, x_train[int(kept_indexes[i]), :]))
        # retrieved_images.append(x_train[int(kept_indexes[i]), :])
    for i in range(0, n_samples):
        retrieved_images_labels.append(y_train[int(kept_indexes[i]), :])

    print("Retrieved labels:")
    labels = []
    class_names = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat",
                   4: "deer", 5: "dog", 6: "frog", 7: "horse",
                   8: "ship", 9: "truck"}
    for label in retrieved_images_labels:
        name = class_names.get(int(label[0]))
        if name is None:
            print(label[0])
            continue
        print(name)
        # the original code appends the shorthand "auto" for this class
        labels.append("auto" if name == "automobile" else name)

    # cv2.imshow('Results', retrieved_images)
    # cv2.waitKey(0)
    # cv2.imwrite('E:/Facultate/unsupervises-image-retrieval/test_results_64v3/original_image5.jpg', 255 * cv2.resize(original_image, (0, 0), fx=3, fy=3))
    # cv2.imwrite('E:/Facultate/unsupervises-image-retrieval/test_results_64v3/retrieved_results5.jpg', 255 * cv2.resize(retrieved_images, (0, 0), fx=2, fy=2))
    return (255 * cv2.resize(original_image, (0, 0), fx=3, fy=3),
            255 * cv2.resize(retrieved_images, (0, 0), fx=2, fy=2),
            score, labels)
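# Illustrative check (not from the original function): because
# label_ranking_average_precision_score is purely rank-based, scoring with
# negated distances gives the same result as the "28 - distance" transform
# used above, without assuming any bound on the distances.
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

relevance = np.array([[1, 0, 1, 0]])
dists = np.array([[0.3, 2.1, 0.9, 5.0]])
assert (label_ranking_average_precision_score(relevance, 28 - dists)
        == label_ranking_average_precision_score(relevance, -dists))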