def main(
    prediction_filenames,
    submission_filename,
):
    """Average several prediction files and write a submission CSV.

    Every file in ``prediction_filenames`` is read with ``load_predictions``;
    the results are stacked with ``np.vstack`` and averaged over axis 0.  The
    averaged rows are paired, in order, with the sorted directory listing of
    ``config.TEST_DIR_PATH``; each row's arg-max index is translated through
    ``idx_to_label``.  The labels are merged into the sample submission on
    the ``fname`` column and saved under ``config.SUBMISSIONS_PATH``.

    Args:
        prediction_filenames: Iterable of paths accepted by
            ``load_predictions``.
        submission_filename: File name of the CSV created inside
            ``config.SUBMISSIONS_PATH``.
    """
    stacked = np.vstack(
        [load_predictions(path) for path in prediction_filenames]
    )
    averaged = stacked.mean(axis=0)

    # Sorted listing: the pairing below relies on this order matching the
    # order of the prediction rows -- TODO confirm against the code that
    # produced the prediction files.
    test_files = sorted(os.listdir(config.TEST_DIR_PATH))
    label_of = {
        fname: idx_to_label[np.argmax(scores)]
        for fname, scores in zip(test_files, averaged)
    }

    # Keep every sample-submission column except the placeholder labels.
    template = pd.read_csv(config.SAMPLE_SUBMISSION_PATH)
    template = template.drop('label', axis=1)

    labels_frame = pd.DataFrame(
        list(label_of.items()),
        columns=['fname', 'label'],
    )
    merged = template.merge(labels_frame, on='fname')

    target = os.path.join(config.SUBMISSIONS_PATH, submission_filename)
    merged.to_csv(target, index=False)
def process(exp, reference, query, tag=None):
    """Plot prediction heatmaps and a Sankey diagram for one experiment.

    Reads ``<exp>-preds.csv`` from ``CUR_DIR`` and the cluster names from
    ``<parent of CUR_DIR>/00-metadata/<query>_clusters.csv``, then writes the
    following files into ``CUR_DIR``:

    * ``<exp>-heatmap.pdf`` -- heatmap of all predictions;
    * ``<exp>-bcells-heatmap.pdf`` and ``<exp>-plasma-heatmap.pdf`` -- only
      when ``query == 'SC03'``, restricted to the rows whose cluster
      assignment is 2 and 7 respectively;
    * ``<exp>-sankey.pdf`` -- cluster names against the top prediction;
    * ``<exp>-f1.txt`` -- only when ``sickle.mapping`` returns a truthy value.

    Args:
        exp: Experiment name, used as the prefix of every file read/written.
        reference: Forwarded to ``utils.load_predictions`` and
            ``sickle.mapping``.
        query: Query dataset name; selects the cluster file and enables the
            extra SC03-specific heatmaps.
        tag: Forwarded unchanged to ``sankey.sankey``.
    """
    # presumably a pandas Series of per-cell cluster ids -- verify against
    # sickle.assignments
    exp_clusters = sickle.assignments(query)

    metadata_dir = os.path.join(os.path.dirname(CUR_DIR), '00-metadata')
    clusters = pd.read_csv('{}/{}_clusters.csv'.format(metadata_dir, query),
                           index_col=0, header=None)
    # Index values look like "C<number>"; drop the "C" so they can be looked
    # up with the integer values held in exp_clusters.
    clusters.index = clusters.index.str.replace('C', '').astype(int)

    preds = utils.load_predictions(
        os.path.join(CUR_DIR, '{}-preds.csv'.format(exp)), reference)

    # Second-max heatmaps are currently disabled:
    # second_maxes = get_second_maxes(preds)
    # maxes_heatmap = plot_second_maxes(second_maxes)
    # maxes_heatmap.savefig(os.path.join(CUR_DIR, '{}-second-max-heatmap.png'.format(exp)))
    #
    # if query == 'SC03':
    #     plasma_second_maxes = get_second_maxes(preds.loc[exp_clusters.index[exp_clusters == 7],:])
    #     maxes_heatmap = plot_second_maxes(plasma_second_maxes)
    #     plt.suptitle('Second max predictions for Plasma cells')
    #     maxes_heatmap.savefig(os.path.join(CUR_DIR, '{}-plasma-second-max-heatmap.png'.format(exp)))

    hmap = heatmap(preds)
    hmap.savefig(os.path.join(CUR_DIR, '{}-heatmap.pdf'.format(exp)))

    if query == 'SC03':
        hmap = heatmap(preds.loc[exp_clusters.index[exp_clusters == 2], :],
                       label=r'$\it{B\ cells}$ from SC03 dataset',
                       figsize=(16, 8), subfig='A')
        hmap.savefig(os.path.join(CUR_DIR, '{}-bcells-heatmap.pdf'.format(exp)))

        hmap = heatmap(preds.loc[exp_clusters.index[exp_clusters == 7], :],
                       label=r'$\it{Plasma\ cells}$ from SC03 dataset',
                       figsize=(16, 8), subfig='B')
        hmap.savefig(os.path.join(CUR_DIR, '{}-plasma-heatmap.pdf'.format(exp)))

    mapping = sickle.mapping(query, reference)

    # Both the Sankey plot and the F1 score compare the same two series, so
    # build them once.
    cluster_names = clusters.iloc[:, 0].loc[exp_clusters]
    top_predictions = preds.idxmax(axis=1)

    s = sankey.sankey(cluster_names, top_predictions,
                      alpha=.7, left_order=sickle.sankey_order(),
                      mapping=mapping, tag=tag)
    s.savefig(os.path.join(CUR_DIR, '{}-sankey.pdf'.format(exp)))

    if mapping:
        f1_path = os.path.join(CUR_DIR, '{}-f1.txt'.format(exp))
        # Context manager guarantees the file is flushed and closed.
        with open(f1_path, 'w') as f1_file:
            f1_file.write('F1 score: {:.4f}'.format(
                mapping.f1(cluster_names, top_predictions)))
def process(exp, reference, query, tag=None):
    """Plot second-max and prediction heatmaps plus a Sankey diagram.

    Reads ``<exp>-preds.csv`` from ``CUR_DIR`` and the cluster names from
    ``<parent of CUR_DIR>/00-metadata/<query>_clusters.csv``, then writes the
    following files into ``CUR_DIR``:

    * ``<exp>-second-max-heatmap.png`` -- plot of ``get_second_maxes(preds)``;
    * ``<exp>-heatmap.pdf`` -- heatmap of all predictions;
    * ``<exp>-plasma-second-max-heatmap.png`` and
      ``<exp>-plasma-heatmap.pdf`` -- only when ``query == 'SC03'``,
      restricted to the rows whose cluster assignment is 7;
    * ``<exp>-sankey.pdf`` -- cluster names against the top prediction;
    * ``<exp>-f1.txt`` -- only when ``sickle.mapping`` returns a truthy value.

    Args:
        exp: Experiment name, used as the prefix of every file read/written.
        reference: Forwarded to ``utils.load_predictions`` and
            ``sickle.mapping``.
        query: Query dataset name; selects the cluster file and enables the
            extra SC03-specific plots.
        tag: Forwarded unchanged to ``sankey.sankey``.
    """
    # presumably a pandas Series of per-cell cluster ids -- verify against
    # sickle.assignments
    exp_clusters = sickle.assignments(query)

    metadata_dir = os.path.join(os.path.dirname(CUR_DIR), '00-metadata')
    clusters = pd.read_csv(
        '{}/{}_clusters.csv'.format(metadata_dir, query),
        index_col=0, header=None
    )
    # Index values look like "C<number>"; drop the "C" so they can be looked
    # up with the integer values held in exp_clusters.
    clusters.index = clusters.index.str.replace('C', '').astype(int)

    preds = utils.load_predictions(
        os.path.join(CUR_DIR, '{}-preds.csv'.format(exp)), reference
    )

    second_maxes = get_second_maxes(preds)
    maxes_heatmap = plot_second_maxes(second_maxes)
    maxes_heatmap.savefig(
        os.path.join(CUR_DIR, '{}-second-max-heatmap.png'.format(exp)))

    if query == 'SC03':
        plasma_second_maxes = get_second_maxes(
            preds.loc[exp_clusters.index[exp_clusters == 7], :])
        maxes_heatmap = plot_second_maxes(plasma_second_maxes)
        # Titles whatever figure pyplot considers current, which is assumed
        # to be the one just created by plot_second_maxes.
        plt.suptitle('Second max predictions for Plasma cells')
        maxes_heatmap.savefig(
            os.path.join(CUR_DIR,
                         '{}-plasma-second-max-heatmap.png'.format(exp)))

    hmap = heatmap(preds)
    hmap.suptitle('Predictions for {} dataset'.format(query))
    hmap.subplots_adjust(top=0.88)  # leave room for the title
    hmap.savefig(os.path.join(CUR_DIR, '{}-heatmap.pdf'.format(exp)))

    if query == 'SC03':
        hmap = heatmap(preds.loc[exp_clusters.index[exp_clusters == 7], :],
                       figsize=(16, 12))
        hmap.suptitle('Predictions for SC03 Plasma cells')
        hmap.subplots_adjust(top=0.88)
        hmap.savefig(os.path.join(CUR_DIR, '{}-plasma-heatmap.pdf'.format(exp)))

    # NOTE(review): resets seaborn's global font scale before the Sankey
    # plot; confirm this was meant to run for every query and not only for
    # SC03.
    seaborn.set(font_scale=1)

    mapping = sickle.mapping(query, reference)

    # Both the Sankey plot and the F1 score compare the same two series, so
    # build them once.
    cluster_names = clusters.iloc[:, 0].loc[exp_clusters]
    top_predictions = preds.idxmax(axis=1)

    s = sankey.sankey(
        cluster_names, top_predictions,
        alpha=.7, left_order=sickle.sankey_order(),
        mapping=mapping, tag=tag
    )
    s.savefig(os.path.join(CUR_DIR, '{}-sankey.pdf'.format(exp)))

    if mapping:
        f1_path = os.path.join(CUR_DIR, '{}-f1.txt'.format(exp))
        # Context manager guarantees the file is flushed and closed.
        with open(f1_path, 'w') as f1_file:
            f1_file.write(
                'F1 score: {:.4f}'.format(
                    mapping.f1(cluster_names, top_predictions)
                )
            )
def draw_(img_dir, out_dir, json_file, threshold=0.0001):
    """Run detection over the images named in a predictions file.

    Loads the predictions from ``json_file``, records the first two entries
    of each referenced image's ``shape``, passes those sizes together with
    ``img_dir`` to ``detect_images`` and dumps the result to
    ``<out_dir>/<run id>/detections.json``.  The run id is the part of
    ``json_file`` after its last underscore, without the ``.json`` extension.

    The drawing code this function is named after is currently commented
    out, so no image is written.

    Args:
        img_dir: Directory containing the images named in the predictions.
        out_dir: Directory under which the per-run sub-directory is created.
        json_file: Path of the predictions file.
        threshold: Only referenced by the disabled drawing code; kept so
            existing callers keep working.
    """
    predictions = load_predictions(json_file)

    # Remove the extension explicitly: str.rstrip('.json') strips any
    # trailing run of the characters ".", "j", "s", "o", "n", so it would
    # turn e.g. "session.json" into "sessi".
    exec_time = json_file.rpartition('_')[-1]
    if exec_time.endswith('.json'):
        exec_time = exec_time[:-len('.json')]

    pfd_pgs_sizes = []
    for img_name, result in predictions.items():
        img_path = f'{img_dir}/{img_name}'
        # NOTE(review): cv2.imread returns None for a missing/unreadable
        # file, in which case the next line raises AttributeError.
        img = cv2.imread(img_path)
        pfd_pgs_sizes.append(img.shape[:2])
        # Disabled drawing logic:
        # new_img = img.copy()
        # img_obj = Image(Path(img_path))
        # for obj in result:
        #     obj_label = obj['label']
        #     if obj_label == 'Bordered':
        #         table_bbox = obj['bbox']
        #         table_img = crop_img_to_bbox(img, table_bbox)
        #         boxes = recognize_bordered_table(table_img, threshold)
        #         new_img = draw_boxes(new_img, boxes, table_bbox[:2], stroke=3)
        #         new_img = draw_boxes(new_img, [convert_to_xywh(table_bbox)], color=(255, 0, 0), stroke=4)
        # detect_bordered_tables_on_image(img_obj)

    detection_json = detect_images(Path(img_dir), pfd_pgs_sizes)

    subdir = f'{out_dir}/{exec_time}'
    os.makedirs(subdir, exist_ok=True)
    with open(f'{subdir}/detections.json', 'w') as f:
        json.dump(detection_json, f)
def evaluate(predictions_file, labels_files, output_dir):
    """Score a predictions file against weighted labels and write a TSV.

    Labels, IDs and weights are collected from every file in
    ``labels_files`` via ``load_weighted_labels``.  A predicted ID is scored
    only if it appears in the labels with weight ``1`` and a label other
    than ``0`` (the ambiguous marker).  The surviving predictions are sorted
    by descending probability and the following values are written as a
    header line plus a value line, tab separated, to
    ``<output_dir>/<basename of predictions_file>``:

    * ``auROC`` -- ``roc_auc_score``;
    * ``auPRC`` -- ``log2(average_precision_score / prior)``;
    * ``log(P@r<k>/prior)`` -- ``log2`` of precision in the top ``k``
      divided by the prior, for ``k`` in 10, 20, 50;
    * ``R@r<k>`` -- recall in the top ``k``, for the same ranks.

    ``prior`` is the fraction of scored predictions whose label is ``1``.

    A ``ValueError`` raised while computing the metrics (including the case
    where no prediction is eligible) is reported with ``print`` and no
    output file is written.

    Args:
        predictions_file: Path understood by ``load_predictions``.
        labels_files: Iterable of paths understood by
            ``load_weighted_labels``.
        output_dir: Directory that receives the result file.
    """
    IDs = []
    labels = []
    weights = []
    for labels_file in labels_files:
        file_IDs, file_labels, file_weights = \
            load_weighted_labels(labels_file)
        IDs += file_IDs
        labels += list(file_labels)
        weights += list(file_weights)
    labels = np.array(labels)
    weights = np.array(weights)

    # Position of the first occurrence of every ID.  This mirrors what
    # ``IDs.index(ID)`` returns but makes each lookup O(1) instead of O(n).
    first_position = {}
    for position, ID in enumerate(IDs):
        first_position.setdefault(ID, position)

    predicted_IDs, probs = load_predictions(predictions_file)
    evaluate_IDs = []
    evaluate_labels = []
    evaluate_probs = []
    for i, ID in enumerate(predicted_IDs):
        position = first_position.get(ID)
        if position is None:
            continue
        # only evaluate if...
        # ID has weight = 1 in the original labels
        # and ID is not ambiguous in the original labels
        if weights[position] == 1 and labels[position] != 0:
            evaluate_IDs.append(ID)
            evaluate_labels.append(labels[position])
            evaluate_probs.append(probs[i])
    evaluate_IDs = np.array(evaluate_IDs)
    evaluate_labels = np.array(evaluate_labels)
    evaluate_probs = np.array(evaluate_probs)

    try:
        if len(evaluate_labels) == 0:
            # Without this the prior below would divide by zero; report it
            # through the same path as the other metric failures.
            raise ValueError("no predictions are eligible for evaluation")

        # Highest probability first, so that "top rank" slices work.
        sort_index = evaluate_probs.argsort()[::-1]
        evaluate_IDs = evaluate_IDs[sort_index]
        evaluate_labels = evaluate_labels[sort_index]
        evaluate_probs = evaluate_probs[sort_index]

        is_positive = evaluate_labels == 1
        n_positive = np.sum(is_positive)
        prior = n_positive / len(evaluate_labels)
        ranks = [10, 20, 50]

        auROC = roc_auc_score(evaluate_labels, evaluate_probs)
        auPRC = np.log2(
            average_precision_score(evaluate_labels, evaluate_probs) / prior)
        # precision = TP / (TP + FP)
        precisions = \
            [np.log2((np.sum(is_positive[:rank]) / rank) / prior)
             for rank in ranks]
        # recall = TP / (TP + FN)
        recalls = \
            [np.sum(is_positive[:rank]) / n_positive for rank in ranks]

        precision_headers = \
            ["log(P@r{}/prior)".format(rank) for rank in ranks]
        recall_headers = ["R@r{}".format(rank) for rank in ranks]
        values = [auROC] + [auPRC] + precisions + recalls
        headers = \
            ["auROC"] + ["auPRC"] + precision_headers + recall_headers

        output_file = \
            os.path.join(output_dir, os.path.basename(predictions_file))
        with open(output_file, "w") as f:
            # Two columns for the AUCs plus one precision and one recall
            # column per rank.
            blank_string = "\t".join(["{}"] * (2 * len(ranks) + 2)) + "\n"
            f.write(blank_string.format(*headers))
            f.write(blank_string.format(*values))
    except ValueError as e:
        print("ValueError:", e)
###############################################################################
from utils import load_predictions, generate_metrics, save_pandas_summary
from utils import summarize_metrics_folds, pandanize_summary

# Script entry point: load the predictions, compute the metrics, summarise
# them across folds and store the summary, printing progress along the way.
if __name__ == "__main__":
    # print(...) with one argument gives the same output on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print("Loading data")
    predictions_dict = load_predictions()

    print("Started calculation")
    metrics_dict = generate_metrics(predictions_dict)
    summary = summarize_metrics_folds(metrics_dict)
    print("Finished calculation")

    pd_summary = pandanize_summary(summary)
    save_pandas_summary(pd_summary)
    print("Stored metrics")
import utils

# Script entry point: load the experiment predictions and print them.
if __name__ == "__main__":
    predictions = utils.load_predictions("experiments/results/predictions.txt")
    # print(...) with one argument gives the same output on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print(predictions)