def setUpClass(cls):
    """Build the shared fixtures for the full integration tests.

    Loads the pretrained model from the testing directory, tokenizes the
    first test paper, and stores on the class:

    * ``classifier`` - the loaded ``clf.Classifier``
    * ``good`` / ``good_col`` - the first paper and its token collection
    * ``final_prob_matrix`` - predicted probabilities with every row
      rescaled to sum to one
    * ``predictions`` - for each ``tu.EvLabel``, column ``value + 1`` of
      ``final_prob_matrix``
    """
    print("Setting up full integration tests")
    testing_dir = util.get_testing_dir()

    model_path = os.path.join(testing_dir, "pretrained", "random_forest.sav")
    cls.classifier = clf.Classifier()
    cls.classifier.load_model(model_path)

    xml_dir = os.path.join(testing_dir, "xml")
    soups = util.load_paper_xmls(util.get_paper_paths(xml_dir))
    cls.good = soups[0]
    cls.good_col = tu.TokenCollection(cls.good)
    cls.good_col.build_tokens()

    # Append an all-ones bias column to the token features.
    features = cls.good_col.generate_feature_matrix()
    row_count, _ = features.shape
    features = np.hstack([features, np.ones((row_count, 1))])

    # classify A1, A2, R1, R2, OC, P
    probabilities = cls.classifier.clf.predict_proba(features)
    normalized = probabilities.copy()
    cls.final_prob_matrix = normalized
    # Each ``row`` is a view into ``normalized``, so the in-place division
    # rescales the stored matrix itself.
    for row in normalized:
        row /= np.sum(row)

    cls.predictions = {
        ev_label: cls.final_prob_matrix[:, ev_label.value + 1]
        for ev_label in tu.EvLabel
    }
def train(self, paper_paths):
    """Fit ``self.clf`` on token features extracted from the given papers.

    Each paper is loaded through ``util.load_paper_xmls`` and tokenized
    with ``tu.TokenCollection``.  Its feature matrix gets a trailing
    all-ones bias column, and its per-token label matrix is reduced to one
    class id per token: the ``EvLabel`` value of the last label (in enum
    order) whose column is positive, or -1 when none is.  The stacked
    features and class ids of all papers are passed to ``self.clf.fit``.

    Args:
        paper_paths: Paths of the paper XML files to train on.  Stored on
            ``self.last_train_paths`` once fitting has finished.
    """
    paper_soups = util.load_paper_xmls(paper_paths)
    train_start = time.time()
    print("Training on {} paper(s)...".format(len(paper_soups)))

    # Collect the per-paper blocks and stack them once at the end; stacking
    # inside the loop would re-copy everything gathered so far on every
    # paper.  The zero-row seeds keep the stacked shapes well defined even
    # when no paper was loaded.
    feature_blocks = [np.zeros((0, ft.feature_count + 1))]  # +1 is bias
    label_blocks = [np.zeros((0, 1))]

    for soup in paper_soups:
        col = tu.TokenCollection(soup)
        col.build_tokens()

        feature_matrix = col.generate_feature_matrix()
        tokens_count, _ = feature_matrix.shape
        bias_vec = np.ones((tokens_count, 1))
        feature_blocks.append(np.hstack([feature_matrix, bias_vec]))

        # Convert the one-hot label matrix into a (tokens_count, 1) vector
        # of class ids from EvLabel, with -1 for unclassified tokens.
        # Labels are applied in enum order, so a later label overwrites an
        # earlier one when several columns are positive for a token.
        labels = col.generate_train_labels()
        labels_vec = np.ones((tokens_count, 1)) * -1
        for ev_label in tu.EvLabel:
            is_labelled = labels[:tokens_count, ev_label.value] > 0
            labels_vec[is_labelled] = ev_label.value
        label_blocks.append(labels_vec)

    cum_feat_matrix = np.vstack(feature_blocks)
    cum_labels_vec = np.vstack(label_blocks)
    self.clf.fit(cum_feat_matrix, cum_labels_vec.flatten())

    train_end = time.time()
    print("Done training. Time elapsed: ", train_end - train_start)
    self.last_train_paths = paper_paths
def test(self, paper_paths):
    """Evaluate ``self.clf`` on the given papers and record the results.

    For every paper the token features (plus an all-ones bias column) are
    fed to ``self.clf.predict_proba``.  The row-normalised probabilities
    are handed to ``self.assign_ev_labels`` to pick one token per label,
    and ``self.eval_loss`` is called with the raw probabilities.  For each
    label that has a true annotation in ``col.ev_labels`` a predicted
    phrase is built and counted as a hit when it contains the true word.

    Side effects: prints progress and a summary, and sets
    ``self.last_total_loss``, ``self.last_precisions`` and
    ``self.last_test_results``.

    Args:
        paper_paths: Paths of the paper XML files to evaluate.  Indexed in
            parallel with the loaded papers, so it is assumed to line up
            one-to-one with what ``util.load_paper_xmls`` returns.

    Returns:
        ``(total_loss, precisions)``: the unrounded sum of the per-paper
        losses, and a list of six per-label hit rates (hits divided by the
        number of papers, rounded to 4 decimals).  With no papers the
        rates and the average loss are reported as 0 instead of raising
        ``ZeroDivisionError``.
    """
    paper_soups = util.load_paper_xmls(paper_paths)
    paper_count = len(paper_soups)
    print("Testing on {} paper(s)...".format(paper_count))

    test_results = [None] * paper_count
    losses = np.zeros((paper_count, ))
    precisions = [0] * 6

    for paper_i, soup in enumerate(paper_soups):
        print("---- Paper #{} [{}]".format(paper_i + 1, soup.pmid.text))
        col = tu.TokenCollection(soup)
        col.build_tokens()
        feature_matrix = col.generate_feature_matrix()
        tokens_count, _ = feature_matrix.shape
        bias_vec = np.ones((tokens_count, 1))
        feature_matrix = np.hstack([feature_matrix, bias_vec])

        # classify A1, A2, R1, R2, OC, P
        prob_matrix = self.clf.predict_proba(feature_matrix)
        # Rescale every row to sum to one; ``prob_matrix`` itself is left
        # untouched because the loss below is computed from it.
        final_prob_matrix = prob_matrix / np.sum(
            prob_matrix, axis=1, keepdims=True)
        # Label ``value + 1`` selects the column; presumably column 0 is
        # the "no label" class - TODO confirm against the training labels.
        predictions = {
            ev_label: final_prob_matrix[:, ev_label.value + 1]
            for ev_label in tu.EvLabel
        }

        label_assignment = self.assign_ev_labels(col, predictions)
        loss = self.eval_loss(col, prob_matrix)
        losses[paper_i] = loss

        predicted_phrases = [None] * 6
        for ev_label in tu.EvLabel:
            true_ev_label_data = col.ev_labels.get(ev_label)
            if true_ev_label_data is None:
                print("Label not found: ", ev_label)
                continue

            predicted_phrase = self._extend_predicted_phrase(
                col, ev_label, label_assignment[ev_label].token)
            print("Predicted: ", ev_label.name, predicted_phrase,
                  " --- True Label: ", true_ev_label_data.word)
            predicted_phrases[ev_label.value] = predicted_phrase

            phrase_lowered = predicted_phrase.lower()
            true_word = true_ev_label_data.word
            # "iop" and "pressure" are accepted as matches for each other.
            if (true_word in phrase_lowered
                    or (true_word == "iop" and "pressure" in phrase_lowered)
                    or (true_word == "pressure" and "iop" in phrase_lowered)):
                precisions[ev_label.value] += 1

        loss = np.round(loss, 4)
        print("loss for this paper is: ", loss)
        test_results[paper_i] = {
            "soup": soup,
            "paper_path": paper_paths[paper_i],
            "token_collection": col,
            "true_label_assignment": col.ev_labels,
            "predicted_label_assignment": label_assignment,
            "predicted_phrases": predicted_phrases,
            "feature_matrix": feature_matrix,
            "loss": loss
        }

    total_loss = np.sum(losses)
    print("\n\n---------------")
    # Avoid dividing by zero when no paper was loaded.
    divisor = max(paper_count, 1)
    precisions = [np.round(p / divisor, 4) for p in precisions]
    print("Average precisions for this run is: \nA1:{}\t A2:{}\t R1:"
          "{}\t R2:{}\t OC:{}\t P:{}".format(precisions[0], precisions[1],
                                             precisions[2], precisions[3],
                                             precisions[4], precisions[5]))
    dist_loss = np.round(total_loss, 4)
    print("total loss is: ", np.round(dist_loss, 4))
    print("average loss is: ", np.round(dist_loss / divisor, 4))
    self.last_total_loss = dist_loss
    self.last_precisions = precisions
    self.last_test_results = test_results
    return total_loss, precisions

def _extend_predicted_phrase(self, col, ev_label, token):
    """Build the phrase reported for a predicted token.

    A token without a chunk yields just its word.  Otherwise the phrase
    starts as the chunk's string and is extended as follows:

    * single-token chunk: the next chunk is appended (and the one after it
      when the next chunk is "with"); a following "(" or "," token is
      reproduced, with " )" closing an opened parenthesis;
    * multi-token chunk predicted for ``EvLabel.P``: the next two chunks
      are appended.

    Neighbours that would fall past the end of ``col.tokens`` or
    ``col.chunks`` are skipped instead of raising ``IndexError``.
    """
    chunk = token.chunk
    if chunk is None:
        return token.word

    c_i = col.chunks.index(chunk)
    # Strings of the (up to) two chunks that follow the predicted chunk.
    following = [c.string for c in col.chunks[c_i + 1:c_i + 3]]
    phrase = chunk.string

    if len(chunk.tokens) == 1:
        next_tok_i = 1 + col.tokens.index(chunk.tokens[-1])
        next_tok = (col.tokens[next_tok_i].word
                    if next_tok_i < len(col.tokens) else None)
        if next_tok == "(":
            phrase += " ("
        elif next_tok == ",":
            phrase += ","
        if following:
            phrase = phrase + " {}".format(following[0])
            if following[0] == "with" and len(following) > 1:
                phrase = phrase + " {}".format(following[1])
        if next_tok == "(":
            phrase += " )"
    elif ev_label == tu.EvLabel.P:
        phrase = phrase + "".join(" {}".format(s) for s in following)
    return phrase
def setUp(self):
    """Create fresh token collections for the first two loaded papers.

    ``self.paper_soups[0]`` is exposed as ``good`` and
    ``self.paper_soups[1]`` as ``bad``, each with an un-built
    ``tu.TokenCollection`` stored under the matching ``*_col`` attribute.
    """
    soups = self.paper_soups

    good_soup = soups[0]
    self.good = good_soup
    self.good_col = tu.TokenCollection(good_soup)

    bad_soup = soups[1]
    self.bad = bad_soup
    self.bad_col = tu.TokenCollection(bad_soup)
def process_word(word_i, word):
    """Return ``(word, classes)`` for *word*, preferring the UMLS cache.

    A cached entry is returned as is; otherwise the classes come from
    ``util.get_umls_classes``.  ``word_i`` is not used and is kept only
    for call compatibility.
    """
    cached_classes = umls_cache.get(word)
    if cached_classes is not None:
        return word, cached_classes
    return word, util.get_umls_classes(word)


for i, paper_path in enumerate(paper_paths):
    soup = util.parse_paper(paper_path)
    print('---- Paper #{}/{} [{}]'.format(i + 1, paper_count,
                                          soup.pmid.text))
    col = tu.TokenCollection(soup)
    col.build_tokens(umls_cache=True)
    print('Total tokens: {}'.format(len(col.tokens)))

    # The cache is only written after the whole batch has returned, so a
    # word repeated within this paper would be looked up once per
    # occurrence.  De-duplicate first; dict.fromkeys keeps first-seen order.
    unique_words = list(dict.fromkeys(token.word for token in col.tokens))
    result = Parallel(n_jobs=4)(
        delayed(process_word)(word_i, word)
        for word_i, word in enumerate(unique_words))

    for word, classes in result:
        umls_cache.set(word, classes)
    umls_cache.save()
    print('\n\n')
print('Done!')