def sample_negative_mappings(self, kb1, kb2, tp_mappings):
    """
    Given two KBs and true positive mappings, sample easy and hard negatives
    for training data.

    For every distinct true positive pair, hard negatives are sampled from
    the first ``constants.KEEP_TOP_K_CANDIDATES`` candidates returned by
    candidate selection for the source entity, and easy negatives are
    sampled at random from the entity ids of the target KB. Any sampled pair
    that is actually a true positive is dropped, and a pair that is both a
    hard and an easy negative is kept only as a hard negative.

    :param kb1: source KB
    :param kb2: target KB
    :param tp_mappings: true positive mappings; the first two items of each
        mapping are used as the (source id, target id) pair
    :return: list of ``[source id, target id, label, self.umls_header]``
        entries, where label is 0 for hard negatives and -1 for easy negatives
    """
    cand_sel = CandidateSelection(kb1, kb2)

    sys.stdout.write('\t\tExtracting candidates...\n')
    kb2_ent_ids = [e.research_entity_id for e in kb2.entities]

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the size of the target KB
    # (the hard-negative sampling below is capped the same way).
    num_easy = min(constants.NUM_EASY_NEGATIVE_PER_POSITIVE, len(kb2_ent_ids))

    tps = {tuple(i[:2]) for i in tp_mappings}
    cand_negs = []
    rand_negs = []

    # sample negatives for each true positive (TP)
    for tp in tps:
        # get candidates for source entity
        cands = cand_sel.select_candidates(
            tp[0])[:constants.KEEP_TOP_K_CANDIDATES]

        # sample hard negatives
        cand = random.sample(
            cands, min(constants.NUM_HARD_NEGATIVE_PER_POSITIVE, len(cands)))
        cand_negs.extend((tp[0], c) for c in cand)

        # sample easy negatives
        rand = random.sample(kb2_ent_ids, num_easy)
        rand_negs.extend((tp[0], r) for r in rand)

    # filter negatives: drop accidental true positives, and let hard
    # negatives take precedence over easy ones
    hard_negatives = set(cand_negs).difference(tps)
    easy_negatives = set(rand_negs).difference(tps).difference(
        hard_negatives)

    # append negative pairs together with labels:
    # (0 = hard negative, -1 = easy negative)
    neg_pairs = []
    for neg in hard_negatives:
        neg_pairs.append([neg[0], neg[1], 0, self.umls_header])
    for neg in easy_negatives:
        neg_pairs.append([neg[0], neg[1], -1, self.umls_header])

    return neg_pairs
def align(self, model_type, model_path, s_kb_path, t_kb_path, gold_path, output_path, align_strat, cuda_device=-1, missed_path=None):
    """
    Align two input ontologies.

    Loads and normalizes both KBs, scores candidate pairs with the selected
    model, derives neighborhood scores, computes the final alignment, and
    then optionally evaluates it against a gold alignment and/or writes it
    to disk.

    :param model_type: type of model ('lr' or 'nn' select a scoring routine)
    :param model_path: path to ontoemma model (must exist)
    :param s_kb_path: path to source KB (must not be None)
    :param t_kb_path: path to target KB (must not be None)
    :param gold_path: path to gold alignment between source and target KBs;
        evaluation is skipped when it is None or does not exist
    :param output_path: path to write output alignment; nothing is written
        when it is None
    :param align_strat: strategy for alignment assignment
    :param cuda_device: GPU device number (only passed to the 'nn' model)
    :param missed_path: optional parameter for outputting missed alignments;
        defaults to output_path + '.ontoemma.missed' when output_path is given
    :return: result of the gold-standard comparison, or None when no
        evaluation was performed
    """
    # Preconditions on the inputs.
    assert model_type in constants.IMPLEMENTED_MODEL_TYPES
    assert os.path.exists(model_path)
    assert s_kb_path is not None
    assert t_kb_path is not None

    sys.stdout.write("Loading KBs...\n")
    loaded_kbs = [self.load_kb(kb_path) for kb_path in (s_kb_path, t_kb_path)]
    source_kb, target_kb = loaded_kbs

    sys.stdout.write("Normalizing KBs...\n")
    for kb in loaded_kbs:
        kb.normalize_kb()

    sys.stdout.write("Building candidate indices...\n")
    selector = CandidateSelection(source_kb, target_kb)

    # Pairwise similarity scores from the requested model; any other model
    # type leaves the score list empty.
    if model_type == 'nn':
        pair_scores = self._align_nn(
            model_path, source_kb, target_kb, selector, cuda_device)
    elif model_type == 'lr':
        pair_scores = self._align_lr(
            model_path, source_kb, target_kb, selector)
    else:
        pair_scores = []

    neighbor_scores = self._compute_neighborhood_similarities(
        pair_scores, source_kb, target_kb)
    final_alignment = self._compute_alignment(
        align_strat, pair_scores, neighbor_scores, source_kb, target_kb)

    # Derive a default location for missed alignments next to the output.
    if output_path is not None and missed_path is None:
        missed_path = output_path + '.ontoemma.missed'

    evaluation = None
    gold_available = gold_path is not None and os.path.exists(gold_path)
    if gold_available:
        sys.stdout.write("Evaluating against gold standard...\n")
        evaluation = self.compare_alignment_to_gold(
            gold_path, final_alignment, source_kb, target_kb, missed_path)

    if output_path is not None:
        sys.stdout.write("Writing results to file...\n")
        self.write_alignment(
            output_path, final_alignment, s_kb_path, t_kb_path)

    return evaluation
def eval_cs(self, s_kb_path, t_kb_path, gold_path, output_path, missed_path):
    """
    Evaluate the candidate selection module against a gold alignment.

    Loads both KBs and the gold alignment, builds a candidate selector over
    the KBs, points it at the given output files, and runs its evaluation
    on the gold (source id, target id) pairs.

    :param s_kb_path: source kb path
    :param t_kb_path: target kb path
    :param gold_path: gold alignment file path
    :param output_path: output path for evaluation results
    :param missed_path: output path for missed alignments
    :return: None
    """
    sys.stdout.write("Loading KBs...\n")
    source_kb = self.load_kb(s_kb_path)
    target_kb = self.load_kb(t_kb_path)

    sys.stdout.write("Loading gold alignment...\n")
    gold_pairs = []
    # keep only the first two fields of every gold entry
    for entry in self.load_alignment(gold_path):
        gold_pairs.append((entry[0], entry[1]))
    sys.stdout.write("\tNumber of gold alignments: %i\n" % len(gold_pairs))

    sys.stdout.write("Starting candidate selection...\n")
    selector = CandidateSelection(source_kb, target_kb)
    # tell the selector where to write its evaluation output
    selector.EVAL_OUTPUT_FILE = output_path
    selector.EVAL_MISSED_FILE = missed_path

    sys.stdout.write("Evaluating candidate selection...\n")
    selector.eval(gold_pairs)
    return None