def load_data_for_predict(self, ann_dir, ignore_mappings=None, ignore_context=False,
                          separate_by_label=False, full_text_dir=None):
    """Load annotation documents for prediction - no ground truth exists.

    :param ann_dir: folder containing the SemEHR annotation json files
    :param ignore_mappings: concept mappings to skip (defaults to none; a None
        default avoids the shared mutable-default-argument pitfall)
    :param ignore_context: when True, contextual info is not considered
    :param separate_by_label: keep per-query-label data separate in the result
    :param full_text_dir: optional folder holding the documents' full texts
    :return: dict with 'lbl2data' (label -> instance data) and 'files' (doc keys)
    """
    # normalise the sentinel default; behaviour is unchanged for existing callers
    ignore_mappings = [] if ignore_mappings is None else ignore_mappings
    if ignore_context:
        logging.info('doing learning without considering contextual info')
    cm = self.concept_mapping
    file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
    lbl2data = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
        # strip the SemEHR file-name prefix so fk matches the gold/full-text naming
        fk = fk.replace('se_ann_', '')
        if full_text_dir is not None:
            cr.full_text_folder = full_text_dir
        LabelModel.read_one_ann_doc(self, cr, fk, lbl2data=lbl2data,
                                    ignore_mappings=ignore_mappings,
                                    ignore_context=ignore_context,
                                    separate_by_label=separate_by_label)
    return {'lbl2data': lbl2data, 'files': file_keys}
def collect_dimensions(self, ann_dir):
    """Scan an annotation folder and register label/context dimensions on this model.

    For every document, both plain and negated annotations of this model's base
    label feed the label dimensions; only annotations whose negation status
    matches the model's label contribute same-sentence context dimensions.
    """
    mapping = self.concept_mapping
    doc_keys = [fn.split('.')[0] for fn in listdir(ann_dir) if isfile(join(ann_dir, fn))]
    # collect dimension labels
    for key in doc_keys:
        recogniser = CustomisedRecoginiser(join(ann_dir, '%s.json' % key), mapping)
        base_label = self.label.replace('neg_', '')
        positives = recogniser.get_anns_by_label(base_label)
        negatives = recogniser.get_anns_by_label('neg_' + base_label)
        for ann in positives + negatives:
            self.add_label_dimension_by_annotation(ann)
            # self.add_context_dimension_by_annotation(ann)
            # keep only annotations whose negation status agrees with this
            # model's label ('neg_' models want Negated anns, plain models don't)
            if (ann.negation == 'Negated') != self.label.startswith('neg_'):
                continue
            same_sent = recogniser.get_same_sentence_anns(ann)
            neighbours = same_sent['umls'] + same_sent['phenotype']
            # collect cui labels
            for umls_ann in same_sent['umls']:
                self._cui2label[umls_ann.cui] = umls_ann.pref
            for ctx in neighbours:
                self.add_context_dimension_by_annotation(ctx)
def assess_label_quality(self, ann_dir, gold_dir, separate_by_label=True, ignore_context=True):
    """Rank query labels by precision against the gold standard.

    For each document, SemEHR annotations of this model's label are matched
    (by overlap) against unclaimed gold entities; per-label correct/wrong
    counts are accumulated and returned sorted by accuracy (worst first).

    :param ann_dir: folder of SemEHR annotation json files
    :param gold_dir: folder of gold-standard '<key>-ann.xml' files
    :param separate_by_label: count per query label rather than one 'united' bucket
    :param ignore_context: match on the context-free label type ('neg_' stripped)
    :return: list of (label, accuracy, correct, wrong) tuples, ascending accuracy
    """
    if ignore_context:
        logging.info('doing learning without considering contextual info')
    # print self.get_top_tfidf_dimensions(self.max_dimensions)
    cm = self.concept_mapping
    file_keys = [
        f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    # context-free version of this model's label, e.g. 'neg_X' -> 'X'
    label_type = self.label.replace('neg_', '')
    query_label_perform = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
        # skip documents without a gold-standard file
        if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
            continue
        gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
        # gold entities relevant to this label that have not yet been matched
        not_matched_gds = []
        for e in gd.get_ess_entities():
            if (ignore_context and e.label.replace('neg_', '') == label_type) \
                    or (not ignore_context and e.label == self.label):
                not_matched_gds.append(e.id)
        anns = cr.get_anns_by_label(self.label, no_context=ignore_context)
        for a in anns:
            multiple_true_positives = 0
            matched = False
            for g in gd.get_ess_entities():
                if g.id in not_matched_gds:
                    gt = g.label.replace('neg_', '')
                    if g.overlap(a) and (
                            (g.label == self.label and not ignore_context) or
                            (ignore_context and gt == label_type)):
                        # a second overlapping gold entity for the same ann
                        if matched:
                            multiple_true_positives += 1
                        matched = True
                        # claim this gold entity so it cannot match another ann
                        not_matched_gds.remove(g.id)
            if separate_by_label:
                lbl = LabelModel.get_ann_query_label(a)
            else:
                lbl = 'united'
            ql = lbl
            if ql not in query_label_perform:
                query_label_perform[ql] = {'c': 0, 'w': 0}
            if matched:
                query_label_perform[ql]['c'] += 1
            else:
                query_label_perform[ql]['w'] += 1
    # (label, accuracy, correct, wrong) per query label, sorted worst-first
    lbls = [(l,
             1.0 * query_label_perform[l]['c'] / (query_label_perform[l]['c'] + query_label_perform[l]['w']),
             query_label_perform[l]['c'],
             query_label_perform[l]['w']) for l in query_label_perform]
    return sorted(lbls, key=lambda x: x[1])
def populate_validation_results():
    """Validate SemEHR annotations in `_ann_dir` against gold files in `_gold_dir`
    and print the accumulated per-label performances."""
    gold_folder = _gold_dir
    ann_folder = _ann_dir
    performances = {}
    for entry in listdir(ann_folder):
        if not isfile(join(ann_folder, entry)):
            continue
        doc_key = entry.split('.')[0]
        populate_semehr_results(gold_folder, ann_folder, doc_key,
                                performances, using_combined=False)
    CustomisedRecoginiser.print_performances(performances)
def predict_doc_phenotypes(doc_key, doc_anns, doc_text, model_factory, concept_mapping,
                           ignore_mappings=None, mention_pattern=None):
    """
    load a document and do all phenotype predictions in one go
    this is designed for large amounts of documents to be loaded, for example,
    from databases

    :param doc_key: document identifier
    :param doc_anns: pre-loaded annotation document for this doc
    :param doc_text: the document's full text
    :param model_factory: provides one learnt model per phenotype
    :param concept_mapping: concept-to-label mapping used by the recogniser
    :param ignore_mappings: mappings to skip (defaults to none; None default
        avoids the shared mutable-default-argument pitfall)
    :param mention_pattern: optional mention-pattern model used at prediction
    :return: dict phenotype -> {'freq', 'cui2freq'}, or None when no phenotype
        produced any prediction for this document
    """
    ignore_mappings = [] if ignore_mappings is None else ignore_mappings
    cr = CustomisedRecoginiser(doc_key, concept_mapping=concept_mapping, ann_doc=doc_anns)
    cr.full_text = doc_text
    p2count = {}
    total = 0
    for p in model_factory.phenotypes:
        lm = model_factory.get_model_by_phenotype(p)
        if lm is None:
            # no learnt model available for this phenotype - skip it
            logging.info('phenotype %s not found' % p)
            continue
        lbl2data = {}
        LabelModel.read_one_ann_doc(lm, cr, doc_key, lbl2data=lbl2data,
                                    ignore_mappings=ignore_mappings, ignore_context=True,
                                    separate_by_label=True)
        doc2predicted = {}
        label_model_predict(lm, model_factory.model_file_pattern(p), lbl2data, doc2predicted,
                            mention_pattern=mention_pattern, mention_prediction_param=cr)
        if doc_key in doc2predicted:
            p2count[p] = {
                'freq': len(doc2predicted[doc_key]),
                'cui2freq': collect_phenotype_concept(doc2predicted[doc_key])
            }
            total += 1
    return p2count if total > 0 else None
def do_learn_exp(viz_file, num_dimensions=None, ignore_context=False, separate_by_label=False,
                 conll_output_file=None, eHostGD=False, mention_pattern=None):
    """Run a learn-then-predict experiment for every label in `_labels`.

    For each label, previously learnt model files are removed, a model is
    learnt per dimension setting, and it is evaluated on the test set.

    :param viz_file: visualisation output file passed to learn_prediction_model
    :param num_dimensions: list of max-dimension settings to try (defaults to [20];
        None default avoids the shared mutable-default-argument pitfall)
    :param ignore_context: learn/predict without contextual info
    :param separate_by_label: keep per-query-label data separate
    :param conll_output_file: currently unused here; kept for interface compatibility
    :param eHostGD: gold standard is in eHost knowtator xml format
    :param mention_pattern: optional mention-pattern model used at prediction
    :return: printable performance summary string
    """
    num_dimensions = [20] if num_dimensions is None else num_dimensions
    results = {}
    id2conll = {}
    result_str = ''
    for lbl in _labels:
        logging.info('working on [%s]' % lbl)
        _learning_model_file = _learning_model_dir + '/%s.lm' % lbl
        _ml_model_file_ptn = _learning_model_dir + '/' + lbl + '_%s_DT.model'
        _pca_model_file = None
        pca_dim = None
        max_dimensions = num_dimensions
        t = lbl.replace('neg_', '')
        ignore_mappings = _ignore_mappings[t] if t in _ignore_mappings else []
        # remove previous model files so stale models are never reused
        # (reconstructed from a line-wrap-garbled region of the original source)
        logging.debug('removing previously learnt models...')
        for f in [f for f in listdir(_learning_model_dir)
                  if isfile(join(_learning_model_dir, f)) and f.endswith('.model')]:
            remove(join(_learning_model_dir, f))
        for dim in max_dimensions:
            logging.info('dimension setting: %s' % dim)
            learn_prediction_model(lbl,
                                   ann_dir=_ann_dir,
                                   gold_dir=_gold_dir,
                                   ml_model_file_ptn=_ml_model_file_ptn,
                                   model_dir=_learning_model_dir,
                                   pca_dim=pca_dim,
                                   pca_model_file=_pca_model_file,
                                   max_dimension=dim,
                                   ignore_mappings=ignore_mappings,
                                   viz_file=viz_file,
                                   ignore_context=ignore_context,
                                   separate_by_label=separate_by_label,
                                   full_text_dir=_gold_text_dir,
                                   eHostGD=eHostGD)
            logging.debug('bad labels: %s' % ignore_mappings)
            pl = '%s dim[%s]' % (lbl, dim)
            performance = LabelPerformance(pl)
            results[pl] = performance
            predict_label(_learning_model_file,
                          _test_ann_dir,
                          _test_gold_dir,
                          _ml_model_file_ptn,
                          performance,
                          pca_model_file=_pca_model_file,
                          max_dimension=dim,
                          ignore_mappings=ignore_mappings,
                          ignore_context=ignore_context,
                          separate_by_label=separate_by_label,
                          full_text_dir=_test_text_dir,
                          file_pattern=_gold_file_pattern,
                          id2conll=id2conll,
                          label_whitelist=_labels,
                          eHostGD=eHostGD,
                          mention_pattern=mention_pattern)
        result_str = CustomisedRecoginiser.print_performances(results)
    return result_str
def populate_semehr_results(label_dir, ann_dir, file_key, label2performances, using_combined=False):
    """Score one document's SemEHR annotations against its gold file.

    Results are accumulated into `label2performances`; documents with no
    gold-standard file are skipped silently (after printing the path).
    """
    label_file = '%s-ann.xml' % file_key
    ann_file = '%s.json' % file_key
    gold_path = join(label_dir, label_file)
    print(gold_path)
    if not isfile(gold_path):
        return
    gold_doc = EDIRDoc(gold_path)
    recogniser = CustomisedRecoginiser(join(ann_dir, ann_file),
                                       Concept2Mapping(_concept_mapping))
    # pick the requested validation mode, then apply it against the gold entities
    validate = (recogniser.validate_combined_performance if using_combined
                else recogniser.validate_mapped_performance)
    validate(gold_doc.get_ess_entities(), label2performances)
def load_data(self, ann_dir, gold_dir, verbose=True, ignore_mappings=[], ignore_context=False,
              separate_by_label=False, ful_text_dir=None, eHostGD=False, annotated_anns={}):
    """Load training/evaluation instances by aligning SemEHR annotations with gold docs.

    NOTE(review): `ignore_mappings=[]` and `annotated_anns={}` are mutable default
    arguments; they are only read here, but callers should not rely on mutating them.

    :param ann_dir: folder of SemEHR annotation json files
    :param gold_dir: folder of gold-standard files (EDIR xml or eHost knowtator xml)
    :param verbose: when True, log every annotation and whether it matched gold
    :param ignore_mappings: concept mappings to skip when collecting annotations
    :param ignore_context: when True, match on the context-free label type
    :param separate_by_label: keep per-query-label data separate
    :param ful_text_dir: optional folder holding the documents' full texts
    :param eHostGD: gold files are eHost knowtator xml rather than EDIR xml
    :param annotated_anns: NB: this is for labelling settings where only partial data
        is annotated on the documents. Therefore, we need to filter out those not
        assessed by the annotators to avoid kill some true positives (those are
        correct but not assessed by annotators)
    :return: dict with 'lbl2data', 'fns' (false negatives), 'bad_labels', 'files'
    """
    if ignore_context:
        logging.info('doing learning without considering contextual info')
    # print self.get_top_tfidf_dimensions(self.max_dimensions)
    cm = self.concept_mapping
    file_keys = [
        f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    lbl2data = {}
    false_negatives = 0
    lbl2tps = {}
    # context-free version of this model's label, e.g. 'neg_X' -> 'X'
    label_type = self.label.replace('neg_', '')
    query_label_perform = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
        # strip SemEHR prefix so fk matches the gold/full-text naming
        fk = fk.replace('se_ann_', '')
        if ful_text_dir is not None:
            cr.full_text_folder = ful_text_dir
        if eHostGD:
            if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)):
                continue
            # logging.debug('using GD file %s' % join(gold_dir, '%s.txt.knowtator.xml' % fk))
            gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk))
        else:
            if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
                continue
            logging.debug('using GD file %s' % join(gold_dir, '%s-ann.xml' % fk))
            gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
        # re-segement sentences
        # cr.re_segment_sentences(fk)
        # cr.relocate_all_anns(fk)
        # gd.relocate_anns(cr.get_full_text(fk))
        # gold entities relevant to this label that have not yet been matched
        not_matched_gds = []
        for e in gd.get_ess_entities():
            if (ignore_context and e.label.replace('neg_', '') == label_type) \
                    or (not ignore_context and e.label == self.label):
                not_matched_gds.append(e.id)
        anns = cr.get_anns_by_label(self.label, ignore_mappings=ignore_mappings,
                                    no_context=ignore_context)
        if len(annotated_anns) > 0:
            # partial labelling: keep only annotations the annotators actually assessed
            if '%s.txt' % fk not in annotated_anns:
                continue
            kept_anns = []
            for a in anns:
                for aa in annotated_anns['%s.txt' % fk]:
                    if int(aa['s']) == a.start and int(aa['e']) == a.end:
                        kept_anns.append(a)
            anns = kept_anns
        for a in anns:
            logging.debug('%s, %s, %s' % (a.str, a.start, a.end))
            multiple_true_positives = 0
            # annotations preceding this one supply the prediction context
            t2anns = cr.get_prior_anns(a)
            # if len(t2anns['umls']) + len(t2anns['phenotype']) == 0:
            #     t2anns = cr.get_prior_anns(a, contenxt_depth=-2)
            context_anns = [] + t2anns['umls'] + t2anns['phenotype'] + \
                cr.get_context_words(a, fk)
            # context_anns = cr.get_context_words(a, fk)
            matched = False
            for g in gd.get_ess_entities():
                if g.id in not_matched_gds:
                    gt = g.label.replace('neg_', '')
                    if g.overlap(a) and (
                            (g.label == self.label and not ignore_context) or
                            (ignore_context and gt == label_type)):
                        # a second overlapping gold entity for the same annotation
                        if matched:
                            multiple_true_positives += 1
                        matched = True
                        # claim this gold entity so it cannot match another ann
                        not_matched_gds.remove(g.id)
            if verbose:
                # '!' marks a false positive, 'R' a matched (right) annotation
                if not matched:
                    logging.debug(
                        '%s %s %s' % ('!', self.get_ann_dim_label(a) + ' // ' + ' | '.join(
                            self.get_ann_dim_label(a, generalise=True) for a in context_anns), fk))
                else:
                    logging.debug(
                        '%s %s %s' % ('R', self.get_ann_dim_label(a) + ' // ' + ' | '.join(
                            self.get_ann_dim_label(a, generalise=True) for a in context_anns), fk))
            lbl = LabelModel.get_label_specific_data(
                self, lbl2data, a, context_anns, fk, cr, separate_by_label=separate_by_label)
            lbl2data[lbl]['multiple_tps'] += multiple_true_positives
            Y = lbl2data[lbl]['Y']
            # binary target: 1 when the annotation matched a gold entity
            Y.append([1 if matched else 0])
            ql = lbl
            if ql not in query_label_perform:
                query_label_perform[ql] = {'c': 0, 'w': 0}
            if matched:
                query_label_perform[ql]['c'] += 1
            else:
                query_label_perform[ql]['w'] += 1
        # unclaimed gold entities for this document count as false negatives
        false_negatives += len(not_matched_gds)
        missed = None
        for g in gd.get_ess_entities():
            if g.id in not_matched_gds:
                missed = g
                logging.debug('\t'.join([
                    'M', g.str, str(g.negated), str(g.start), str(g.end),
                    join(gold_dir, '%s-ann.xml' % fk)
                ]))
        # if len(not_matched_gds) > 0:
        #     print not_matched_gds
        #     for a in anns:
        #         logging.debug(a.str, a.start, a.end, missed.overlap(a))
    bad_labels = []
    for ql in query_label_perform:
        p = query_label_perform[ql]
        # NOTE(review): this flags labels with a LOW wrong/correct ratio (i.e. mostly
        # correct) as 'bad' - verify the intended inequality direction against callers
        if p['c'] == 0 or (1.0 * p['w'] / p['c'] < 0.05):
            bad_labels.append(ql)
    return {
        'lbl2data': lbl2data,
        'fns': false_negatives,
        'bad_labels': bad_labels,
        'files': file_keys
    }
def collect_tfidf_dimensions(self, ann_dir, gold_dir, ignore_context=False, separate_by_label=False,
                             full_text_dir=None, eHostGD=False):
    """Collect label/context dimensions weighted by gold-standard outcomes.

    Each annotation (plain or negated) contributes label dimensions; its prior
    context annotations and context words contribute context dimensions tagged
    as tp/fp depending on whether the annotation matched a gold entity.
    Also records overall tp/fp counts on the model (self._tps / self._fps).

    :param ann_dir: folder of SemEHR annotation json files
    :param gold_dir: folder of gold-standard files (EDIR xml or eHost knowtator xml)
    :param ignore_context: when True, match gold entities on the context-free label type
    :param separate_by_label: tag context dimensions with per-query labels, not 'united'
    :param full_text_dir: optional folder holding the documents' full texts
    :param eHostGD: gold files are eHost knowtator xml rather than EDIR xml
    """
    cm = self.concept_mapping
    file_keys = [
        f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    # collect dimension labels
    tp_freq = 0
    fp_freq = 0
    # context-free version of this model's label, e.g. 'neg_X' -> 'X'
    label_type = self.label.replace('neg_', '')
    fn_freq = 0
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
        # strip SemEHR prefix so fk matches the gold/full-text naming
        fk = fk.replace('se_ann_', '')
        if full_text_dir is not None:
            cr.full_text_folder = full_text_dir
        if eHostGD:
            if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)):
                continue
            gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk))
        else:
            if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
                continue
            gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
        t = self.label.replace('neg_', '')
        # both plain and negated annotations of the base label are considered
        anns = cr.get_anns_by_label(t)
        neg_anns = cr.get_anns_by_label('neg_' + t)
        # re-segement sentences
        # cr.re_segment_sentences(fk)
        # cr.relocate_all_anns(fk)
        # gd.relocate_anns(cr.get_full_text(fk))
        # gold entities relevant to this label that have not yet been matched
        not_matched_gds = []
        for e in gd.get_ess_entities():
            if (ignore_context and e.label.replace('neg_', '') == label_type) \
                    or (not ignore_context and e.label == self.label):
                not_matched_gds.append(e.id)
        for a in anns + neg_anns:
            # self.add_context_dimension_by_annotation(a)
            self.add_label_dimension_by_annotation(a)
            # if (not ignore_context) and ((a.negation != 'Negated' and self.label.startswith('neg_')) or \
            #         (a.negation == 'Negated' and not self.label.startswith('neg_'))):
            #     logging.info('skipped because context')
            #     continue
            matched = False
            for g in gd.get_ess_entities():
                if g.id in not_matched_gds:
                    gt = g.label.replace('neg_', '')
                    if g.overlap(a) and (
                            (g.label == self.label and not ignore_context) or
                            (ignore_context and gt == label_type)):
                        matched = True
                        tp_freq += 1
                        # claim this gold entity so it cannot match another ann
                        not_matched_gds.remove(g.id)
            if not matched:
                fp_freq += 1
            # prior annotations plus context words form the context for this ann
            sanns = cr.get_prior_anns(a, contenxt_depth=-1)
            context_anns = [] + sanns['umls'] + sanns[
                'phenotype'] + cr.get_context_words(a, fk)
            # context_anns = cr.get_context_words(a, fk)
            # collect cui labels
            for u in sanns['umls']:
                self._cui2label[u.cui] = u.pref
            for c in context_anns:
                self.add_context_dimension_by_annotation(
                    c,
                    tp=True if matched else None,
                    fp=True if not matched else None,
                    lbl='united' if not separate_by_label else LabelModel.get_ann_query_label(a))
        # unclaimed gold entities for this document count as false negatives
        fn_freq += len(not_matched_gds)
    self._tps = tp_freq
    self._fps = fp_freq
    logging.debug('tp: %s, fp: %s, fn: %s' % (tp_freq, fp_freq, fn_freq))
def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance,
                  pca_model_file=None, max_dimension=None, ignore_mappings=None,
                  ignore_context=False, separate_by_label=False, full_text_dir=None,
                  file_pattern='%s-ann.xml', id2conll=None, label_whitelist=None, eHostGD=False,
                  mention_pattern=None):
    """Run a learnt label model over a test set and accumulate performance.

    :param model_file: serialised LabelModel file
    :param test_ann_dir: folder of SemEHR annotation json files for the test set
    :param test_gold_dir: folder of gold-standard files for the test set
    :param ml_model_file_ptn: pattern used to locate the per-label ML model file
    :param performance: overall LabelPerformance accumulator (mutated in place)
    :param pca_model_file: optional PCA model applied before prediction
    :param max_dimension: max feature dimensions for the label model
    :param ignore_mappings: concept mappings to skip (defaults to none; None
        default avoids the shared mutable-default-argument pitfall)
    :param ignore_context: predict without contextual info
    :param separate_by_label: keep per-query-label data separate
    :param full_text_dir: optional folder holding the documents' full texts
    :param file_pattern: gold file name pattern keyed by document id
    :param id2conll: doc-id -> ConllDoc cache; created here when None (the
        original crashed on the default None when indexing below)
    :param label_whitelist: labels allowed in the CoNLL output
    :param eHostGD: gold standard is in eHost knowtator xml format
    :param mention_pattern: optional mention-pattern model applied to doc anns
    :return: printable per-label performance summary string
    """
    ignore_mappings = [] if ignore_mappings is None else ignore_mappings
    if id2conll is None:
        id2conll = {}
    lm = LabelModel.deserialise(model_file)
    lm.max_dimensions = max_dimension
    data = lm.load_data(test_ann_dir, test_gold_dir, ignore_mappings=ignore_mappings,
                        ignore_context=ignore_context, separate_by_label=separate_by_label,
                        verbose=False, ful_text_dir=full_text_dir, eHostGD=eHostGD,
                        annotated_anns=_annotated_anns)
    files = data['files']
    for d in files:
        d = d.replace('se_ann_', '')
        if d not in id2conll:
            id2conll[d] = ConllDoc(join(test_gold_dir, file_pattern % d))
            if label_whitelist is not None:
                id2conll[d].set_label_white_list(label_whitelist)
    lbl2performances = {}
    for lbl in data['lbl2data']:
        this_performance = LabelPerformance(lbl)
        X = data['lbl2data'][lbl]['X']
        Y = data['lbl2data'][lbl]['Y']
        mtp = data['lbl2data'][lbl]['multiple_tps']
        doc_anns = data['lbl2data'][lbl]['doc_anns']
        mp_predicted = None
        if mention_pattern is not None:
            mp_predicted = mention_pattern.predict(doc_anns)
        if lbl in lm.rare_labels:
            # rare labels are predicted from simple stats instead of an ML model
            logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl]))
            PhenomeLearners.predict_use_simple_stats(
                lm.rare_labels[lbl], Y, mtp, performance,
                separate_performance=this_performance,
                id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
                doc_folder=test_gold_dir, label_whitelist=label_whitelist,
                mp_predicted=mp_predicted
            )
        else:
            if len(X) > 0:
                logging.debug('predict data: %s, dimensions %s, insts %s'
                              % (lbl, len(X[0]), len(X)))
            # optional diagnostic pass through the binary cluster classifier
            bc = lm.get_binary_cluster_classifier(lbl)
            if bc is not None:
                complementary_classifiers = []
                for l in lm.cluster_classifier_dict:
                    if l != lbl:
                        complementary_classifiers.append(lm.cluster_classifier_dict[l])
                for idx in range(len(X)):
                    logging.debug(
                        '%s => %s' % (bc.classify(X[idx],
                                                  complementary_classifiers=complementary_classifiers),
                                      Y[idx]))
            PhenomeLearners.predict_use_model(X, Y, 0, mtp,
                                              ml_model_file_ptn % escape_lable_to_filename(lbl),
                                              performance,
                                              pca_model_file=pca_model_file,
                                              separate_performance=this_performance,
                                              id2conll=id2conll, doc_anns=doc_anns,
                                              file_pattern=file_pattern,
                                              doc_folder=test_gold_dir,
                                              label_whitelist=label_whitelist,
                                              mp_predicted=mp_predicted)
        lbl2performances[lbl] = this_performance
    perform_str = CustomisedRecoginiser.print_performances(lbl2performances)
    logging.debug('missed instances: %s' % data['fns'])
    performance.increase_false_negative(data['fns'])
    return perform_str