def predict_doc_phenotypes(doc_key, doc_anns, doc_text, model_factory, concept_mapping,
                           ignore_mappings=[], mention_pattern=None):
    """
    Load a document and run all phenotype predictions on it in one go.
    Designed for predicting over large numbers of documents loaded,
    for example, from databases.
    :param doc_key: unique key identifying the document
    :param doc_anns: the document's semantic annotations
    :param doc_text: the document's full text
    :param model_factory: provides the phenotypes and their learnt models
    :param concept_mapping: concept-to-phenotype mapping object
    :param ignore_mappings: mappings to skip when reading annotations
    :param mention_pattern: optional mention-pattern matcher applied at prediction time
    :return: a dict of {phenotype: {'freq': ..., 'cui2freq': ...}},
             or None if no phenotype was predicted for this document
    """
    cr = CustomisedRecoginiser(doc_key, concept_mapping=concept_mapping, ann_doc=doc_anns)
    cr.full_text = doc_text
    p2count = {}
    total = 0
    for p in model_factory.phenotypes:
        lm = model_factory.get_model_by_phenotype(p)
        if lm is None:
            logging.info('phenotype %s not found' % p)
            continue
        lbl2data = {}
        LabelModel.read_one_ann_doc(lm, cr, doc_key, lbl2data=lbl2data,
                                    ignore_mappings=ignore_mappings, ignore_context=True,
                                    separate_by_label=True)
        doc2predicted = {}
        label_model_predict(lm, model_factory.model_file_pattern(p), lbl2data, doc2predicted,
                            mention_pattern=mention_pattern, mention_prediction_param=cr)
        if doc_key in doc2predicted:
            p2count[p] = {
                'freq': len(doc2predicted[doc_key]),
                'cui2freq': collect_phenotype_concept(doc2predicted[doc_key])
            }
            total += 1
    return p2count if total > 0 else None
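

# A minimal sketch of how predict_doc_phenotypes might be driven for documents
# streamed from a database. The cursor and its (doc_key, doc_anns, doc_text) row
# layout, and the pre-built model_factory / concept_mapping objects, are
# illustrative assumptions, not part of this module's API.
def _example_batch_predict(db_cursor, model_factory, concept_mapping):
    """Illustrative only: run per-document phenotype prediction over query results."""
    doc2results = {}
    for doc_key, doc_anns, doc_text in db_cursor:  # assumed row layout
        p2count = predict_doc_phenotypes(doc_key, doc_anns, doc_text,
                                         model_factory, concept_mapping)
        if p2count is not None:  # None means nothing was predicted for this doc
            doc2results[doc_key] = p2count
    return doc2results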
def load_models(self):
    for phenotype in self._phenotypes:
        logging.info('loading on [%s]' % phenotype)
        _learning_model_file = self._learning_model_dir + '/%s.lm' % phenotype
        if not exists(_learning_model_file):
            # no previously learnt model exists for this phenotype - skip it
            self._no_model_labels.append(phenotype)
            continue
        self._phenotype2model_file_pattern[phenotype] = \
            self._learning_model_dir + '/' + phenotype + '_%s_DT.model'
        lm = LabelModel.deserialise(_learning_model_file)
        lm.max_dimensions = 30
        self._phenotype2model[phenotype] = lm
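

# The path patterns above imply a flat model-directory layout: one serialised
# '<phenotype>.lm' LabelModel per phenotype, plus per-label
# '<phenotype>_<label>_DT.model' classifier files in the same directory.
# A small sketch (hypothetical helper, not part of the class) to check which
# phenotypes actually have a learnt model before calling load_models():
def _example_available_phenotypes(learning_model_dir, phenotypes):
    """Illustrative only: split phenotypes by whether a .lm file exists on disk."""
    available, missing = [], []
    for p in phenotypes:
        (available if exists('%s/%s.lm' % (learning_model_dir, p)) else missing).append(p)
    return available, missing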
def predict(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)

    doc2predicted = {}
    no_models_labels = []
    for phenotype in _labels:
        logging.info('working on [%s]' % phenotype)
        _learning_model_file = _learning_model_dir + '/%s.lm' % phenotype
        if not exists(_learning_model_file):
            # no previously learnt model exists for this phenotype - skip it
            no_models_labels.append(phenotype)
            continue
        _ml_model_file_ptn = _learning_model_dir + '/' + phenotype + '_%s_DT.model'
        lm = LabelModel.deserialise(_learning_model_file)
        # pass the concept2mapping object to the label model instance
        lm.concept_mapping = _cm_obj
        lm.max_dimensions = 30
        data = lm.load_data_for_predict(ann_dir=ann_dir,
                                        ignore_mappings=ignore_mappings,
                                        ignore_context=True,
                                        separate_by_label=True,
                                        full_text_dir=test_text_dir)
        for lbl in data['lbl2data']:
            logging.debug(data['lbl2data'][lbl]['X'])
        # predict over all labels of this phenotype in a single call
        label_model_predict(lm, _ml_model_file_ptn, data['lbl2data'], doc2predicted)
    return doc2predicted, no_models_labels
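

# A sketch of the settings dict predict() expects. The keys are exactly those
# read above; the paths are placeholders, and the per-key comments are
# inferred from how each value is used.
_example_settings = {
    'test_ann_dir': '/data/test/anns',            # semantic annotation documents
    'test_fulltext_dir': '/data/test/fulltext',   # full-text files backing the annotations
    'concept_mapping_file': '/data/concept_mapping.json',
    'learning_model_dir': '/data/models',         # holds <phenotype>.lm and *_DT.model files
    'entity_types_file': '/data/phenotypes.txt',  # presumably one phenotype label per line
    'ignore_mapping_file': '/data/ignore_mappings.json',
}
# doc2predicted, no_models_labels = predict(_example_settings)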
def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance,
                  pca_model_file=None, max_dimension=None, ignore_mappings=[],
                  ignore_context=False, separate_by_label=False, full_text_dir=None,
                  file_pattern='%s-ann.xml', id2conll=None, label_whitelist=None,
                  eHostGD=False, mention_pattern=None):
    if id2conll is None:
        id2conll = {}
    lm = LabelModel.deserialise(model_file)
    lm.max_dimensions = max_dimension
    data = lm.load_data(test_ann_dir, test_gold_dir, ignore_mappings=ignore_mappings,
                        ignore_context=ignore_context, separate_by_label=separate_by_label,
                        verbose=False, ful_text_dir=full_text_dir, eHostGD=eHostGD,
                        annotated_anns=_annotated_anns)
    files = data['files']
    for d in files:
        d = d.replace('se_ann_', '')
        if d not in id2conll:
            id2conll[d] = ConllDoc(join(test_gold_dir, file_pattern % d))
            if label_whitelist is not None:
                id2conll[d].set_label_white_list(label_whitelist)
    lbl2performances = {}
    for lbl in data['lbl2data']:
        this_performance = LabelPerformance(lbl)
        X = data['lbl2data'][lbl]['X']
        Y = data['lbl2data'][lbl]['Y']
        mtp = data['lbl2data'][lbl]['multiple_tps']
        doc_anns = data['lbl2data'][lbl]['doc_anns']
        mp_predicted = None
        if mention_pattern is not None:
            mp_predicted = mention_pattern.predict(doc_anns)
        if lbl in lm.rare_labels:
            logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl]))
            PhenomeLearners.predict_use_simple_stats(
                lm.rare_labels[lbl], Y, mtp, performance,
                separate_performance=this_performance,
                id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
                doc_folder=test_gold_dir, label_whitelist=label_whitelist,
                mp_predicted=mp_predicted
            )
        else:
            if len(X) > 0:
                logging.debug('predict data: %s, dimensions %s, insts %s'
                              % (lbl, len(X[0]), len(X)))
            bc = lm.get_binary_cluster_classifier(lbl)
            if bc is not None:
                complementary_classifiers = []
                for l in lm.cluster_classifier_dict:
                    if l != lbl:
                        complementary_classifiers.append(lm.cluster_classifier_dict[l])
                for idx in range(len(X)):
                    logging.debug('%s => %s'
                                  % (bc.classify(X[idx],
                                                 complementary_classifiers=complementary_classifiers),
                                     Y[idx]))
            PhenomeLearners.predict_use_model(X, Y, 0, mtp,
                                              ml_model_file_ptn % escape_lable_to_filename(lbl),
                                              performance,
                                              pca_model_file=pca_model_file,
                                              separate_performance=this_performance,
                                              id2conll=id2conll, doc_anns=doc_anns,
                                              file_pattern=file_pattern,
                                              doc_folder=test_gold_dir,
                                              label_whitelist=label_whitelist,
                                              mp_predicted=mp_predicted)
        lbl2performances[lbl] = this_performance
    perform_str = CustomisedRecoginiser.print_performances(lbl2performances)
    logging.debug('missed instances: %s' % data['fns'])
    performance.increase_false_negative(data['fns'])
    return perform_str
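

# A hypothetical end-to-end call of predict_label for a single phenotype.
# The directory layout and model-file pattern mirror the ones used elsewhere
# in this module; using LabelPerformance as the aggregate 'performance'
# collector is an assumption based on the per-label usage inside the function.
def _example_evaluate_phenotype(phenotype, model_dir, test_ann_dir, test_gold_dir):
    """Illustrative only: evaluate one phenotype's model against a gold standard."""
    performance = LabelPerformance(phenotype)  # assumed to double as the overall collector
    id2conll = {}
    perform_str = predict_label('%s/%s.lm' % (model_dir, phenotype),
                                test_ann_dir, test_gold_dir,
                                '%s/%s_%%s_DT.model' % (model_dir, phenotype),
                                performance,
                                separate_by_label=True,
                                id2conll=id2conll)
    logging.info(perform_str)
    return performance, id2conll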
def learn_prediction_model(label, ann_dir=None, gold_dir=None, model_file=None, model_dir=None,
                           ml_model_file_ptn=None, pca_dim=None, pca_model_file=None,
                           max_dimension=None, ignore_mappings=[], viz_file=None,
                           ignore_context=False, separate_by_label=False, full_text_dir=None,
                           eHostGD=False):
    model_changed = False
    if model_file is not None:
        lm = LabelModel.deserialise(model_file)
    else:
        model_changed = True
        lm = LabelModel(label, _cm_obj)
        lm.collect_tfidf_dimensions(ann_dir=ann_dir, gold_dir=gold_dir,
                                    ignore_context=ignore_context,
                                    separate_by_label=separate_by_label,
                                    full_text_dir=full_text_dir, eHostGD=eHostGD)
    lm.use_one_dimension_for_label = False
    lm.max_dimensions = max_dimension
    if ann_dir is not None:
        # bad_labels = lm.get_low_quality_labels(ann_dir, gold_dir)
        # logging.info(bad_labels)
        bad_labels = []
        data = lm.load_data(ann_dir, gold_dir, ignore_mappings=bad_labels,
                            ignore_context=ignore_context, separate_by_label=separate_by_label,
                            ful_text_dir=full_text_dir, eHostGD=eHostGD,
                            annotated_anns=_annotated_anns)
        # if separate_by_label:
        for lbl in data['lbl2data']:
            X = data['lbl2data'][lbl]['X']
            Y = data['lbl2data'][lbl]['Y']
            n_true = 0
            for y in Y:
                if y == [1]:
                    n_true += 1
            logging.debug('training data: %s, dimensions %s, insts %s'
                          % (lbl, len(X[0]), len(X)))
            if len(X) <= _min_sample_size:
                # too few instances to train a classifier - fall back to simple stats
                lm.add_rare_label(lbl, n_true * 1.0 / len(X))
                continue
            # ignore_mappings += data['bad_labels']
            PhenomeLearners.random_forest_learning(
                X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
            # lm.svm_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
            # lm.gaussian_nb(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
            logging.debug('%s, #insts: %s, #tps: %s' % (lbl, len(X), n_true))
    if model_dir is not None and model_changed:
        lm.serialise(join(model_dir, '%s.lm' % label))
        logging.debug('%s.lm saved' % label)
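

# A hypothetical train-then-evaluate sequence combining learn_prediction_model
# and predict_label. Directory arguments are placeholders, and the per-label
# '_%s_DT.model' pattern mirrors the one used elsewhere in this module.
def _example_train_and_evaluate(phenotype, train_ann_dir, train_gold_dir,
                                test_ann_dir, test_gold_dir, model_dir):
    """Illustrative only: learn a model for one phenotype, then test it."""
    ml_model_file_ptn = '%s/%s_%%s_DT.model' % (model_dir, phenotype)
    # with model_file=None, a new LabelModel is built and serialised
    # to <model_dir>/<phenotype>.lm at the end of training
    learn_prediction_model(phenotype,
                           ann_dir=train_ann_dir, gold_dir=train_gold_dir,
                           model_dir=model_dir,
                           ml_model_file_ptn=ml_model_file_ptn,
                           separate_by_label=True)
    performance = LabelPerformance(phenotype)  # assumed aggregate collector, as above
    return predict_label('%s/%s.lm' % (model_dir, phenotype),
                         test_ann_dir, test_gold_dir,
                         ml_model_file_ptn, performance,
                         separate_by_label=True, id2conll={})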