Example #1
def predict_doc_phenotypes(doc_key,
                           doc_anns,
                           doc_text,
                           model_factory,
                           concept_mapping,
                           ignore_mappings=None,
                           mention_pattern=None):
    """
    load a document and do all phenotype predictions in one go
    this is designed for large amounts of documents to be loaded, for example, from databases
    :param doc_key:
    :param doc_anns:
    :param doc_text:
    :param model_factory:
    :param concept_mapping:
    :param ignore_mappings:
    :param mention_pattern:
    :return:
    """
    if ignore_mappings is None:
        ignore_mappings = []
    cr = CustomisedRecoginiser(doc_key,
                               concept_mapping=concept_mapping,
                               ann_doc=doc_anns)
    cr.full_text = doc_text
    p2count = {}
    total = 0
    for p in model_factory.phenotypes:
        lm = model_factory.get_model_by_phenotype(p)
        if lm is None:
            logging.info('no model found for phenotype %s' % p)
            continue
        lbl2data = {}
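        # collect the document's per-label feature instances into lbl2data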
        LabelModel.read_one_ann_doc(lm,
                                    cr,
                                    doc_key,
                                    lbl2data=lbl2data,
                                    ignore_mappings=ignore_mappings,
                                    ignore_context=True,
                                    separate_by_label=True)
        doc2predicted = {}
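        # run this phenotype's trained classifiers; predictions for the
        # document are collected into doc2predicted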
        label_model_predict(lm,
                            model_factory.model_file_pattern(p),
                            lbl2data,
                            doc2predicted,
                            mention_pattern=mention_pattern,
                            mention_prediction_param=cr)
        if doc_key in doc2predicted:
            p2count[p] = {
                'freq': len(doc2predicted[doc_key]),
                'cui2freq': collect_phenotype_concept(doc2predicted[doc_key])
            }
            total += 1
    return p2count if total > 0 else None
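
A minimal usage sketch follows, assuming the class in Example #2 is the model
factory exercised here. PhenotypeModelFactory, fetch_documents_from_db and the
row fields are illustrative stand-ins, not names confirmed by the source, and
concept_mapping would be a Concept2Mapping instance as built in Example #3.

# hypothetical invocation of predict_doc_phenotypes
factory = PhenotypeModelFactory()
factory.load_models()
for row in fetch_documents_from_db():
    result = predict_doc_phenotypes(row['id'],
                                    row['annotations'],
                                    row['text'],
                                    factory,
                                    concept_mapping)
    if result is not None:
        for phenotype, stats in result.items():
            print(phenotype, stats['freq'], stats['cui2freq'])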
Example #2
    def load_models(self):
        for phenotype in self._phenotypes:
            logging.info('loading model for [%s]' % phenotype)
            _learning_model_file = self._learning_model_dir + '/%s.lm' % phenotype

            if not exists(_learning_model_file):
                # skip if no previously learnt model exists
                self._no_model_labels.append(phenotype)
                continue

            self._phenotype2model_file_pattern[phenotype] = \
                self._learning_model_dir + '/' + phenotype + '_%s_DT.model'

            lm = LabelModel.deserialise(_learning_model_file)
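            # cap the model's feature dimensions (30 is also used in Example #3)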
            lm.max_dimensions = 30
            self._phenotype2model[phenotype] = lm
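
Example #1 calls model_factory.phenotypes, get_model_by_phenotype and
model_file_pattern, none of which are shown above. Given the dictionaries
populated by load_models, they would plausibly be thin lookups like the sketch
below; the method bodies are assumptions, not confirmed code.

    @property
    def phenotypes(self):
        return self._phenotypes

    def get_model_by_phenotype(self, phenotype):
        # returns None when no .lm file existed for the phenotype
        return self._phenotype2model.get(phenotype)

    def model_file_pattern(self, phenotype):
        return self._phenotype2model_file_pattern.get(phenotype)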
Example #3
def predict(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)

    doc2predicted = {}
    no_models_labels = []
    for phenotype in _labels:
        logging.info('working on [%s]' % phenotype)
        _learning_model_file = _learning_model_dir + '/%s.lm' % phenotype

        if not exists(_learning_model_file):
            # skip if no previously learnt model exists
            no_models_labels.append(phenotype)
            continue

        _ml_model_file_ptn = _learning_model_dir + '/' + phenotype + '_%s_DT.model'

        lm = LabelModel.deserialise(_learning_model_file)
        # pass the concept2mapping object to the label model instance
        lm.concept_mapping = _cm_obj
        lm.max_dimensions = 30
        data = lm.load_data_for_predict(ann_dir=ann_dir,
                                        ignore_mappings=ignore_mappings,
                                        ignore_context=True,
                                        separate_by_label=True,
                                        full_text_dir=test_text_dir)
        # label_model_predict consumes the whole lbl2data dict (cf. Example #1),
        # so it only needs to run once per phenotype, not once per label
        for lbl in data['lbl2data']:
            logging.debug(data['lbl2data'][lbl]['X'])
        label_model_predict(lm, _ml_model_file_ptn, data['lbl2data'],
                            doc2predicted)
    return doc2predicted, no_models_labels
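
predict() only reads settings by key, so a minimal settings dict would look
like the sketch below; every key shown is dereferenced above, but the paths
themselves are placeholders.

# illustrative settings; all paths are placeholders
settings = {
    'test_ann_dir': './test/anns',
    'test_fulltext_dir': './test/fulltext',
    'concept_mapping_file': './config/concept_mapping.json',
    'learning_model_dir': './models',
    'entity_types_file': './config/entity_types.txt',
    'ignore_mapping_file': './config/ignore_mappings.json'
}
doc2predicted, labels_without_models = predict(settings)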
Example #4
def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance,
                  pca_model_file=None,
                  max_dimension=None,
                  ignore_mappings=None,
                  ignore_context=False,
                  separate_by_label=False,
                  full_text_dir=None,
                  file_pattern='%s-ann.xml',
                  id2conll=None,
                  label_whitelist=None,
                  eHostGD=False,
                  mention_pattern=None):
    if ignore_mappings is None:
        ignore_mappings = []
    if id2conll is None:
        id2conll = {}
    lm = LabelModel.deserialise(model_file)
    lm.max_dimensions = max_dimension
    # 'ful_text_dir' is the keyword load_data actually expects;
    # _annotated_anns is a module-level global in the source
    data = lm.load_data(test_ann_dir, test_gold_dir, ignore_mappings=ignore_mappings, ignore_context=ignore_context,
                        separate_by_label=separate_by_label, verbose=False, ful_text_dir=full_text_dir, eHostGD=eHostGD,
                        annotated_anns=_annotated_anns)

    files = data['files']
    for d in files:
        d = d.replace('se_ann_', '')
        if d not in id2conll:
            id2conll[d] = ConllDoc(join(test_gold_dir, file_pattern % d))
            if label_whitelist is not None:
                id2conll[d].set_label_white_list(label_whitelist)
    lbl2performances = {}
    for lbl in data['lbl2data']:
        this_performance = LabelPerformance(lbl)
        X = data['lbl2data'][lbl]['X']
        Y = data['lbl2data'][lbl]['Y']
        mtp = data['lbl2data'][lbl]['multiple_tps']
        doc_anns = data['lbl2data'][lbl]['doc_anns']
        mp_predicted = None
        if mention_pattern is not None:
            mp_predicted = mention_pattern.predict(doc_anns)
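        # rare labels have no trained classifier; they are predicted from the
        # simple prevalence statistic recorded at training time (see Example #5)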
        if lbl in lm.rare_labels:
            logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl]))
            PhenomeLearners.predict_use_simple_stats(
                lm.rare_labels[lbl], Y, mtp,
                performance, separate_performance=this_performance,
                id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
                doc_folder=test_gold_dir,
                label_whitelist=label_whitelist, mp_predicted=mp_predicted
            )
        else:
            if len(X) > 0:
                logging.debug('predict data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X)))
            bc = lm.get_binary_cluster_classifier(lbl)
            if bc is not None:
                complementary_classifiers = []
                for l in lm.cluster_classifier_dict:
                    if l != lbl:
                        complementary_classifiers.append(lm.cluster_classifier_dict[l])
                for idx in range(len(X)):
                    logging.debug(
                        '%s => %s' % (bc.classify(X[idx], complementary_classifiers=complementary_classifiers), Y[idx]))
            PhenomeLearners.predict_use_model(X, Y, 0, mtp, ml_model_file_ptn % escape_lable_to_filename(lbl),
                                              performance,
                                              pca_model_file=pca_model_file,
                                              separate_performance=this_performance,
                                              id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
                                              doc_folder=test_gold_dir,
                                              label_whitelist=label_whitelist, mp_predicted=mp_predicted)
        lbl2performances[lbl] = this_performance
    perform_str = CustomisedRecoginiser.print_performances(lbl2performances)
    logging.debug('missed instances: %s' % data['fns'])
    performance.increase_false_negative(data['fns'])
    return perform_str
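
A minimal invocation sketch; treating LabelPerformance as the aggregate
`performance` object is an assumption (the source only shows it used
per-label), the module-level _annotated_anns global must already be set, and
the label name and paths below are placeholders.

# sketch only: paths and the 'HCC' label are illustrative
overall = LabelPerformance('overall')
perform_str = predict_label('./models/HCC.lm',
                            './test/anns',
                            './test/gold',
                            './models/HCC_%s_DT.model',
                            overall,
                            max_dimension=30,
                            separate_by_label=True)
print(perform_str)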
Example #5
def learn_prediction_model(label, ann_dir=None, gold_dir=None, model_file=None, model_dir=None,
                           ml_model_file_ptn=None,
                           pca_dim=None,
                           pca_model_file=None,
                           max_dimension=None,
                           ignore_mappings=None,
                           viz_file=None,
                           ignore_context=False,
                           separate_by_label=False,
                           full_text_dir=None,
                           eHostGD=False):
    model_changed = False
    if model_file is not None:
        lm = LabelModel.deserialise(model_file)
    else:
        model_changed = True
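        # _cm_obj is a module-level Concept2Mapping instance (see Example #3)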
        lm = LabelModel(label, _cm_obj)
        lm.collect_tfidf_dimensions(ann_dir=ann_dir, gold_dir=gold_dir, ignore_context=ignore_context,
                                    separate_by_label=separate_by_label, full_text_dir=full_text_dir, eHostGD=eHostGD)
    lm.use_one_dimension_for_label = False
    lm.max_dimensions = max_dimension
    if ann_dir is not None:
        # bad_labels = lm.get_low_quality_labels(ann_dir, gold_dir)
        # logging.info(bad_labels)
        bad_labels = []
        # _annotated_anns is a module-level global in the source
        data = lm.load_data(ann_dir, gold_dir, ignore_mappings=bad_labels, ignore_context=ignore_context,
                            separate_by_label=separate_by_label, ful_text_dir=full_text_dir, eHostGD=eHostGD,
                            annotated_anns=_annotated_anns)
        # data['lbl2data'] maps each label to its training instances
        for lbl in data['lbl2data']:
            X = data['lbl2data'][lbl]['X']
            Y = data['lbl2data'][lbl]['Y']
            # count positive instances (each Y entry is a one-element list)
            n_true = sum(1 for y in Y if y == [1])
            if len(X) == 0:
                # no instances for this label; skip to avoid a division by zero
                continue
            logging.debug('training data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X)))
            if len(X) <= _min_sample_size:
                # too few instances to train a classifier: record the label's
                # positive rate instead (_min_sample_size is a module global)
                lm.add_rare_label(lbl, n_true * 1.0 / len(X))
                continue
            # ignore_mappings += data['bad_labels']
            PhenomeLearners.random_forest_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
            # lm.svm_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
            # lm.gaussian_nb(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
            logging.debug('%s, #insts: %s, #tps: %s' % (lbl, len(X), n_true))

    if model_dir is not None and model_changed:
        lm.serialise(join(model_dir, '%s.lm' % label))
        logging.debug('%s.lm saved' % label)
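
A training-time invocation sketch; the module-level globals _cm_obj,
_min_sample_size and _annotated_anns must already be set (see Examples #3
and #4), and the label name and paths below are placeholders.

# sketch only: trains a model for one label from scratch
learn_prediction_model('HCC',
                       ann_dir='./train/anns',
                       gold_dir='./train/gold',
                       model_dir='./models',
                       ml_model_file_ptn='./models/HCC_%s_DT.model',
                       max_dimension=30,
                       separate_by_label=True)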