Ejemplo n.º 1
0
def populate_validation_results():
    """Collect per-label results for every file in the annotation folder.

    Reads the module-level ``_gold_dir`` and ``_ann_dir`` settings. Each
    regular file found in ``_ann_dir`` contributes one key (the file name up
    to its first dot), and ``populate_semehr_results`` is called once per key
    with a shared accumulator dict. The accumulator is finally handed to
    ``CustomisedRecoginiser.print_performances``.

    Nothing is returned.
    """
    gold_folder = _gold_dir
    ann_folder = _ann_dir

    performances_by_label = {}

    # Gather every key before processing anything, so the directory listing
    # is fully resolved up front.
    doc_keys = []
    for entry in listdir(ann_folder):
        if isfile(join(ann_folder, entry)):
            doc_keys.append(entry.partition('.')[0])

    for doc_key in doc_keys:
        populate_semehr_results(gold_folder, ann_folder, doc_key, performances_by_label, using_combined=False)

    CustomisedRecoginiser.print_performances(performances_by_label)
Ejemplo n.º 2
0
def do_learn_exp(viz_file, num_dimensions=[20], ignore_context=False, separate_by_label=False, conll_output_file=None,
                 eHostGD=False, mention_pattern=None):
    """Train and then evaluate a model for every label in ``_labels``.

    For each label and each entry of ``num_dimensions`` the function calls
    ``learn_prediction_model`` on the training folders (``_ann_dir`` /
    ``_gold_dir``) and then ``predict_label`` on the test folders
    (``_test_ann_dir`` / ``_test_gold_dir``). One ``LabelPerformance`` object
    per ``'<label> dim[<dim>]'`` key is accumulated across all labels.

    :param viz_file: forwarded to ``learn_prediction_model``.
    :param num_dimensions: iterable of dimension settings to try; only read,
        never modified here.
    :param ignore_context: forwarded to both the learning and predicting steps.
    :param separate_by_label: forwarded to both the learning and predicting steps.
    :param conll_output_file: accepted but not used by this function.
    :param eHostGD: forwarded to both the learning and predicting steps.
    :param mention_pattern: forwarded to ``predict_label``.
    :return: the value of the last
        ``CustomisedRecoginiser.print_performances`` call, or ``''`` when
        ``_labels`` is empty.
    """
    performances = {}
    conll_docs = {}
    summary = ''

    # PCA is not used in this experiment: both settings stay None.
    pca_model_path = None
    pca_dimensions = None

    for label in _labels:
        logging.info('working on [%s]' % label)
        label_model_path = _learning_model_dir + '/%s.lm' % label
        classifier_path_ptn = _learning_model_dir + '/' + label + '_%s_DT.model'

        # Ignore-mappings are keyed by the label without its 'neg_' marker.
        base_label = label.replace('neg_', '')
        if base_label in _ignore_mappings:
            skipped_mappings = _ignore_mappings[base_label]
        else:
            skipped_mappings = []

        # NOTE: previously learnt '.model' files are left in place; the
        # clean-up step that used to run here is disabled.
        for dimension in num_dimensions:
            logging.info('dimension setting: %s' % dimension)
            learn_prediction_model(
                label,
                ann_dir=_ann_dir,
                gold_dir=_gold_dir,
                ml_model_file_ptn=classifier_path_ptn,
                model_dir=_learning_model_dir,
                pca_dim=pca_dimensions,
                pca_model_file=pca_model_path,
                max_dimension=dimension,
                ignore_mappings=skipped_mappings,
                viz_file=viz_file,
                ignore_context=ignore_context,
                separate_by_label=separate_by_label,
                full_text_dir=_gold_text_dir,
                eHostGD=eHostGD,
            )
            logging.debug('bad labels: %s' % skipped_mappings)

            run_key = '%s dim[%s]' % (label, dimension)
            run_performance = LabelPerformance(run_key)
            performances[run_key] = run_performance
            predict_label(
                label_model_path,
                _test_ann_dir,
                _test_gold_dir,
                classifier_path_ptn,
                run_performance,
                pca_model_file=pca_model_path,
                max_dimension=dimension,
                ignore_mappings=skipped_mappings,
                ignore_context=ignore_context,
                separate_by_label=separate_by_label,
                full_text_dir=_test_text_dir,
                file_pattern=_gold_file_pattern,
                id2conll=conll_docs,
                label_whitelist=_labels,
                eHostGD=eHostGD,
                mention_pattern=mention_pattern,
            )

        # Summarise everything gathered so far after each label.
        summary = CustomisedRecoginiser.print_performances(performances)
    return summary
Ejemplo n.º 3
0
def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance,
                  pca_model_file=None,
                  max_dimension=None,
                  ignore_mappings=[],
                  ignore_context=False,
                  separate_by_label=False,
                  full_text_dir=None,
                  file_pattern='%s-ann.xml',
                  id2conll=None,
                  label_whitelist=None,
                  eHostGD=False, mention_pattern=None):
    """Run a serialised label model over a test set and record performance.

    The model is loaded with ``LabelModel.deserialise``, the test data with
    ``lm.load_data``. Every sub-label in ``data['lbl2data']`` is predicted
    either via ``PhenomeLearners.predict_use_simple_stats`` (when the label
    is listed in ``lm.rare_labels``) or via
    ``PhenomeLearners.predict_use_model``.

    :param model_file: path passed to ``LabelModel.deserialise``.
    :param test_ann_dir: annotation folder passed to ``lm.load_data``.
    :param test_gold_dir: gold-standard folder; also used to locate the
        per-document files wrapped in ``ConllDoc``.
    :param ml_model_file_ptn: ``%``-pattern that yields the classifier file
        once filled with the escaped label name.
    :param performance: overall performance object; passed to the predictors
        and finally updated with ``data['fns']`` through
        ``increase_false_negative``.
    :param pca_model_file: forwarded to ``predict_use_model``.
    :param max_dimension: assigned to ``lm.max_dimensions`` before loading data.
    :param ignore_mappings: forwarded to ``lm.load_data``; not modified here.
    :param ignore_context: forwarded to ``lm.load_data``.
    :param separate_by_label: forwarded to ``lm.load_data``.
    :param full_text_dir: forwarded to ``lm.load_data``.
    :param file_pattern: ``%``-pattern turning a document id into a gold file name.
    :param id2conll: dict of document id -> ``ConllDoc``; missing documents
        are added in place. When omitted (``None``) a fresh local dict is used.
    :param label_whitelist: if not ``None``, applied to each newly created
        ``ConllDoc`` and forwarded to the predictors.
    :param eHostGD: forwarded to ``lm.load_data``.
    :param mention_pattern: optional object whose ``predict(doc_anns)`` result
        is forwarded to the predictors as ``mp_predicted``.
    :return: the value returned by ``CustomisedRecoginiser.print_performances``
        for the per-label performances.
    """
    if id2conll is None:
        # The default used to be used as-is, so the membership test below
        # raised TypeError whenever the caller did not supply a dict.
        id2conll = {}

    lm = LabelModel.deserialise(model_file)
    lm.max_dimensions = max_dimension
    # 'ful_text_dir' is the keyword spelling this call has always used.
    data = lm.load_data(test_ann_dir, test_gold_dir, ignore_mappings=ignore_mappings, ignore_context=ignore_context,
                        separate_by_label=separate_by_label, verbose=False, ful_text_dir=full_text_dir, eHostGD=eHostGD,
                        annotated_anns=_annotated_anns)

    # Make sure every test document has a ConllDoc entry.
    for doc_id in data['files']:
        doc_id = doc_id.replace('se_ann_', '')
        if doc_id not in id2conll:
            id2conll[doc_id] = ConllDoc(join(test_gold_dir, file_pattern % doc_id))
            if label_whitelist is not None:
                id2conll[doc_id].set_label_white_list(label_whitelist)

    lbl2performances = {}
    for lbl in data['lbl2data']:
        lbl_data = data['lbl2data'][lbl]
        this_performance = LabelPerformance(lbl)
        X = lbl_data['X']
        Y = lbl_data['Y']
        mtp = lbl_data['multiple_tps']
        doc_anns = lbl_data['doc_anns']

        mp_predicted = None
        if mention_pattern is not None:
            mp_predicted = mention_pattern.predict(doc_anns)

        if lbl in lm.rare_labels:
            logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl]))
            PhenomeLearners.predict_use_simple_stats(
                lm.rare_labels[lbl], Y, mtp,
                performance, separate_performance=this_performance,
                id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
                doc_folder=test_gold_dir,
                label_whitelist=label_whitelist, mp_predicted=mp_predicted
            )
        else:
            if len(X) > 0:
                logging.debug('predict data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X)))
            bc = lm.get_binary_cluster_classifier(lbl)
            if bc is not None:
                # Classifiers of all the other labels.
                complementary_classifiers = [
                    lm.cluster_classifier_dict[other_lbl]
                    for other_lbl in lm.cluster_classifier_dict
                    if other_lbl != lbl
                ]
                # Debug trace only: the classification result is not used
                # by the prediction call below.
                for idx, instance in enumerate(X):
                    logging.debug(
                        '%s => %s' % (bc.classify(instance, complementary_classifiers=complementary_classifiers),
                                      Y[idx]))
            PhenomeLearners.predict_use_model(X, Y, 0, mtp, ml_model_file_ptn % escape_lable_to_filename(lbl),
                                              performance,
                                              pca_model_file=pca_model_file,
                                              separate_performance=this_performance,
                                              id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
                                              doc_folder=test_gold_dir,
                                              label_whitelist=label_whitelist, mp_predicted=mp_predicted)
        lbl2performances[lbl] = this_performance

    perform_str = CustomisedRecoginiser.print_performances(lbl2performances)
    logging.debug('missed instances: %s' % data['fns'])
    performance.increase_false_negative(data['fns'])
    return perform_str