def populate_validation_results():
    """Gather SemEHR results for every annotation file and print them.

    The key of each regular file in ``_ann_dir`` is the part of its name
    before the first '.'. Every key is handed to ``populate_semehr_results``
    together with ``_gold_dir``, ``_ann_dir`` and one shared performance
    dictionary, which is finally passed to
    ``CustomisedRecoginiser.print_performances``.
    """
    gold_folder = _gold_dir
    ann_folder = _ann_dir
    performances_by_label = {}

    # Collect all keys up front, before any result population starts.
    doc_keys = []
    for entry in listdir(ann_folder):
        if isfile(join(ann_folder, entry)):
            doc_keys.append(entry.split('.')[0])

    for doc_key in doc_keys:
        populate_semehr_results(gold_folder, ann_folder, doc_key,
                                performances_by_label, using_combined=False)

    CustomisedRecoginiser.print_performances(performances_by_label)
def do_learn_exp(viz_file, num_dimensions=None, ignore_context=False, separate_by_label=False,
                 conll_output_file=None, eHostGD=False, mention_pattern=None):
    """Learn and evaluate a prediction model for each label in ``_labels``.

    For every label and every dimension setting, a model is learnt with
    ``learn_prediction_model`` from the training folders (``_ann_dir``,
    ``_gold_dir``, ``_gold_text_dir``) and then evaluated with
    ``predict_label`` on the test folders (``_test_ann_dir``,
    ``_test_gold_dir``, ``_test_text_dir``). One ``LabelPerformance`` is kept
    per ``'<label> dim[<dim>]'`` key.

    Args:
        viz_file: forwarded to ``learn_prediction_model``.
        num_dimensions: iterable of dimension settings to try; defaults to
            ``[20]`` when omitted or ``None``.
        ignore_context: forwarded to learning and prediction.
        separate_by_label: forwarded to learning and prediction.
        conll_output_file: accepted for interface compatibility; not used by
            this function.
        eHostGD: forwarded to learning and prediction.
        mention_pattern: forwarded to ``predict_label``.

    Returns:
        The value of the last ``CustomisedRecoginiser.print_performances``
        call over all accumulated results, or ``''`` when ``_labels`` is
        empty.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if num_dimensions is None:
        num_dimensions = [20]

    results = {}
    id2conll = {}  # shared across labels so each document is loaded once
    result_str = ''
    for lbl in _labels:
        logging.info('working on [%s]', lbl)
        label_model_file = _learning_model_dir + '/%s.lm' % lbl
        # The remaining '%s' placeholder is filled in by the callees.
        ml_model_file_ptn = _learning_model_dir + '/' + lbl + '_%s_DT.model'
        pca_model_file = None
        pca_dim = None

        # Ignore-mappings are keyed by the label without its 'neg_' part.
        base_label = lbl.replace('neg_', '')
        ignore_mappings = _ignore_mappings.get(base_label, [])

        # NOTE: removal of previously learnt '*.model' files from
        # _learning_model_dir used to happen here but is disabled (it was
        # commented out), so stale model files are left in place.
        for dim in num_dimensions:
            logging.info('dimension setting: %s', dim)
            learn_prediction_model(lbl,
                                   ann_dir=_ann_dir,
                                   gold_dir=_gold_dir,
                                   ml_model_file_ptn=ml_model_file_ptn,
                                   model_dir=_learning_model_dir,
                                   pca_dim=pca_dim,
                                   pca_model_file=pca_model_file,
                                   max_dimension=dim,
                                   ignore_mappings=ignore_mappings,
                                   viz_file=viz_file,
                                   ignore_context=ignore_context,
                                   separate_by_label=separate_by_label,
                                   full_text_dir=_gold_text_dir,
                                   eHostGD=eHostGD)
            logging.debug('bad labels: %s', ignore_mappings)

            pl = '%s dim[%s]' % (lbl, dim)
            performance = LabelPerformance(pl)
            results[pl] = performance
            predict_label(label_model_file,
                          _test_ann_dir,
                          _test_gold_dir,
                          ml_model_file_ptn,
                          performance,
                          pca_model_file=pca_model_file,
                          max_dimension=dim,
                          ignore_mappings=ignore_mappings,
                          ignore_context=ignore_context,
                          separate_by_label=separate_by_label,
                          full_text_dir=_test_text_dir,
                          file_pattern=_gold_file_pattern,
                          id2conll=id2conll,
                          label_whitelist=_labels,
                          eHostGD=eHostGD,
                          mention_pattern=mention_pattern)
        # NOTE(review): reported once per label over everything accumulated so
        # far; nesting reconstructed from whitespace-collapsed source - confirm.
        result_str = CustomisedRecoginiser.print_performances(results)
    return result_str
def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance,
                  pca_model_file=None,
                  max_dimension=None,
                  ignore_mappings=None,
                  ignore_context=False,
                  separate_by_label=False,
                  full_text_dir=None,
                  file_pattern='%s-ann.xml',
                  id2conll=None,
                  label_whitelist=None,
                  eHostGD=False,
                  mention_pattern=None):
    """Run a learnt label model over test data and record its performance.

    The ``LabelModel`` is deserialised from ``model_file`` and used to load
    the test data. A ``ConllDoc`` is registered in ``id2conll`` for every
    loaded file that is not there yet. Each label in the loaded data is then
    predicted either with simple statistics (labels listed in
    ``lm.rare_labels``) or with the learnt model file for that label.

    Args:
        model_file: path of the serialised ``LabelModel``.
        test_ann_dir: annotation folder passed to ``lm.load_data``.
        test_gold_dir: gold-standard folder; also the folder the ``ConllDoc``
            paths are built from.
        ml_model_file_ptn: '%'-pattern giving the model file of a label.
        performance: overall performance object, updated by the predictors
            and with the missed instances (``data['fns']``).
        pca_model_file: forwarded to ``PhenomeLearners.predict_use_model``.
        max_dimension: assigned to ``lm.max_dimensions``.
        ignore_mappings: forwarded to ``lm.load_data``; defaults to a new
            empty list.
        ignore_context: forwarded to ``lm.load_data``.
        separate_by_label: forwarded to ``lm.load_data``.
        full_text_dir: forwarded to ``lm.load_data``.
        file_pattern: '%'-pattern mapping a document id to its gold file name.
        id2conll: dict of document id -> ``ConllDoc``, filled in place so a
            caller can share it between calls; a private dict is used when
            omitted.
        label_whitelist: when not ``None``, set on every newly created
            ``ConllDoc`` and forwarded to the predictors.
        eHostGD: forwarded to ``lm.load_data``.
        mention_pattern: optional object whose ``predict(doc_anns)`` result is
            forwarded to the predictors as ``mp_predicted``.

    Returns:
        The value returned by ``CustomisedRecoginiser.print_performances`` for
        the per-label performances collected in this call.
    """
    # Fresh containers per call: the previous `[]` default was shared between
    # calls, and the previous `None` default for id2conll raised TypeError on
    # the first membership test.
    if ignore_mappings is None:
        ignore_mappings = []
    if id2conll is None:
        id2conll = {}

    lm = LabelModel.deserialise(model_file)
    lm.max_dimensions = max_dimension
    # NOTE(review): the keyword really is spelled `ful_text_dir` in this call;
    # presumably it matches LabelModel.load_data's signature - verify.
    data = lm.load_data(test_ann_dir, test_gold_dir,
                        ignore_mappings=ignore_mappings,
                        ignore_context=ignore_context,
                        separate_by_label=separate_by_label,
                        verbose=False,
                        ful_text_dir=full_text_dir,
                        eHostGD=eHostGD,
                        annotated_anns=_annotated_anns)

    for file_name in data['files']:
        doc_id = file_name.replace('se_ann_', '')
        if doc_id not in id2conll:
            id2conll[doc_id] = ConllDoc(join(test_gold_dir, file_pattern % doc_id))
            # NOTE(review): whitelist applied to newly created docs only;
            # nesting reconstructed from whitespace-collapsed source - confirm.
            if label_whitelist is not None:
                id2conll[doc_id].set_label_white_list(label_whitelist)

    lbl2performances = {}
    for lbl, lbl_data in data['lbl2data'].items():
        this_performance = LabelPerformance(lbl)
        X = lbl_data['X']
        Y = lbl_data['Y']
        mtp = lbl_data['multiple_tps']
        doc_anns = lbl_data['doc_anns']

        mp_predicted = None
        if mention_pattern is not None:
            mp_predicted = mention_pattern.predict(doc_anns)

        if lbl in lm.rare_labels:
            logging.info('%s to be predicted using %s', lbl, lm.rare_labels[lbl])
            PhenomeLearners.predict_use_simple_stats(
                lm.rare_labels[lbl], Y, mtp, performance,
                separate_performance=this_performance,
                id2conll=id2conll,
                doc_anns=doc_anns,
                file_pattern=file_pattern,
                doc_folder=test_gold_dir,
                label_whitelist=label_whitelist,
                mp_predicted=mp_predicted
            )
        else:
            # Explicit length test: X may be an array type whose truth value
            # is not defined.
            if len(X) > 0:
                logging.debug('predict data: %s, dimensions %s, insts %s',
                              lbl, len(X[0]), len(X))
            bc = lm.get_binary_cluster_classifier(lbl)
            if bc is not None:
                # Classifiers of all other labels act as complements.
                complementary_classifiers = [
                    classifier
                    for other_lbl, classifier in lm.cluster_classifier_dict.items()
                    if other_lbl != lbl
                ]
                # Debug-only trace of the cluster classifier against the gold
                # values; its output does not feed into the prediction below.
                for features, expected in zip(X, Y):
                    logging.debug(
                        '%s => %s',
                        bc.classify(features, complementary_classifiers=complementary_classifiers),
                        expected)
            # NOTE(review): called for every non-rare label, including empty X;
            # nesting reconstructed from whitespace-collapsed source - confirm.
            PhenomeLearners.predict_use_model(X, Y, 0, mtp,
                                              ml_model_file_ptn % escape_lable_to_filename(lbl),
                                              performance,
                                              pca_model_file=pca_model_file,
                                              separate_performance=this_performance,
                                              id2conll=id2conll,
                                              doc_anns=doc_anns,
                                              file_pattern=file_pattern,
                                              doc_folder=test_gold_dir,
                                              label_whitelist=label_whitelist,
                                              mp_predicted=mp_predicted)
        lbl2performances[lbl] = this_performance

    perform_str = CustomisedRecoginiser.print_performances(lbl2performances)
    logging.debug('missed instances: %s', data['fns'])
    performance.increase_false_negative(data['fns'])
    return perform_str