def test_ds_project(self):
    """
    Project dependency structures for the first two instances loaded from
    ``xigt/index_error.xml`` and compare each result with a hand-written
    expected tree.

    For each instance the test runs ``heur_align_inst``,
    ``parse_translation_line(..., dt=True)`` and ``project_ds_tier`` before
    reading the projected tree back with ``get_lang_ds``.
    """
    xc = xc_load(os.path.join(testfile_dir, 'xigt/index_error.xml'), do_basic_processing=True)

    # --- First instance -------------------------------------------------
    inst = xc[0]
    heur_align_inst(inst)
    parse_translation_line(inst, dt=True)
    project_ds_tier(inst)
    proj_t = get_lang_ds(inst)

    tgt_t = DepTree.fromstring("""(ROOT[0] (salli-i[2] (Jumala[1]) (sata-a[4] ([[3])) (rake-i-ta[5]) (ja[6]) (tuhka-a[7] (].[8]))))""", stype=DEPSTR_PTB)
    self.assertTrue(tgt_t.similar(proj_t))

    # --- Second instance ------------------------------------------------
    inst2 = xc[1]
    heur_align_inst(inst2)
    parse_translation_line(inst2, dt=True)
    project_ds_tier(inst2)
    print(inst2)

    tgt2_t = DepTree.fromstring("""(ROOT[0] (unohta-a[2] (*Minua[1]) (unohda-n[4] (/Minä[3]) (/laula-tta-a[6] (pelo-tta-a[5])) ) )) """, stype=DEPSTR_PTB)

    # ``assertTrue(value, other)`` uses ``other`` as the failure *message*,
    # so the previous call only checked that the projected tree was truthy.
    # Compare the trees the same way the first half of this test does.
    self.assertTrue(tgt2_t.similar(get_lang_ds(inst2)))
def test_conll(self):
    """
    Render the dependency structure of ``self.inst2`` as CoNLL text
    (``lowercase=False``) and check that it equals the expected text once
    leading/trailing whitespace has been stripped from both.
    """
    # Adjacent literals concatenate into exactly the original single string;
    # each fragment keeps its leading space so the joined value is unchanged.
    expected = (
        " 1 Was _ PRON PRON _ 2 _ _ _"
        " 2 glaubst _ VERB VERB _ 0 root _ _"
        " 3 Du _ PRON PRON _ 2 nsubj _ _"
        " 4 wer _ PRON PRON _ 2 dobj _ _"
        " 5 angerufen _ VERB VERB _ 2 dep _ _"
        " 6 hat _ VERB VERB _ 5 _ _ _"
    )

    rendered = get_lang_ds(self.inst2).to_conll(lowercase=False)

    self.assertEqual(rendered.strip(), expected.strip())
def eval_method(aln_method):
    """
    Score one alignment method against the gold dependency edges.

    Uses ``gold_ds``, ``plma``, ``lang`` and ``inst`` from the surrounding
    scope; nothing is returned, results are recorded through ``plma.add``.

    :param aln_method: value forwarded to ``project_ds_tier`` as
                       ``proj_aln_method`` and used as the key in ``plma``.
    """
    expected_edges = set(gold_ds.to_indices())

    # Record the comparisons (one per gold edge) up front with zero matches,
    # so they are counted even when the projection below fails.
    plma.add(lang, aln_method, 0, len(expected_edges))

    try:
        project_ds_tier(inst, proj_aln_method=aln_method, ds_source=INTENT_DS_PARSER)
        projected_ds = get_lang_ds(inst, parse_method=INTENT_DS_PROJ)
        projected_edges = set(projected_ds.to_indices())

        # Only the matches are added here; the compares were recorded above.
        shared_edges = expected_edges.intersection(projected_edges)
        plma.add(lang, aln_method, len(shared_edges), 0)
    except TreeProjectionError:
        # Projection failed: leave the tally at zero matches for this method.
        return
def evaluate_ds_projections_on_file(lang, xc, plma, outstream=sys.stdout):
    """
    Compare projected dependency structures with the manual (gold) ones for
    every instance in ``xc``, once per alignment method, and record the
    tallies in ``plma``.

    Instances for which ``get_lang_ds(..., parse_method=INTENT_DS_MANUAL)``
    returns a falsy value are skipped. For the remaining instances, each
    alignment method first has its number of comparisons (the gold edge
    count) recorded, then its number of matching edges if the projection
    succeeds. A ``TreeProjectionError`` leaves that method with zero matches
    for the instance.

    :param lang: key under which results are recorded in ``plma``
    :param xc: iterable of instances to evaluate
    :param plma: accumulator; called as ``plma.add(lang, method, matches, compares)``
    :type plma: PerLangMethodAccuracies
    :param outstream: not used by this function; kept so existing callers
                      that pass it continue to work
    """
    aln_methods = (INTENT_ALN_GIZA,
                   INTENT_ALN_GIZAHEUR,
                   INTENT_ALN_HEUR,
                   INTENT_ALN_HEURPOS,
                   INTENT_ALN_MANUAL)

    for inst in xc:
        gold_ds = get_lang_ds(inst, parse_method=INTENT_DS_MANUAL)
        if not gold_ds:
            continue

        # -------------------------------------------
        # If we have a gold standard DS, let's continue.
        # -------------------------------------------

        # The gold edges depend only on the instance, so build the set once
        # rather than once per alignment method.
        gold_edges = set(gold_ds.to_indices())

        for aln_method in aln_methods:
            # Add the number of compares (the gold edges)
            # and currently 0 for matches...
            plma.add(lang, aln_method, 0, len(gold_edges))

            # Try to do the projection
            try:
                project_ds_tier(inst, proj_aln_method=aln_method, ds_source=INTENT_DS_PARSER)
                ds = get_lang_ds(inst, parse_method=INTENT_DS_PROJ)
                tgt_edges = set(ds.to_indices())

                # Add the number of matches, with 0 compares, since we added
                # those previously.
                plma.add(lang, aln_method, len(gold_edges & tgt_edges), 0)
            except TreeProjectionError:
                # Deliberate: the compares are already counted, so a failed
                # projection simply scores zero matches for this method.
                continue
def extract_parser_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Given an IGT instance, extract the projected dependency structure from it
    (along with the POS tags from the given pos_source) and write it to
    ``output_stream`` as a CoNLL block followed by a blank line.

    :param inst: Input instance
    :param output_stream: The output stream to write the training data to.
    :param pos_source: forwarded to ``get_lang_ds`` as ``pos_source``
    :param tm: forwarded to ``to_conll`` as ``tagmap``
    :return: 1 if a dependency structure was written, otherwise 0 (no
             structure available, or one of the handled errors occurred)
    """
    try:
        ds = get_lang_ds(inst, pos_source=pos_source, unk_pos_handling=None)
        if ds is None:
            return 0

        conll_string = to_conll(ds, lang(inst),
                                lowercase=True,
                                match_punc=True,
                                clean_token=True,
                                unk_pos='UNK',
                                tagmap=tm)
        output_stream.write(conll_string + '\n\n')
        output_stream.flush()
        return 1

    # The exception is not bound to ``re`` so the name conventionally used
    # for the regex module is not shadowed inside this function.
    except RuntimeError as err:
        print(err)
        EXTRACT_LOG.error("Runtime error in instance {}".format(inst.id))
    except RGXigtException as rgxe:
        # ``Logger.warn`` is a deprecated alias (removed in Python 3.13);
        # ``warning`` is the supported spelling.
        # NOTE(review): assumes EXTRACT_LOG is a standard logging.Logger - confirm.
        EXTRACT_LOG.warning('Instance "{}" failed with "{}"'.format(inst.id, rgxe))

    return 0
def all_tokens_present_test(self):
    """
    Project a dependency structure onto ``self.inst`` and print its CoNLL
    rendering.

    No assertions are made; the output is only written to stdout.

    NOTE(review): the name does not start with ``test``, so default unittest
    discovery presumably does not pick this up - confirm that is intended.
    """
    project_ds_tier(self.inst)
    conll_text = get_lang_ds(self.inst).to_conll()
    print(conll_text)