def test_align_extract(self):
    """
    Check that each stored translation/gloss alignment method on the first
    instance of the test corpus yields the expected alignment pairs.
    """
    inst = copy_xigt(self.xc[0])

    # Retrieve the alignment stored for every method first, in a fixed order.
    methods = (INTENT_ALN_HEUR,
               INTENT_ALN_HEURPOS,
               INTENT_ALN_GIZA,
               INTENT_ALN_GIZAHEUR)
    retrieved = [get_trans_gloss_alignment(inst, aln_method=method)
                 for method in methods]

    # Expected alignments, listed in the same order as ``methods``.
    expected = [
        Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (8, 7), (11, 8)]),
        Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 12),
                   (8, 7), (9, 10), (11, 8), (12, 10), (13, 11)]),
        Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 5),
                   (9, 7), (10, 12), (11, 8), (12, 12), (13, 11), (14, 12)]),
        Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 5),
                   (9, 7), (10, 12), (11, 8), (12, 12), (13, 11), (14, 11)]),
    ]

    for found, wanted in zip(retrieved, expected):
        self.assertEqual(found, wanted)
def add_corpus(self, name, method, lang, xc, lang_trans=False):
    """
    Record the alignment produced by ``method`` for every instance in ``xc``
    that also carries a manual (gold) translation/gloss alignment.

    When ``lang_trans`` is true the translation/language alignment is
    recorded; otherwise the translation/gloss alignment is used.

    :type method: str
    :type xc: RGCorpus
    """
    # The getter does not depend on the instance, so pick it once.
    if lang_trans:
        fetch_alignment = get_trans_lang_alignment
    else:
        fetch_alignment = get_trans_gloss_alignment

    for inst in xc:
        # Skip instances with no gold standard alignment to compare against.
        if get_trans_gloss_alignment(inst, aln_method=ARG_ALN_MANUAL) is None:
            continue

        found = fetch_alignment(inst, aln_method=method)
        self.add_alignment(name, lang, inst.id, found)
def test_read_proj_ds_tree(self):
    """
    Project the dependency structure of ``self.inst2``'s translation onto its
    language words and check that the result is structurally equal to the
    expected tree.
    """
    instance = self.inst2

    source_tree = get_ds(instance, trans(instance))
    target_words = lang(instance)
    alignment = get_trans_gloss_alignment(instance)

    expected_tree = DepTree.fromstring(""" (ROOT[0] (glaubst[2] (Was[1]) (Du[3]) (wer[4]) (angerufen[5] (hat[6])) )) """, stype=DEPSTR_PTB)

    projected_tree = project_ds(source_tree, target_words, alignment)
    self.assertTrue(projected_tree.structurally_eq(expected_tree))
def evaluate_heuristic_methods_on_file(f, xc, mas, classifier_obj, tagger_obj, lang, pool=None, lock=None):
    """
    Run a fixed series of heuristic-alignment configurations over every
    instance in ``xc`` that has a manual alignment, registering each result
    with ``mas`` under the configuration's label.

    ``f`` is only used (via its basename) in the log message. ``pool`` and
    ``lock`` are accepted but not used by this function.
    """
    EVAL_LOG.info('Evaluating heuristic methods on file "{}"'.format(os.path.basename(f)))

    # (label, keyword arguments) for the configurations that run on a plain
    # copy of the instance, in the order they are evaluated. ``grams`` is only
    # passed where listed, so other entries use heur_align_inst's own default.
    plain_configurations = (
        ('baseline',
         dict(lowercase=False, stem=False, tokenize=False, no_multiples=True, use_pos=False)),
        ('lowercasing',
         dict(lowercase=True, stem=False, tokenize=False, no_multiples=True, use_pos=False)),
        ('Tokenization',
         dict(lowercase=True, stem=False, tokenize=True, no_multiples=True, use_pos=False)),
        ('Multiple Matches',
         dict(lowercase=True, stem=False, tokenize=True, no_multiples=False, use_pos=False)),
        ('Morphing',
         dict(lowercase=True, stem=True, tokenize=True, no_multiples=False, use_pos=False)),
        ('Grams',
         dict(lowercase=True, stem=True, tokenize=True, no_multiples=False, grams=True, use_pos=False)),
    )

    # The POS configuration needs a copy that has been tagged first.
    pos_configuration = dict(lowercase=True, stem=True, tokenize=True,
                             no_multiples=False, grams=True, use_pos=True)

    for inst in xc:
        # Only evaluate against instances that have a gold alignment.
        if get_trans_gloss_alignment(inst, aln_method=INTENT_ALN_MANUAL) is None:
            continue

        EVAL_LOG.debug('Running heuristic alignments on instance "{}"'.format(inst.id))

        # Every configuration works on its own fresh copy of the instance.
        for label, settings in plain_configurations:
            result = heur_align_inst(copy_xigt(inst), **settings)
            mas.add_alignment(label, lang, inst.id, result)

        tagged_copy = copy_xigt(inst)
        classify_gloss_pos(tagged_copy, classifier_obj)
        tag_trans_pos(tagged_copy, tagger_obj)
        result = heur_align_inst(tagged_copy, **pos_configuration)
        mas.add_alignment('POS', lang, inst.id, result)
def enrich(**kwargs):
    """
    Load a Xigt corpus, add the requested enrichments to each instance, and
    write the corpus back out.

    Depending on the options given, this tags and/or parses the translation
    line, classifies the gloss line, adds heuristic and/or giza alignments,
    and projects POS tags and parse structures. Per-instance problems are
    collected as short reason codes and reported in the log; they do not stop
    the run.

    Keyword arguments read by this function:

    :param ARG_OUTFILE: required; either an object with a ``write`` attribute
        or a value handed to ``writefile``.
    :param ARG_INFILE: path of the input file, opened as UTF-8 text.
    :param PARSE_VAR: requested parse methods (default: none).
    :param POS_VAR: requested POS methods (default: none).
    :param ALN_VAR: requested alignment methods (default: none).
    :param ALN_SYM_VAR: symmetrization passed to ``giza_align_t_g``
        (default ``SYMMETRIC_INTERSECT``).
    :param max_parse_length: translations longer than this are not parsed and
        the instance is flagged ``OVER_MAX_LENGTH`` (default 10).
    :param proj_aln: key into ``ALN_ARG_MAP`` selecting the alignment used for
        projection (default ``ARG_ALN_ANY``).
    :param class_path: read, but currently not used (see note below).

    Exits the process with status 2 if no output file is specified, if the
    tagger cannot be initialised, or on a critical tagger error.
    :raises GizaAlignmentException: re-raised after logging if giza alignment fails.
    """
    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        # A missing output file is a failure, so exit with a non-zero status
        # (a bare sys.exit() would report success to the calling shell).
        sys.exit(2)

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================
    # NOTE(review): class_path is read here but never used below; the module
    # level ``classifier`` is always what gets passed to MalletMaxent. Confirm
    # whether the alternate path was meant to replace it.
    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    # ``or []`` also covers the case where the key is present with a None
    # value, which would otherwise break the membership tests below.
    parse_args = kwargs.get(PARSE_VAR) or []
    pos_args = kwargs.get(POS_VAR) or []
    aln_args = kwargs.get(ALN_VAR) or []

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    # These conditions do not change per instance, so evaluate them once.
    need_tagger = (ARG_POS_PROJ in pos_args
                   or ARG_POS_TRANS in pos_args
                   or ARG_ALN_HEURPOS in aln_args)
    need_parser = ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args
    need_classifier = ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    # -------------------------------------------
    # Define the reasons for failure
    # -------------------------------------------
    F_GLOSS_LINE = "NOGLOSS"
    F_LANG_LINE = "NOLANG"
    F_TRANS_LINE = "NOTRANS"
    F_BAD_LINES = "BADLINES"
    F_L_G_ALN = "L_G_ALIGN"
    F_T_G_ALN = "G_T_ALIGN"
    F_NO_TRANS_POS = "NO_POS_TRANS"
    F_PROJECTION = "PROJECTION"
    F_UNKNOWN = "UNKNOWN"
    F_PARSELEN = "OVER_MAX_LENGTH"

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        # The corpus is loaded with mode=INCREMENTAL, so all processing stays
        # inside this ``with`` block to keep the input handle open.
        # NOTE(review): assumes incremental loading reads lazily from in_f.
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if need_tagger:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #   A) "trans" option is given for parse
        #   B) "proj" option is given for parse.
        # -------------------------------------------
        sp = None
        if need_parser:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #   A) "class" option is given for pos
        #   B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        p = None
        if need_classifier:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)

        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True,
                                   symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False,
                                   symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                giza_log = logging.getLogger('giza')
                giza_log.critical(str(gae))
                raise

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------
        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                """Record a (de-duplicated) failure reason and mark the instance WARN."""
                nonlocal inst_status
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                """Mark the instance OK."""
                nonlocal inst_status
                inst_status = 'OK'

            # Any unexpected error while processing one instance is logged at
            # debug level and reported as UNKNOWN; the run then continues.
            # SystemExit is not an Exception subclass, so the sys.exit() calls
            # below still terminate the process.
            try:
                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    """Return func(inst), or None if the instance has no such normalized line."""
                    try:
                        return func(inst)
                    except NoNormLineException:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = has_gl and has_tl and has_ll

                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if need_tagger:
                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if need_parser:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if need_classifier:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------
                if has_all:
                    # Looked up per instance on purpose: a bad 'proj_aln'
                    # value is then reported as UNKNOWN for the instance
                    # rather than aborting the whole run.
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)

                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # NOTE(review): both projection steps are placed under
                        # this "alignment found" branch; the nesting was
                        # reconstructed from a whitespace-collapsed source.

                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                ENRICH_LOG.debug(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()

            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        outfile = kwargs.get(ARG_OUTFILE)
        if hasattr(outfile, 'write'):
            # Already a writable object: dump straight into it.
            xigtxml.dump(outfile, corp)
        else:
            # NOTE(review): the object returned by writefile() is never
            # explicitly closed here; confirm whether dump() or writefile()
            # takes care of that.
            xigtxml.dump(writefile(outfile), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
def evaluate_pos_projections_on_file(lang, xc, plma, pos_proj_matrix, tagger, gold_tagmap=None, trans_tagmap=None, outstream=sys.stdout):
    """
    Score projected gloss POS tags against manual gloss tags for every
    instance in ``xc`` that has manual gloss tags, manual translation tags and
    a manual alignment.

    For each combination of alignment method (manual, heuristic) and
    translation tag source (manual, tagger), the translation tags are
    projected onto the gloss line and compared with the gold tags. Each
    gold/projected pair is added to ``pos_proj_matrix`` and the match/compare
    counts are added to ``plma`` under the key ``"<aln_method>:<tag_method>"``.
    ``plma`` is then written to ``outstream``.

    :type plma: PerLangMethodAccuracies
    :type pos_proj_matrix: POSMatrix
    :return: a new ``XigtCorpus`` created with ``xc``'s id; nothing is added
        to it by this function.
    """
    def remap(tagmap, tag):
        """Return ``tag`` mapped through ``tagmap``; keep it if there is no map or it cannot be remapped."""
        if tagmap is None:
            return tag
        try:
            return tagmap.get(tag)
        except TagMapException:
            return tag

    new_xc = XigtCorpus(xc.id)

    alignment_methods = [INTENT_ALN_MANUAL, INTENT_ALN_HEUR]
    trans_tag_methods = [INTENT_POS_MANUAL, INTENT_POS_TAGGER]

    for inst in xc:
        gold_tier = gloss_tag_tier(inst, INTENT_POS_MANUAL)
        manual_trans_tier = trans_tag_tier(inst, INTENT_POS_MANUAL)
        manual_aln = get_trans_gloss_alignment(inst, INTENT_ALN_MANUAL)

        # Only continue if we have manual gloss tags, trans tags, and manual alignment.
        required = (gold_tier, manual_aln, manual_trans_tier)
        if any(item is None for item in required):
            continue

        # Get the heuristic alignment. The return value is not used here;
        # presumably the call matters for what it adds to the instance.
        heur_align_inst(inst)

        # And tag the translation line.
        tag_trans_pos(inst, tagger=tagger)

        # Now, iterate through each alignment method and set of tags.
        for aln_method in alignment_methods:
            for trans_tag_method in trans_tag_methods:
                project_trans_pos_to_gloss(inst, aln_method=aln_method,
                                           trans_tag_method=trans_tag_method)
                projected_tier = gloss_tag_tier(inst, tag_method=INTENT_POS_PROJ)

                # Go through each word in the gloss line and, if it has a gold
                # tag, check whether the projected tag was correct.
                matches = 0
                compares = 0

                for gw in gloss(inst):
                    gold_tag = xigt_find(gold_tier, alignment=gw.id)
                    proj_tag = xigt_find(projected_tier, alignment=gw.id)

                    if gold_tag is None:
                        continue

                    # Remap the gold tag if asked...
                    gold_value = remap(gold_tagmap, gold_tag.value())

                    # NOTE(review): the projected tag is only remapped when one
                    # was found; placement reconstructed from collapsed source.
                    if proj_tag is None:
                        proj_value = '**UNK'
                    else:
                        proj_value = remap(trans_tagmap, proj_tag.value())

                    pos_proj_matrix.add(gold_value, proj_value)

                    if proj_tag is not None and proj_value == gold_value:
                        matches += 1

                    # NOTE(review): counted once per gold-tagged gloss word;
                    # placement reconstructed from collapsed source.
                    compares += 1

                method_key = '{}:{}'.format(aln_method, trans_tag_method)
                plma.add(lang, method_key, matches, compares)

    # NOTE(review): written once after all instances; placement reconstructed
    # from collapsed source.
    outstream.write('{}\n'.format(plma))
    return new_xc