def evaluate_classifier_on_instances(inst_list, classifier, feat_list, pos_class_matrix, gold_tagmap=None):
    """
    Classify the gloss line of every instance and score the predicted tags
    against the manually assigned gloss POS tags.

    Instances without a manual gloss POS tier are skipped, as are classifier
    tags that have no manual tag sharing the same alignment.

    :param inst_list: Iterable of instances to evaluate.
    :param classifier: Classifier object handed through to ``classify_gloss_pos``.
    :param feat_list: Collection of ``CLASS_FEATS_*`` flags selecting which
                      features are switched on for classification.
    :param pos_class_matrix: Object whose ``add(gold, predicted)`` method is
                             called once per compared token.
    :param gold_tagmap: Optional mapping (anything with ``get``) applied to the
                        gold tag before it is recorded and compared.
    :return: ``(matches, compares, accuracy)`` where accuracy is a percentage.
             When nothing could be compared the accuracy is ``0.0``.
    """
    # The POS dictionary is only loaded when one of the dictionary-based
    # features is requested; otherwise ``False`` is passed through.
    needs_posdict = (CLASS_FEATS_DICT in feat_list
                     or CLASS_FEATS_PDICT in feat_list
                     or CLASS_FEATS_NDICT in feat_list)
    posdict = load_posdict() if needs_posdict else False

    matches = 0
    compares = 0

    for inst in inst_list:
        sup_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_MANUAL)
        if sup_postier is None:
            continue

        # NOTE(review): the result was only used for an unused lookup in the
        # original; the call itself is kept in case it has side effects on
        # the instance -- TODO confirm and remove if it is a pure accessor.
        gloss(inst)

        classify_gloss_pos(inst, classifier,
                           posdict=posdict,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list)

        cls_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_CLASS)
        if cls_postier is None:
            # Nothing was produced by the classifier for this instance, so
            # there is nothing to compare.
            continue

        for cls_tag in cls_postier:
            sup_tag = xigt_find(sup_postier, alignment=cls_tag.alignment)
            if sup_tag is None:
                continue

            sup_tag_v = sup_tag.value()
            if gold_tagmap is not None:
                sup_tag_v = gold_tagmap.get(sup_tag_v)

            predicted = cls_tag.value()
            pos_class_matrix.add(sup_tag_v, predicted)
            if predicted == sup_tag_v:
                matches += 1
            compares += 1

    # Guard against ZeroDivisionError when no token could be compared.
    accuracy = matches / compares * 100 if compares else 0.0
    return matches, compares, accuracy
def write_out_gram_dict(subword_dict, feat_path, feat_list, threshold = 1):
    """
    Write every (subword, tag, context) entry of the gram dictionary to
    ``feat_path`` as one feature line, using ``write_gram``.

    :param subword_dict: Two-level mapping ``subword -> tag -> entry`` where
                         ``entry['contexts']`` is an iterable of
                         ``(prev_word, next_word)`` pairs.
    :type subword_dict: TwoLevelCountDict
    :param feat_path: Path of the feature file to create (overwritten, UTF-8).
    :param feat_list: Collection of ``CLASS_FEATS_*`` flags selecting which
                      features ``write_gram`` should emit.
    :param threshold: Not referenced by this function at present; retained so
                      existing callers keep working.
    """
    EXTRACT_LOG.log(NORM_LEVEL, 'Writing out svm-lite style features to "{}"...'.format(feat_path))

    # The context manager guarantees the file is closed even if loading the
    # dictionary or writing a gram raises.
    with open(feat_path, 'w', encoding='utf-8') as feat_file:

        # Load the posdict only if one of the dictionary features needs it.
        needs_posdict = (CLASS_FEATS_DICT in feat_list
                         or CLASS_FEATS_PDICT in feat_list
                         or CLASS_FEATS_NDICT in feat_list)
        posdict = load_posdict() if needs_posdict else False

        for subword in subword_dict.keys():
            for tag in subword_dict[subword].keys():

                # Write out the gram with this tag once per recorded context.
                for prev_word, next_word in subword_dict[subword][tag]['contexts']:
                    gold_token = GoldTagPOSToken(subword, goldlabel=tag)

                    # Vary the features depending on what is in the list.
                    write_gram(gold_token,
                               lowercase=True,
                               feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                               feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                               feat_suffix=CLASS_FEATS_SUF in feat_list,
                               feat_prefix=CLASS_FEATS_PRE in feat_list,
                               feat_has_number=CLASS_FEATS_NUM in feat_list,
                               feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                               feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                               feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                               feat_basic=CLASS_FEATS_SW in feat_list,
                               feat_dict=CLASS_FEATS_DICT in feat_list,
                               posdict=posdict,
                               next_gram=next_word,
                               prev_gram=prev_word,
                               output=feat_file)

    EXTRACT_LOG.log(NORM_LEVEL, 'Written')
def write_gram(token, **kwargs):
    """
    Write one token out either as a line of classifier features or as a
    ``gram/tag`` pair for the tagger.

    :param token: Object providing ``seq`` (the gram text) and ``goldlabel``
                  (its POS tag, may be empty).

    Recognised keyword arguments:

    * ``type`` -- ``'classifier'`` (default) or ``'tagger'``.
    * ``output`` -- writable stream, defaults to ``sys.stdout``.
    * ``posdict`` -- POS dictionary; loaded via ``env.load_posdict()`` when
      ``None``/absent. Any other falsy value disables dictionary features.
    * ``prev_gram`` / ``next_gram`` -- neighbouring grams, if any.
    * ``aln_labels`` -- labels written when ``feat_align`` is set.
    * ``lowercase`` -- lowercase the gram and its neighbours (default True).
    * ``feat_has_number``, ``feat_align``, ``feat_morph_num`` -- default False.
    * ``feat_suffix``, ``feat_prefix``, ``feat_prev_gram``,
      ``feat_prev_gram_dict``, ``feat_next_gram``, ``feat_next_gram_dict``,
      ``feat_basic``, ``feat_dict`` -- default True.
    * ``tag_f`` -- must be truthy for the tagger-style output to be written.

    Classifier output is only produced for tokens that carry a POS tag.
    """
    # Re-cast the kwargs as an argpasser.
    kwargs = ArgPasser(kwargs)

    output_type = kwargs.get('type', 'classifier')
    output = kwargs.get('output', sys.stdout)

    posdict = kwargs.get('posdict', None)
    if posdict is None:
        posdict = env.load_posdict()

    # Previous tag info
    prev_gram = kwargs.get('prev_gram')
    next_gram = kwargs.get('next_gram')

    # Get heuristic alignment
    aln_labels = kwargs.get('aln_labels', [])

    # ===========================================================================
    # Break apart the token...
    # ===========================================================================
    gram = token.seq
    pos = token.goldlabel

    # Lowercase only if asked for, consistent with prev/next gram handling.
    lower = kwargs.get('lowercase', True, bool)
    if gram and lower:
        gram = gram.lower()

    # Fix the various issues with the grams.
    gram = fix_gram(gram)

    # ===========================================================================
    # Do some cleaning on the gram....
    # ===========================================================================

    # Only take the first of two slashed grams
    gram = re.sub(r'(.*)?/(.*)', r'\1', gram)

    # Remove a single leading or trailing punctuation character
    gram = re.sub(r'^(\S+)[\-=:\[\(\]\)/\*]$', r'\1', gram)
    gram = re.sub(r'^[\-=:\[\(\]\)/\*](\S+)$', r'\1', gram)

    # ===========================================================================
    # Output the grams for a classifier
    #
    # NOTE! Only tokens that have an ASSIGNED pos tag will be written out this way!
    # ===========================================================================
    if output_type == 'classifier' and pos:
        output.write(pos)

        # =======================================================================
        # Get the morphemes
        # =======================================================================
        # Materialised once: the morphs are both counted and iterated below,
        # which would lose data if the tokenizer returned a one-shot iterator.
        morphs = list(intent.utils.token.tokenize_string(gram, intent.utils.token.morpheme_tokenizer))

        # =======================================================================
        # Gram cleaning....
        # =======================================================================
        # Replace the characters that cause the svmlight format issues.
        gram = gram.replace(':', '-')
        gram = gram.replace('#', '-')

        # =======================================================================
        # Is there a number
        # =======================================================================
        if re.search(r'[0-9]', gram) and kwargs.get('feat_has_number', False, bool):
            output.write('\thas-number:1')

        # =======================================================================
        # What labels is it aligned with
        # =======================================================================
        if kwargs.get('feat_align', False, bool):
            for aln_label in aln_labels:
                output.write('\taln-label-%s:1' % aln_label)

        # =======================================================================
        # Suffix
        # =======================================================================
        if kwargs.get('feat_suffix', True, bool):
            output.write('\tgram-suffix-3-%s:1' % gram[-3:])
            output.write('\tgram-suffix-2-%s:1' % gram[-2:])
            output.write('\tgram-suffix-1-%s:1' % gram[-1:])

        # =======================================================================
        # Prefix
        # =======================================================================
        if kwargs.get('feat_prefix', True, bool):
            output.write('\tgram-prefix-3-%s:1' % gram[:3])
            output.write('\tgram-prefix-2-%s:1' % gram[:2])
            output.write('\tgram-prefix-1-%s:1' % gram[:1])

        # =======================================================================
        # Number of morphs
        # =======================================================================
        if kwargs.get('feat_morph_num', False, bool):
            output.write('\t%d-morphs:1' % len(morphs))

        # =======================================================================
        # Previous gram
        # =======================================================================
        if prev_gram:
            prev_gram = prev_gram.lower() if lower else prev_gram

            # And then tokenize...
            for prev_tok in intent.utils.token.tokenize_string(prev_gram, intent.utils.token.morpheme_tokenizer):
                if kwargs.get('feat_prev_gram', True, bool):
                    output.write('\tprev-gram-%s:1' % fix_gram(prev_tok.seq))

                # Add prev dictionary tag
                if posdict and kwargs.get('feat_prev_gram_dict', True, bool) and prev_tok.seq in posdict:
                    prev_tags = posdict.top_n(prev_tok.seq)
                    output.write('\tprev-gram-dict-tag-%s:1' % prev_tags[0][0])

        # Write a "**NONE**" placeholder when there is no previous gram.
        elif kwargs.get('feat_prev_gram', True, bool):
            output.write('\tprev-gram-**NONE**:1')

        # =======================================================================
        # Next gram
        # =======================================================================
        if next_gram:
            next_gram = next_gram.lower() if lower else next_gram

            for next_tok in intent.utils.token.tokenize_string(next_gram, intent.utils.token.morpheme_tokenizer):
                # Gram itself
                if kwargs.get('feat_next_gram', True, bool):
                    output.write('\tnext-gram-%s:1' % fix_gram(next_tok.seq))

                # Add next dictionary tag
                if posdict and kwargs.get('feat_next_gram_dict', True, bool) and next_tok.seq in posdict:
                    next_tags = posdict.top_n(next_tok.seq)
                    output.write('\tnext-gram-dict-tag-%s:1' % next_tags[0][0])

        # Write a "**NONE**" placeholder when there is no next gram.
        elif kwargs.get('feat_next_gram', True, bool):
            output.write('\tnext-gram-**NONE**:1')

        # =======================================================================
        # Iterate through the morphs
        # =======================================================================
        for morph in morphs:
            # Just write the morph
            if kwargs.get('feat_basic', True, bool):
                output.write('\t%s:1' % morph.seq)

            # If the morph is a word in our dictionary, write the dictionary's
            # top tag for it, plus the runner-up when there is one.
            if posdict and morph.seq in posdict and kwargs.get('feat_dict', True, bool):
                top_tags = posdict.top_n(morph.seq)
                output.write('\ttop-dict-word-%s:1' % top_tags[0][0])
                if len(top_tags) > 1:
                    output.write('\tnext-dict-word-%s:1' % top_tags[1][0])

        output.write('\n')

    # ===========================================================================
    # If writing the gram out for the tagger...
    # ===========================================================================
    if output_type == 'tagger' and kwargs.get('tag_f'):
        output.write('%s/%s ' % (gram, pos))
def enrich(**kwargs):
    """
    Load a corpus, add the requested POS tags, parses and alignments to each
    instance, and write the enriched corpus out.

    Keyword arguments read by this function:

    * ``ARG_OUTFILE`` -- required; either an object with ``write`` or a value
      handed to ``writefile``. The process exits if it is missing.
    * ``ARG_INFILE`` -- path of the input file (opened as UTF-8).
    * ``PARSE_VAR`` / ``POS_VAR`` / ``ALN_VAR`` -- collections of requested
      parse, POS and alignment methods (each defaults to empty).
    * ``class_path`` -- optional alternate classifier path; when not given the
      module-level ``classifier`` is used.
    * ``max_parse_length`` -- translations longer than this are not parsed
      (default 10).
    * ``ALN_SYM_VAR`` -- symmetrisation passed to the giza aligner.
    * ``proj_aln`` -- key into ``ALN_ARG_MAP`` choosing the alignment used
      for projection.

    Per-instance problems are collected as short reason codes and logged;
    they do not abort processing. A tagger failure or a giza alignment
    failure does abort.
    """
    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        sys.exit()

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================
    class_path = kwargs.get('class_path')

    # ===========================================================================
    # Set up the different arguments...
    # ===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    # ===========================================================================
    # Sanity check the arguments.
    # ===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    # -------------------------------------------
    # Reasons an instance may be flagged
    # -------------------------------------------
    F_GLOSS_LINE = "NOGLOSS"
    F_LANG_LINE = "NOLANG"
    F_TRANS_LINE = "NOTRANS"
    F_BAD_LINES = "BADLINES"
    F_L_G_ALN = "L_G_ALIGN"
    F_T_G_ALN = "G_T_ALIGN"
    F_NO_TRANS_POS = "NO_POS_TRANS"
    F_PROJECTION = "PROJECTION"
    F_UNKNOWN = "UNKNOWN"
    F_PARSELEN = "OVER_MAX_LENGTH"

    ENRICH_LOG.log(1000, 'Loading input file...')

    # The corpus is loaded in INCREMENTAL mode, so all work on it is kept
    # inside the ``with`` block in case it is read lazily from the handle.
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #   A) "trans" option is given for parse
        #   B) "proj" option is given for parse.
        # -------------------------------------------
        sp = None
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #   A) "class" option is given for pos
        #   B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        p = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            # Honour the alternate classifier path when one was supplied,
            # otherwise fall back to the module-level default.
            m = mallet_maxent.MalletMaxent(class_path if class_path else classifier)

        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True,
                                   symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False,
                                   symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                giza_log = logging.getLogger('giza')
                giza_log.critical(str(gae))
                raise

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------
        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                # Record each reason once and mark the instance as a warning.
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            try:
                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    # A missing normalized line is reported as None.
                    try:
                        return func(inst)
                    except NoNormLineException:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = lambda: (has_gl and has_tl and has_ll)

                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------
                if has_all():
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # Deliberate best-effort: one bad instance must not stop the
                # whole run, so it is logged and flagged instead of re-raised.
                ENRICH_LOG.debug(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()

            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        outfile = kwargs.get(ARG_OUTFILE)
        if hasattr(outfile, 'write'):
            xigtxml.dump(outfile, corp)
        else:
            xigtxml.dump(writefile(outfile), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
def test_classify_pos_tags(self):
    """Classifying the fixture instance's gloss line yields the expected tags."""
    instance = self.igt
    model = MalletMaxent()
    posdict = load_posdict()

    predicted = classify_gloss_pos(instance, model, posdict=posdict)

    self.assertEqual(predicted, self.tags)