def evaluate_lang_pos(lang, xc, e_tagger, pos_pla, gold_tagmap=None, outstream=sys.stdout):
    """
    Tag the language line of every instance in a corpus and score the
    resulting tags against the manually-assigned (gold) POS tags.

    :param lang: Not used by this function; kept for interface compatibility.
    :param xc: Iterable of instances to evaluate.
    :param e_tagger: Tagger handed to ``tag_lang_pos`` to produce the tags under evaluation.
    :param pos_pla: Receives one ``add(gold, predicted)`` call per comparison made.
    :type pos_pla: POSEvalDict
    :param gold_tagmap: Optional mapping applied (via ``get``) to each gold tag value
                        before it is compared.
    :param outstream: Not used by this function; kept for interface compatibility.
    :return: ``(matches, compares)`` -- the number of agreeing tags and the number of
             tags that were compared.
    :rtype: tuple
    """
    matches = 0
    compares = 0

    # Iterate through each instance in the corpus.
    for inst in xc:
        gold_tag_tier = lang_tag_tier(inst, INTENT_POS_MANUAL)

        # If there are no gold tags for this instance, skip it.
        if gold_tag_tier is None:
            continue

        # Create the eval tag tier and retrieve it.
        tag_lang_pos(inst, e_tagger)
        eval_tag_tier = lang_tag_tier(inst, INTENT_POS_TAGGER)

        # For each gold tag...
        for gold_tag in gold_tag_tier:
            gold_raw = gold_tag.value()

            # A gold tag without a value gives us nothing to evaluate against.
            if gold_raw is None:
                continue

            gold_tag_v = gold_raw
            if gold_tagmap:
                try:
                    gold_tag_v = gold_tagmap.get(gold_raw)
                except TagMapException:
                    # Best effort: fall back to the unmapped gold value.
                    pass

            if gold_tag_v == 'JUNK':
                continue

            # Find its matching tag on the eval side. Either the tier or the
            # aligned item may be missing; treat that as "no prediction"
            # rather than crashing on ``None.value()``.
            eval_tag = None
            if eval_tag_tier is not None:
                eval_tag = xigt_find(eval_tag_tier, alignment=gold_tag.alignment)
            eval_tag_v = eval_tag.value() if eval_tag is not None else None

            # A missing prediction can never count as a match, but it is
            # still a comparison (i.e. it counts against accuracy).
            if eval_tag_v is not None and gold_tag_v == eval_tag_v:
                matches += 1
            compares += 1

            # Note that the raw (unmapped) gold value is what gets recorded.
            pos_pla.add(gold_raw, eval_tag_v)

    return matches, compares
def extract_tagger_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Given an instance, retrieve the language-line words and POS tags and write
    them to ``output_stream`` as one line of space-separated ``word/TAG`` tokens.

    :param inst: Instance to extract the words and tags from.
    :param output_stream: Writable stream that receives the sentence; it is flushed
                          after the line is written.
    :param pos_source: Passed to ``lang_tag_tier`` as ``tag_method`` to select which
                       POS tag tier is used.
    :param tm: Optional tag map. When truthy, every non-empty tag is replaced
               by ``tm[tag]``.
    :return: 1 if a sentence was written, 0 if the instance had no usable POS tags.
    """
    lang_pos_tags = lang_tag_tier(inst, tag_method=pos_source)
    lang_words = lang(inst)

    # -------------------------------------------
    # Only try extracting if there are in fact valid POS tags.
    # -------------------------------------------
    if not lang_pos_tags:
        return 0

    tokens = []
    for lang_word in lang_words:
        lang_pos_tag = xigt_find(lang_pos_tags, alignment=lang_word.id)

        # Words with no aligned tag are delegated to handle_unknown_pos.
        if lang_pos_tag is not None:
            tag_string = lang_pos_tag.value()
        else:
            tag_string = handle_unknown_pos(inst, lang_word)

        if tag_string and tm:
            tag_string = tm[tag_string]

        # -------------------------------------------
        # Do some cleaning on the output words
        # -------------------------------------------
        word_string = clean_lang_token(lang_word.value(), lowercase=True)

        tokens.append('{}/{}'.format(word_string, tag_string))

    # Tokens are separated by single spaces, with no leading space.
    output_stream.write(' '.join(tokens) + '\n')
    output_stream.flush()

    return 1
def gather_gloss_pos_stats(inst, subword_dict, feat_list):
    """
    Given an instance, look up the tag aligned with each gloss word and record
    the (word, tag, previous word, next word) observation in ``subword_dict``,
    so that the counts can be filtered on later.

    :param inst: Instance to process.
    :type inst: RGIgt
    :param subword_dict: Collects the observations through its ``add_word_tag`` method.
    :type subword_dict: SubwordDict
    :param feat_list: Collection of enabled features. ``CLASS_FEATS_ALN`` must be
                      present for heuristic alignment to run and for anything
                      to be recorded.
    """
    # Grab the gloss POS tier, the language POS tier and the gloss words.
    gloss_pos = gloss_tag_tier(inst)
    lang_pos = lang_tag_tier(inst)
    gloss_words = gloss(inst)

    use_alignment = CLASS_FEATS_ALN in feat_list
    if use_alignment:
        heur_align_inst(inst)
        get_trans_glosses_alignment(inst, aln_method=INTENT_ALN_HEUR)

    # If there are POS tags on the language line but not the gloss line,
    # project them over and fetch the gloss POS tier again.
    if gloss_pos is None and lang_pos is not None:
        add_gloss_lang_alignments(inst)
        project_lang_to_gloss(inst)
        gloss_pos = gloss_tag_tier(inst)

    # Without a gloss POS tier there is nothing to process.
    if gloss_pos is None:
        return

    for idx, gloss_word in enumerate(gloss_words):
        # The lookup is made against the whole instance, keyed on the gloss word's id.
        aligned_tag = xigt_find(inst, alignment=gloss_word.id)
        if aligned_tag is None:
            continue

        # Lowercased neighbours; None at either edge of the tier.
        if idx > 0:
            prev_word = gloss_words[idx - 1].value().lower()
        else:
            prev_word = None

        if idx < len(gloss_words) - 1:
            next_word = gloss_words[idx + 1].value().lower()
        else:
            next_word = None

        if use_alignment:
            subword_dict.add_word_tag(gloss_word.value().lower(),
                                      aligned_tag.value(),
                                      prev_word,
                                      next_word)