def extract_tagger_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Write one line of ``word/TAG`` tagger training data for an instance.

    The language-line words are paired with the tags found on the POS tag
    tier selected by ``pos_source``. Nothing is written when that tier is
    missing or empty.

    :param inst: Instance to take the language words and POS tags from.
    :param output_stream: Writable text stream that receives the line.
    :param pos_source: Passed to ``lang_tag_tier`` as ``tag_method`` to
                       select the POS tag tier.
    :param tm: Optional tag mapping; when truthy, every non-empty tag is
               replaced by ``tm[tag]``.
    :return: 1 if a line was written for this instance, otherwise 0.
    """
    lang_pos_tags = lang_tag_tier(inst, tag_method=pos_source)
    lang_words = lang(inst)

    # -------------------------------------------
    # Only try extracting if there are in fact valid POS tags.
    # -------------------------------------------
    if not lang_pos_tags:
        return 0

    tokens = []
    for lang_word in lang_words:
        lang_pos_tag = xigt_find(lang_pos_tags, alignment=lang_word.id)

        # Words without an aligned tag fall back to the unknown-POS handler.
        if lang_pos_tag is not None:
            tag_string = lang_pos_tag.value()
        else:
            tag_string = handle_unknown_pos(inst, lang_word)

        if tag_string and tm:
            tag_string = tm[tag_string]

        # -------------------------------------------
        # Do some cleaning on the output words
        # -------------------------------------------
        word_string = clean_lang_token(lang_word.value(), lowercase=True)

        tokens.append('{}/{}'.format(word_string, tag_string))

    # The whole sentence is written in a single call so that a failure while
    # processing one word cannot leave a partial, unterminated line behind.
    output_stream.write(' '.join(tokens) + '\n')
    output_stream.flush()

    return 1
def extract_sents_from_inst(inst: Igt, out_src, out_tgt, aln_method=None, no_alignment_heur=True, sent_type=SENT_TYPE_T_G):
    """
    Extract parallel sentences from an instance. Either:

    1) Translation--Gloss
    2) Translation--Language

    :param inst: Instance to extract the sentence pair from.
    :param out_src: Writable text stream for the source (translation) side.
    :param out_tgt: Writable text stream for the target (gloss or language)
                    side.
    :param aln_method: Forwarded to ``get_trans_aligned_wordpairs`` when
                       heuristic alignments are requested.
    :param no_alignment_heur: When False, the aligned word pairs are also
                              written, one pair per line, after the sentence.
    :param sent_type: ``SENT_TYPE_T_G`` (translation--gloss) or
                      ``SENT_TYPE_T_L`` (translation--language).
    :raises Exception: If ``sent_type`` is neither of the two supported values.
    """
    # -------------------------------------------
    # 1) Get the source string (translation)
    # -------------------------------------------
    src_str = tier_text(trans(inst), remove_whitespace_inside_tokens=True).lower()

    # -------------------------------------------
    # 2) Decide whether the target string is gloss or language.
    # -------------------------------------------
    if sent_type == SENT_TYPE_T_L:
        tgt_tier = lang(inst)
    elif sent_type == SENT_TYPE_T_G:
        tgt_tier = gloss(inst)
    else:
        raise Exception("Invalid sent type")
    tgt_str = tier_text(tgt_tier, remove_whitespace_inside_tokens=True).lower()

    # -------------------------------------------
    # 3) Write the choice out to disk.
    # -------------------------------------------
    out_src.write(src_str + '\n')
    out_tgt.write(tgt_str + '\n')
    out_src.flush()
    out_tgt.flush()

    # -------------------------------------------
    # 4) Add heuristic alignments, if asked for.
    # -------------------------------------------
    if not no_alignment_heur:
        pairs = get_trans_aligned_wordpairs(inst, aln_method=aln_method, add_align=True, sent_type=sent_type)
        for src_word, tgt_word in pairs:
            out_src.write(src_word.lower() + '\n')
            out_tgt.write(tgt_word.lower() + '\n')

        # The word pairs are written after the flush above, so flush again to
        # keep both streams in step with what has been emitted so far.
        out_src.flush()
        out_tgt.flush()
def extract_parser_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Given an IGT instance, extract the projected dependency structure from
    it (along with the POS tags from the given pos_source) and write it out
    in CoNLL format.

    :param inst: Input instance
    :param output_stream: The output stream to write the training data to.
    :param pos_source: Forwarded to ``get_lang_ds`` to choose the POS tags.
    :param tm: Tag mapping forwarded to ``to_conll`` as ``tagmap``.
    :return: 1 if a dependency structure was written, otherwise 0. Runtime
             and RGXigtException errors are logged and yield 0.
    """
    extracted = 0

    try:
        ds = get_lang_ds(inst, pos_source=pos_source, unk_pos_handling=None)
        if ds is not None:
            conll_string = to_conll(ds, lang(inst),
                                    lowercase=True, match_punc=True, clean_token=True,
                                    unk_pos='UNK', tagmap=tm)
            # A blank line separates consecutive sentences in the output.
            output_stream.write(conll_string + '\n\n')
            output_stream.flush()
            extracted += 1
    except RuntimeError as err:
        # Named `err` rather than `re` so the regex module's usual name is
        # not shadowed inside this function.
        print(err)
        EXTRACT_LOG.error("Runtime error in instance {}".format(inst.id))
    except RGXigtException as rgxe:
        # NOTE(review): assumes EXTRACT_LOG is a stdlib logging.Logger, whose
        # `warn` alias is deprecated in favour of `warning` -- confirm.
        EXTRACT_LOG.warning('Instance "{}" failed with "{}"'.format(inst.id, rgxe))

    return extracted