def main(args):
    """Build a pruned, per-source-normalized lexicon from a lexical table.

    Reads ``args.lex_table``, merges duplicate (src, tgt) pairs by summing
    their probabilities, keeps the top-3 targets per source word with
    probabilities scaled relative to the best one, drops negligible entries,
    and writes "src prob tgt" lines to ``args.lexicon_path``.
    """
    raw_lines = read_file_to_lines(args.lex_table, args.unicode_escape)
    parsed = (parse_line_to_entry(raw, delimiter=args.delimiter)
              for raw in raw_lines)
    entries = [entry for entry in parsed if entry]

    def _collapse(group):
        # Sum the probabilities of entries sharing the same (src, tgt) pair.
        members = list(group)
        head = members[0]
        return LexEntry(head.src, head.tgt, sum(m.prob for m in members))

    def _prune(group):
        # Keep the 3 most probable targets, rescale by the best probability,
        # and drop entries whose scaled probability is negligible.
        top = sorted(group, key=lambda e: e.prob, reverse=True)[:3]
        best = top[0].prob
        rescaled = [LexEntry(e.src, e.tgt, e.prob * (1 / best)) for e in top]
        return [e for e in rescaled if e.prob > 1e-5]

    # groupby requires its input sorted on the grouping key; sorting the
    # entry tuples orders by (src, tgt) first, which is exactly that key.
    by_pair = groupby(sorted(entries), key=lambda e: (e.src, e.tgt))
    merged = [_collapse(group) for _, group in by_pair]

    by_src = groupby(sorted(merged), key=lambda e: e.src)
    pruned = flatten(_prune(group) for _, group in by_src)

    out_lines = (f"{e.src} {e.prob} {e.tgt}" for e in pruned)
    write_lines_to_file(args.lexicon_path, out_lines)
def from_moses(cls, moses_path, unicode_escape):
    """Build a lexicon from a Moses-format table.

    Args:
        moses_path: Path to the Moses table whose columns are separated
            by the literal ``|||`` delimiter.
        unicode_escape: Forwarded to ``read_file_to_lines`` (controls
            unicode-escape decoding of the input).

    Returns:
        An instance of ``cls`` over the successfully parsed entries
        (lines that parse to a falsy value are dropped).
    """
    lines = read_file_to_lines(moses_path, unicode_escape)
    # The delimiter is treated as a regex, so each '|' must be escaped.
    # Use a raw string: '\|' in a plain string is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+), though its value is the
    # same, so behavior is unchanged.
    parsed = [
        MosesHelper.parse_line_to_entry(line, delimiter=r'\|\|\|')
        for line in lines
    ]
    return cls([entry for entry in parsed if entry])
def from_kaldi(cls,
               lexicon_path: str,
               with_prob: bool = False,
               sum_dup_pron_probs: bool = True):
    """Build a lexicon from a Kaldi-style lexicon file.

    Each line is ``word [prob] phone phone ...``. When ``with_prob`` is
    true the second column is read as a probability and stored as its
    natural log; otherwise the log-probability defaults to 0.0 (i.e.
    probability 1).

    Args:
        lexicon_path: Path to the Kaldi lexicon file.
        with_prob: Whether the file carries a probability column.
        sum_dup_pron_probs: Forwarded to the ``cls`` constructor.

    Returns:
        An instance of ``cls`` over the parsed entries.
    """
    raw_lines = read_file_to_lines(lexicon_path)

    def _to_entry(raw):
        fields = raw.split()
        if not with_prob:
            return LexiconEntry(fields[0], 0.0, " ".join(fields[1:]))
        return LexiconEntry(fields[0],
                            np.log(float(fields[1])),
                            " ".join(fields[2:]))

    return cls(map(_to_entry, raw_lines), sum_dup_pron_probs)
# chinese = converter.convert(chinese) # new_sent = sent[:start] + chinese + sent[:end] # return convert_arabic_number_to_chinese(new_sent) # else: # return sent parser = argparse.ArgumentParser() parser.add_argument('--lexicon-path') parser.add_argument('--input-path') parser.add_argument('--output-path') parser.add_argument('--filtered-output-path') args = parser.parse_args() lexicon = parse_lexicon(args.lexicon_path) lines = read_file_to_lines(args.input_path) texts = [" ".join(line.split()[2:]) for line in lines] cnt = 0 cutted_sents = [] valid_line_nums = [] for idx, text in enumerate(texts): sents = re.split("\s+", text) words = [] segmentable = True for sent in sents: sent = re.sub("[^\u4e00-\u9fa5A-Za-z0-9]", "", sent) sent = cn2an.transform(sent, "an2cn") sent = converter.convert(sent) maybe_words = dict_seg(sent, lexicon) if not maybe_words:
moses_config = MosesConfig(True, True, args.n_best) moses_client = MosesClient(port=args.mosesserver_port, config=moses_config) word_seg = None if "dict" in model_types: from tsm.ckip_wrapper import CKIPWordSegWrapper cutter = CKIPWordSegWrapper(args.ckip_path, dict_lexicon, not args.recommend_dictionary) seq2seq_translator = None if 'seq2seq' in model_types or 'seq2seq' in args.unk_consult_order: seq2seq_translator = AllennlpClient() unk_translator = UnkTranslator(prob_lexicon, dict_lexicon, taibun_lexicon, args.unk_consult_order, seq2seq_translator) maybe_process_unk = maybe_process_unk_factory(unk_translator) lines = read_file_to_lines(args.src_path) outf = open(args.dest_path, 'w') oovs = [] for line in tqdm.tqdm(lines): utt_id = None if args.has_utt_id: fields = line.split() utt_id = fields[0] line = " ".join(fields[1:]) src_sent = Sentence.parse_mixed_text(line, remove_punct=True) all_entries = [] if "dict" in model_types: maybe_sents = cutter.cut("".join(src_sent)) n_best = math.ceil( min(args.n_best, math.exp(math.log(1000) / len(maybe_sents))))
parser = argparse.ArgumentParser() parser.add_argument('input_file') parser.add_argument('map_file') parser.add_argument('output_file') parser.add_argument('--col', type=int, help="starting from which column") parser.add_argument('--delimiter', default="\s+") args = parser.parse_args() def line2word_syls(line): cols = re.split(args.delimiter, line) return cols[:args.col], list(filter(lambda col: col, cols[args.col:])) syl_lines = read_file_to_lines(args.input_file) syl_lexicon = list(map(line2word_syls, syl_lines)) def line2syl_phn(line): idx = line.index(" ") return line[:idx], line[idx + 1:] map_lines = read_file_to_lines(args.map_file) mapping = dict(map(line2syl_phn, map_lines)) def map_syltone(syl): tone = int(syl[-1]) phns = mapping[syl[:-1]]