def main(fstpath, path_in, path_out, processes=16):
    """Analyze every file matching ``path_in`` with a pool of worker processes.

    Each matched file is read in full, its lines are mapped through
    ``analyze_line`` (with the loaded FST bound as the first argument), and
    the per-line results are joined with newlines and written to a file of
    the same base name inside ``path_out``.

    Args:
        fstpath: Path passed to ``fstinter.FST`` together with ``get_cost``.
        path_in: Glob pattern selecting the input files.
        path_out: Directory that receives one output file per input file.
        processes: Number of worker processes in the pool. Defaults to 16,
            the value that used to be hard-coded.
    """
    # Using the pool as a context manager guarantees the workers are shut
    # down on normal return and when loading, analysis, or file I/O raises.
    # pool.map blocks until every result is back, so nothing is pending
    # when the pool is terminated on exit.
    with multiprocessing.Pool(processes=processes) as pool:
        fst = fstinter.FST(fstpath, get_cost)
        fst_analyze_line = partial(analyze_line, fst)
        for fnin in glob.glob(path_in):
            fnout = os.path.join(path_out, os.path.basename(fnin))
            with open(fnin, encoding='utf-8') as fin:
                lines = fin.readlines()
            out_data = '\n'.join(pool.map(fst_analyze_line, lines))
            with open(fnout, 'w', encoding='utf-8') as fout:
                fout.write(out_data)
def main(fstpath, path_in, path_out):
    """Run the FST over each file matched by ``path_in``, line by line.

    For every input file an output file with the same base name is created
    in ``path_out``. Each input line is stripped and split on whitespace,
    the resulting tokens are passed to ``fst.analyses``, and one output line
    is written containing a space-separated ``w:<token>~l:<lemma>~m:<props>``
    entry per analysis, where ``<props>`` is every morph after the first one
    joined with ``+``.

    Args:
        fstpath: Path passed to ``fstinter.FST`` together with ``get_cost``.
        path_in: Glob pattern selecting the input files.
        path_out: Directory that receives the output files.
    """
    analyzer = fstinter.FST(fstpath, get_cost)
    for src_path in glob.glob(path_in):
        print("Analyzing {}...".format(src_path))
        dst_path = os.path.join(path_out, os.path.basename(src_path))
        with open(src_path, encoding='utf-8') as src, \
                open(dst_path, 'w', encoding='utf-8') as dst:
            # The line count is only used to size the progress bar.
            progress = tqdm.tqdm(src, total=get_num_lines(src_path))
            for raw_line in progress:
                words = raw_line.strip().split()
                rendered = [
                    'w:{}~l:{}~m:{}'.format(
                        word,
                        lemma,
                        # Everything after the first morph counts as a
                        # property; fewer than two morphs means none.
                        '+'.join(morphs[1:] if len(morphs) >= 2 else []),
                    )
                    for word, (morphs, lemma) in analyzer.analyses(words)
                ]
                print(' '.join(rendered), file=dst)
def __init__(self, fstpath):
    """Load the FST at ``fstpath`` and keep it on the instance.

    The object is built by ``fstinter.FST`` with ``get_cost`` as its second
    argument and stored as ``self.fst``.

    Args:
        fstpath: Path passed through to ``fstinter.FST``.
    """
    loaded_fst = fstinter.FST(fstpath, get_cost)
    self.fst = loaded_fst