def slice_lines_grouped_by_n(inf, outf, *, n, s):
    """Write a selection from each n-line group of *inf* to *outf*.

    Parameters:
        inf: iterable of text lines (e.g. an open file).
        outf: writable text file object.
        n: group size forwarded to ``tools.group_n_lines``.
        s: either a ``slice`` (the selected lines of each group are joined
           and written) or an ``int`` index (that single line is written).

    Raises:
        TypeError: if *s* is neither a slice nor an int.
    """
    for lines in tools.group_n_lines(inf, n=n):
        # isinstance instead of `type(x) == T`: correct idiom for type checks.
        if isinstance(s, slice):
            outf.write(''.join(lines[s]))
        elif isinstance(s, int):
            outf.write(lines[s])
        else:
            # Input validation should raise a real error type, not a bare
            # AssertionError (which also disappears under `python -O`).
            raise TypeError('s must be a slice or an int, got {!r}'.format(type(s)))
def run(self):
    """Split each 3-line input group into parallel src/tgt output files."""
    outputs = self.output()
    with self.input().open('r') as source_file, \
            outputs['src'].open('w') as src_file, \
            outputs['tgt'].open('w') as tgt_file:
        # Groups come as (source line, target line, separator); the
        # separator line is discarded.
        for src_line, tgt_line, _sep in tools.group_n_lines(source_file, n=3):
            src_file.write(src_line)
            tgt_file.write(tgt_line)
def ngram_pairs_from_lines(lines):
    """Yield ngram pairs from 3-line (zh, tag, separator) groups in *lines*."""
    group_no = 0
    for zh_line, tag_line, _sep in tools.group_n_lines(lines, n=3):
        group_no += 1
        if group_no % 100000 == 0:
            print('{:,}'.format(group_no))  # progress indicator
        yield from task.ngram_pairs(zh_line.split(), tag_line.split())
def run(self):
    """Preprocess the zh side of each (zh, en, sep) group and rewrite the pair."""
    with self.input().open('r') as reader, self.output().open('w') as writer:
        for zh_line, en_line, _sep in tools.group_n_lines(reader, n=3):
            zh_sent = tools.zhsent_preprocess(zh_line.strip())
            en_sent = en_line.strip()
            # Trailing blank print keeps the 3-line group layout on output.
            print(zh_sent, file=writer)
            print(en_sent, file=writer)
            print(file=writer)
def run(self):
    """Route (en, ch) sentence pairs into vp/pure_np/np outputs by chunk tag."""
    from geniatagger import GeniaTaggerClient
    tagger = GeniaTaggerClient()
    targets = self.output()
    with self.input().open('r') as pairs_file, \
            targets['np'].open('w') as np_file, \
            targets['vp'].open('w') as vp_file, \
            targets['pure_np'].open('w') as pure_np_file:
        for en_line, ch_line in tools.group_n_lines(pairs_file, n=2):
            en_sent = en_line.strip()
            ch_sent = ch_line.strip()
            tagged = tagger.parse(en_sent)
            chunk_tags = [word_data[3] for word_data in tagged]
            # Sentence-initial verb phrase -> vp; no verb phrase anywhere ->
            # pure_np; otherwise a VP occurs mid-sentence -> np.
            if chunk_tags[0] == 'B-VP':
                destination = vp_file
            elif 'B-VP' not in chunk_tags:
                destination = pure_np_file
            else:
                destination = np_file
            print(en_sent, file=destination)
            print(ch_sent, file=destination)
            print(*('/'.join(word_data) for word_data in tagged), file=destination)
            print(file=destination)
def run(self):
    """Score aligned en/ch sentence pairs and write TSV rows.

    Each 2-line input group is an (en, ch) paragraph pair.  Both sides are
    split into sentences; groups whose sentence counts differ are reported
    on stderr and skipped.  A matching group gets one score — the summed
    per-sentence ``translate_score`` normalized by the English word count —
    written once per aligned sentence pair as: score<TAB>en<TAB>ch.
    """
    from nltk.tokenize import sent_tokenize
    from nltk.tokenize import RegexpTokenizer
    ch_sent_tokenize = RegexpTokenizer('(?:[^。「」!?]*(「[^」]*」)?[^。「」!?]*)+[。!?;]?').tokenize
    import sys
    with self.input().open('r') as input_file, self.output().open('w') as output_file:
        for en, ch in tools.group_n_lines(input_file, n=2):
            en, ch = en.strip(), ch.strip()
            ens = sent_tokenize(en)
            chs = [sub_ch for sub_ch in ch_sent_tokenize(ch) if sub_ch != '']
            if len(ens) != len(chs):
                print('Unmatched sentences length:', ens, chs, file=sys.stderr)
                continue
            # Guard: a blank English line passes the length check (both
            # sides tokenize to []) but len(en.split()) == 0 would raise
            # ZeroDivisionError in the normalization below.
            if not en.split():
                continue
            # NOTE: the genexp's (en, ch) are scoped to the genexp; the
            # divisor uses the outer, whole-paragraph `en`.
            score = sum(translate_score(en, ch) for en, ch in zip(ens, chs)) / len(en.split())
            for en, ch in zip(ens, chs):
                print(score, en, ch, sep='\t', file=output_file)
def transformat_line2slash(inf, outf):
    """Convert 3-line (tokens, tags, separator) groups into word/tag lines."""
    for token_line, tag_line, _sep in tools.group_n_lines(inf, n=3):
        tokens = token_line.strip().split()
        tags = tag_line.strip().split()
        pairs = ['{}/{}'.format(token, tag) for token, tag in zip(tokens, tags)]
        print(*pairs, file=outf)