def add_part(self, dir_path: Path, entry: Entry, drop_noise=False):
    """Write the segment pairs of ``entry`` as two line-aligned text files.

    The source is fetched through ``self.cache.get_entry`` and read with
    ``Parser``. Each accepted pair contributes one line to each of the two
    output files, which are named ``<entry.name>-<lang1>_<lang2>.<lang>``
    inside ``dir_path``, one per element of ``self.langs``.

    Args:
        dir_path: Directory in which the two output files are created.
        entry: Dataset entry to read.
        drop_noise: When true, pairs for which ``entry.is_noisy`` returns a
            truthy value are skipped (and counted as noise).

    Returns:
        A ``(count, skips)`` tuple: pairs written and pairs skipped.

    Raises:
        AssertionError: If no valid pair was written.
    """
    src_path = self.cache.get_entry(entry)
    flip = entry.is_swap(self.langs)
    reader = Parser(src_path, langs=self.langs, ext=entry.in_ext or None, ent=entry)

    lang_tag = '_'.join(self.langs)
    stem = dir_path / f'{entry.name}-{lang_tag}'
    # NOTE(review): with_suffix() replaces whatever follows the last '.' in
    # the file name, so a dotted entry.name would be truncated here -- confirm
    # that entry names never contain dots.
    out1 = stem.with_suffix(f'.{self.langs[0]}')
    out2 = stem.with_suffix(f'.{self.langs[1]}')

    open_kwargs = {'mode': 'w', 'encoding': 'utf-8', 'errors': 'ignore'}
    # Embedded newlines/tabs become spaces so that one pair is exactly one line.
    flatten = str.maketrans({'\n': ' ', '\t': ' '})

    with out1.open(**open_kwargs) as sink1, out2.open(**open_kwargs) as sink2:
        n_good = n_skip = n_noise = 0
        for rec in reader.read_segs():
            pair = rec[:2]  # only the first two fields are used
            if len(pair) != 2:
                n_skip += 1
                continue
            # The noise check sees the segments as read, before stripping.
            if drop_noise and entry.is_noisy(seg1=pair[0], seg2=pair[1]):
                n_skip += 1
                n_noise += 1
                continue
            first, second = (seg.strip() for seg in pair)
            if not (first and second):
                n_skip += 1
                continue
            if flip:
                first, second = second, first
            sink1.write(first.translate(flatten) + '\n')
            sink2.write(second.translate(flatten) + '\n')
            n_good += 1

        summary = f'Looks like an error. {n_good} segs are valid {n_skip} are invalid: {entry}'
        assert n_good > 0, summary
        if n_skip > n_good:
            log.warning(summary)
        if n_noise > 0:
            log.info(
                f"{entry}: Noise : {n_noise:,}/{n_good:,} => {100*n_noise/n_good:.4f}%"
            )
        log.info(f"wrote {n_good} lines to {out1} == {out2}")
    return n_good, n_skip
def get_stats(self, entry: Entry):
    """Compute segment, token and character counts for ``entry``.

    The entry is fetched with ``self.get_entry`` and read with ``Parser``.
    Records with fewer than two fields or an empty first/second field are
    counted as errors. Records for which ``entry.is_noisy`` returns a truthy
    value are counted as both noise and errors. Tokens are counted by
    splitting each segment on whitespace.

    Args:
        entry: Dataset entry to analyse.

    Returns:
        A dict with the keys ``id``, ``segs``, ``segs_err``, ``segs_noise``
        and, for each of the two languages, ``<lang>_toks`` and
        ``<lang>_chars``. The language that compares smaller under ``<`` is
        listed first.

    Raises:
        AssertionError: If no valid record was found.
    """
    path = self.get_entry(entry)
    parser = Parser(path, ext=entry.in_ext or None, ent=entry)
    count, skips, noise = 0, 0, 0
    toks = [0, 0]
    chars = [0, 0]
    for rec in parser.read_segs():
        if len(rec) < 2 or not rec[0] or not rec[1]:
            skips += 1
            continue
        if entry.is_noisy(seg1=rec[0], seg2=rec[1]):
            noise += 1
            skips += 1
            continue
        count += 1
        s1, s2 = rec[:2]  # only the first two fields are used
        chars[0] += len(s1)
        chars[1] += len(s2)
        toks[0] += len(s1.split())
        toks[1] += len(s2.split())

    l1, l2 = entry.did.langs
    l1, l2 = l1.lang, l2.lang
    assert count > 0, f'No valid records are found for {entry.did}'
    if l2 < l1:
        # Report languages in sorted order; keep the counters aligned with them.
        l1, l2 = l2, l1
        toks = toks[1], toks[0]
        chars = chars[1], chars[0]
    return {
        'id': str(entry.did),
        'segs': count,
        'segs_err': skips,
        'segs_noise': noise,
        f'{l1}_toks': toks[0],
        f'{l2}_toks': toks[1],
        f'{l1}_chars': chars[0],
        # Fix: this previously reported chars[0], i.e. the first language's count.
        f'{l2}_chars': chars[1],
    }