Ejemplo n.º 1
0
Archivo: data.py Proyecto: kpu/mtdata
 def add_part(self, dir_path: Path, entry: Entry, drop_noise=False):
     """Write the segment pairs of ``entry`` to two line-aligned text files.

     The files are created under ``dir_path`` and named
     ``<entry.name>-<lang1>_<lang2>`` with one language code of ``self.langs``
     as the suffix of each file. Every kept pair contributes exactly one line
     to each file.

     :param dir_path: directory in which the two output files are created
     :param entry: dataset entry whose segments are read through ``Parser``
     :param drop_noise: when true, pairs flagged by ``entry.is_noisy`` are skipped
     :return: tuple ``(count, skips)`` of written and skipped records
     :raises AssertionError: when not a single valid pair was written
     """
     src_path = self.cache.get_entry(entry)
     swapped = entry.is_swap(self.langs)
     reader = Parser(src_path,
                     langs=self.langs,
                     ext=entry.in_ext or None,
                     ent=entry)
     # NOTE(review): with_suffix() replaces an existing suffix, so a '.' inside
     # entry.name would cut the stem short -- confirm names never contain dots.
     stem = dir_path / f"{entry.name}-{'_'.join(self.langs)}"
     out1 = stem.with_suffix(f'.{self.langs[0]}')
     out2 = stem.with_suffix(f'.{self.langs[1]}')
     open_args = {'mode': 'w', 'encoding': 'utf-8', 'errors': 'ignore'}

     def one_line(text):
         # Each segment must occupy exactly one line and contain no tab.
         return text.replace('\n', ' ').replace('\t', ' ')

     count = skips = noise = 0
     with out1.open(**open_args) as f1, out2.open(**open_args) as f2:
         for rec in reader.read_segs():
             pair = rec[:2]  # only the first two fields are used
             if len(pair) != 2:
                 skips += 1
                 continue
             if drop_noise and entry.is_noisy(seg1=pair[0], seg2=pair[1]):
                 # noisy records are counted as skips as well
                 noise += 1
                 skips += 1
                 continue
             first, second = (seg.strip() for seg in pair)
             if not (first and second):
                 skips += 1
                 continue
             if swapped:
                 first, second = second, first
             f1.write(f'{one_line(first)}\n')
             f2.write(f'{one_line(second)}\n')
             count += 1

         msg = f'Looks like an error. {count} segs are valid {skips} are invalid: {entry}'
         assert count > 0, msg
         if skips > count:
             # more records were rejected than kept
             log.warning(msg)
         if noise > 0:
             log.info(
                 f"{entry}: Noise : {noise:,}/{count:,} => {100*noise/count:.4f}%"
             )
         log.info(f"wrote {count} lines to {out1} == {out2}")
     return count, skips
Ejemplo n.º 2
0
    def get_stats(self, entry: Entry):
        """Compute segment, token and character counts for ``entry``.

        Records with fewer than two fields, or with an empty first or second
        field, are counted as errors. Records flagged by ``entry.is_noisy``
        are counted both as noise and as errors. Tokens are whitespace-split
        words; characters are ``len()`` of the raw segment.

        :param entry: dataset entry to read through ``Parser``
        :return: dict with keys ``id``, ``segs``, ``segs_err``, ``segs_noise``
            and, per language, ``<lang>_toks`` and ``<lang>_chars``; the two
            languages are reported in sorted order of their codes
        :raises AssertionError: when no valid record is found
        """
        path = self.get_entry(entry)
        parser = Parser(path, ext=entry.in_ext or None, ent=entry)
        count, skips, noise = 0, 0, 0
        toks = [0, 0]
        chars = [0, 0]
        for rec in parser.read_segs():
            if len(rec) < 2 or not rec[0] or not rec[1]:
                skips += 1
                continue
            if entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                noise += 1
                skips += 1
                continue
            count += 1
            s1, s2 = rec[:2]  # get the first two recs
            chars[0] += len(s1)
            chars[1] += len(s2)
            toks[0] += len(s1.split())
            toks[1] += len(s2.split())

        l1, l2 = entry.did.langs
        l1, l2 = l1.lang, l2.lang
        assert count > 0, f'No valid records are found for {entry.did}'
        if l2 < l1:
            # Report languages in sorted order; swap the counters to match.
            l1, l2 = l2, l1
            toks = toks[1], toks[0]
            chars = chars[1], chars[0]
        return {
            'id': str(entry.did),
            'segs': count,
            'segs_err': skips,
            'segs_noise': noise,
            f'{l1}_toks': toks[0],
            f'{l2}_toks': toks[1],
            f'{l1}_chars': chars[0],
            # Fixed: the second language previously reported chars[0].
            f'{l2}_chars': chars[1]
        }