def read_tmx(path: Union[Path, str], langs=None):
    """
    Reads a TMX file as records.
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes e.g. (de, en); when None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    passes = 0
    fails = 0
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning("langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(f"Language autodetect for TMX only supports 2 languages,"
                                    f" but provided with {lang_seg.keys()} in TMX {path}")
            if langs[0] in lang_seg and langs[1] in lang_seg:
                yield lang_seg[langs[0]], lang_seg[langs[1]]
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}--{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
def parse_tmx(data, n_langs=2, log_every=DEF_PROGRESS):
    context = ET.iterparse(data, events=['end'])
    tus = (el for event, el in context if el.tag == 'tu')
    count, skips = 0, 0
    st = t = time.time()
    for tu in tus:
        langs, segs = [], []
        for tuv in tu.findall('tuv'):
            lang = [v for k, v in tuv.attrib.items() if k.endswith('lang')]
            if lang:
                langs.append(lang[0])
            seg = tuv.findtext('seg')
            if seg:
                segs.append(unescape(seg.strip()))
        if n_langs and len(segs) == len(langs) == n_langs:
            count += 1
            yield list(zip(langs, segs))
        else:
            skips += 1
            log.warning(f"Skipped: langs {langs} segs {len(segs)} ; Parsed count {count}")
        if log_every and (time.time() - t) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - st))
            log.info(f"{elapsed} :: Parsed: {count:,} Skipped:{skips:,}")
            t = time.time()
        tu.clear()
    log.info(f"Skipped ={skips}; parsed: {count}")
def parse_tmx(data, log_every=DEF_PROGRESS):
    context = ET.iterparse(data, events=['end'])
    tus = (el for event, el in context if el.tag == 'tu')
    count = 0
    st = t = time.time()
    for tu in tus:
        lang_seg = {}
        for tuv in tu.findall('tuv'):
            lang = [v for k, v in tuv.attrib.items() if k.endswith('lang')]
            seg = tuv.findtext('seg')
            if lang and seg:
                lang = iso3_code(lang[0], fail_error=True)
                seg = unescape(seg.strip()).replace('\n', ' ').replace('\t', ' ')
                if lang in lang_seg:
                    log.warning(f"Language {lang} appears twice in the same translation unit.")
                lang_seg[lang] = seg
        yield lang_seg
        count += 1
        if log_every and (time.time() - t) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - st))
            log.info(f"{elapsed} :: Parsed: {count:,}")
            t = time.time()
        tu.clear()
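# Usage sketch (the file name below is hypothetical, not part of the original code):
# parse_tmx yields one {lang: seg} dict per <tu> element, so a caller can stream a large
# TMX file without loading it fully into memory.
def _example_parse_tmx(tmx_path='sample.tmx'):
    with IO.reader(tmx_path) as data:
        for lang_seg in parse_tmx(data):
            # e.g. {'deu': 'Hallo Welt', 'eng': 'Hello world'}
            print(lang_seg)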
def get_recipe(recipe_id, out_dir: Path, compress=False, drop_dupes=False, drop_tests=False,
               fail_on_error=False, n_jobs=DEF_N_JOBS, merge_train=True, **kwargs):
    if kwargs:
        log.warning(f"Args are ignored: {kwargs}")
    from mtdata.recipe import RECIPES
    recipe = RECIPES.get(recipe_id)
    if not recipe:
        raise ValueError(f'recipe {recipe_id} not found. See "mtdata list-recipe"')
    get_data(langs=recipe.langs, train_dids=recipe.train, dev_dids=recipe.dev,
             test_dids=recipe.test, merge_train=merge_train, out_dir=out_dir,
             compress=compress, drop_dupes=drop_dupes, drop_tests=drop_tests,
             fail_on_error=fail_on_error, n_jobs=n_jobs)
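# Usage sketch (the recipe id below is a placeholder, not a known recipe): get_recipe looks up
# a predefined recipe in mtdata.recipe.RECIPES and forwards its train/dev/test dataset ids to
# get_data for download and preparation.
def _example_get_recipe():
    get_recipe(recipe_id='my-recipe-id', out_dir=Path('data-out'), compress=True)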
def main(inp, out, langs):
    recs = read_tmx(inp, langs=langs)
    with IO.writer(out) as out:
        count = 0
        for rec in recs:
            rec = [l.replace('\t', ' ') for l in rec]
            out.write('\t'.join(rec) + '\n')
            count += 1
        log.warning(f"Wrote {count} lines to {out}")
def main(inp, out):
    segs = read_sgm(inp)
    with IO.writer(out) as out:
        count = 0
        for seg in segs:
            seg = seg.replace('\t', ' ')
            out.write(seg + '\n')
            count += 1
        log.warning(f"Wrote {count} lines to {out}")
def LangPair(string):
    parts = string.split('-')
    if len(parts) != 2:
        msg = f'expected value of form "xx-yy" e.g. "de-en"; given {string}'
        raise argparse.ArgumentTypeError(msg)
    iso_codes = [iso3_code(part, fail_error=True) for part in parts]
    if iso_codes != parts:
        log.warning(f"Suggestion: Use ISO 639-3 codes {'-'.join(iso_codes)} instead of {string}."
                    f" Let's make a little space for all 7000+ languages of our planet 😢.")
    return tuple(iso_codes)
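# Usage sketch: LangPair is intended as an argparse `type=` converter; it validates "xx-yy"
# strings and normalizes both sides to ISO 639-3 codes. The parser construction below is
# illustrative, not the project's actual CLI definition.
def _example_lang_pair_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--langs', type=LangPair, help='language pair, e.g. deu-eng')
    args = parser.parse_args(['-l', 'de-en'])
    return args.langs  # a tuple of two ISO 639-3 codes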
def add_part(self, dir_path: Path, entry: Entry, drop_noise=False, compress=False):
    flag_file = dir_path / f'.valid.{entry.did}'
    if flag_file.exists():
        log.info(f"{flag_file} exists. Skipping")
        return -1, -1
    path = self.cache.get_entry(entry)
    # swap = entry.is_swap(self.langs)
    parser = Parser(path, ext=entry.in_ext or None, ent=entry)
    # langs = '_'.join(str(lang) for lang in self.langs)
    # Check that files are written in correct order
    l1, l2 = self.get_paths(dir_path, entry, compress=compress)
    io_args = dict(encoding='utf-8', errors='ignore')
    with IO.writer(l1, **io_args) as f1, IO.writer(l2, **io_args) as f2:
        count, skips, noise = 0, 0, 0
        for rec in parser.read_segs():
            rec = rec[:2]  # get the first two recs
            if len(rec) != 2:
                skips += 1
                continue
            if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                skips += 1
                noise += 1
                continue
            sent1, sent2 = [s.strip() for s in rec]
            if not sent1 or not sent2:
                skips += 1
                continue
            sent1 = sent1.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
            sent2 = sent2.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
            f1.write(f'{sent1}\n')
            f2.write(f'{sent2}\n')
            count += 1
        msg = f'Looks like an error. {count} segs are valid; {skips} are invalid: {entry}'
        assert count > 0, msg
        if skips > count:
            log.warning(msg)
        if noise > 0:
            log.info(f"{entry}: Noise : {noise:,}/{count:,} => {100 * noise / count:.4f}%")
        log.info(f"wrote {count} lines to {l1} == {l2}")
    flag_file.touch()
    return count, skips
def __post_init__(self):
    if not isinstance(self.paths, list):
        self.paths = [self.paths]
    assert 1 <= len(self.paths) <= 2
    for p in self.paths:
        assert p.exists(), f'{p} does not exist'
    if not self.ext:
        exts = [detect_extension(p.name) for p in self.paths]
        if len(exts) == 2 and set(exts) == set(self.langs):
            log.warning(f"Treating {' .'.join(exts)} as plain text. To override: in_ext=<ext>")
            exts = ['txt']  # treat these as plain text
        assert len(set(exts)) == 1, f'Expected a single extension type, but found: {exts}'
        self.ext = exts[0]
def read_tmx(path: Union[Path, str], langs=None):
    """
    Reads a TMX file as records.
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes e.g. (de, en); when None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    passes = 0
    fails = 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(lang) for lang in langs]
        assert not BCP47Tag.are_compatible(*langs), f'{langs} expected to be different (/unambiguous)'
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning("langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(f"Language autodetect for TMX only supports 2 languages,"
                                    f" but provided with {lang_seg.keys()} in TMX {path}")
            seg1, seg2 = None, None
            for lang, seg in lang_seg.items():
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
                # else ignore
            if seg1 and seg2:  # both segs are found
                yield seg1, seg2
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
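# Usage sketch (the file name below is hypothetical): read_tmx streams (text1, text2) pairs.
# With langs given, segments are matched via BCP47Tag.are_compatible; otherwise the two
# languages are auto-detected from the first translation unit, as in the function above.
def _example_read_tmx():
    for src, tgt in read_tmx('sample.de-en.tmx', langs=('de', 'en')):
        print(f'{src}\t{tgt}')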
def add_part(self, dir_path: Path, entry: Entry, drop_noise=False):
    path = self.cache.get_entry(entry)
    swap = entry.is_swap(self.langs)
    parser = Parser(path, langs=self.langs, ext=entry.in_ext or None, ent=entry)
    langs = '_'.join(self.langs)
    l1 = (dir_path / f'{entry.name}-{langs}').with_suffix(f'.{self.langs[0]}')
    l2 = (dir_path / f'{entry.name}-{langs}').with_suffix(f'.{self.langs[1]}')
    mode = dict(mode='w', encoding='utf-8', errors='ignore')
    with l1.open(**mode) as f1, l2.open(**mode) as f2:
        count, skips, noise = 0, 0, 0
        for rec in parser.read_segs():
            rec = rec[:2]  # get the first two recs
            if len(rec) != 2:
                skips += 1
                continue
            if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                skips += 1
                noise += 1
                continue
            sent1, sent2 = [s.strip() for s in rec]
            if not sent1 or not sent2:
                skips += 1
                continue
            if swap:
                sent2, sent1 = sent1, sent2
            sent1 = sent1.replace('\n', ' ').replace('\t', ' ')
            sent2 = sent2.replace('\n', ' ').replace('\t', ' ')
            f1.write(f'{sent1}\n')
            f2.write(f'{sent2}\n')
            count += 1
        msg = f'Looks like an error. {count} segs are valid {skips} are invalid: {entry}'
        assert count > 0, msg
        if skips > count:
            log.warning(msg)
        if noise > 0:
            log.info(f"{entry}: Noise : {noise:,}/{count:,} => {100 * noise / count:.4f}%")
        log.info(f"wrote {count} lines to {l1} == {l2}")
    return count, skips
def __post_init__(self):
    if not isinstance(self.paths, list):
        self.paths = [self.paths]
    for p in self.paths:
        assert p.exists(), f'{p} does not exist'
    if not self.ext:
        exts = [detect_extension(p.name) for p in self.paths]
        if len(exts) == 2:
            log.warning(f"Treating {' .'.join(exts)} as plain text. To override: in_ext=<ext>")
            exts = ['txt']  # treat these as plain text
        assert len(set(exts)) == 1, f'Expected a single extension type, but found: {exts}'
        self.ext = exts[0]
    assert 1 <= len(self.paths)
    # tsv and tmx just concatenate all of them
    assert len(self.paths) <= 3 or self.ext == 'tmx' or self.ext == 'tsv'
def get_data(langs, out_dir, train_dids=None, test_dids=None, dev_dids=None, merge_train=False,
             compress=False, drop_dupes=False, drop_tests=False, fail_on_error=False,
             n_jobs=DEF_N_JOBS, **kwargs):
    if kwargs:
        log.warning(f"Args are ignored: {kwargs}")
    from mtdata.data import Dataset
    assert train_dids or test_dids, 'Required --train or --test or both'
    dataset = Dataset.prepare(langs, train_dids=train_dids, test_dids=test_dids, out_dir=out_dir,
                              dev_dids=dev_dids, cache_dir=CACHE_DIR, merge_train=merge_train,
                              compress=compress, drop_dupes=drop_dupes, drop_tests=drop_tests,
                              fail_on_error=fail_on_error, n_jobs=n_jobs)
    cli_sig = f'-l {"-".join(str(l) for l in langs)}'
    for flag, dids in [('-tr', train_dids), ('-ts', test_dids), ('-dv', dev_dids)]:
        if dids:
            cli_sig += f' {flag} {" ".join(map(str, dids))}'
    for flag, val in [('--merge', merge_train), ('--compress', compress), ('-dd', drop_dupes),
                      ('-dt', drop_tests)]:
        if val:
            cli_sig += ' ' + flag
    sig = f'mtdata get {cli_sig} -o <out-dir>\nmtdata version {mtdata.__version__}\n'
    log.info(f'Dataset is ready at {dataset.dir}')
    log.info(f'mtdata args for reproducing this dataset:\n {sig}')
    with IO.writer(out_dir / 'mtdata.signature.txt', append=True) as w:
        w.write(sig)
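# Usage sketch (the dataset ids below are placeholders, not verified entries): get_data prepares
# the requested train/dev/test datasets under out_dir and appends a reproducibility signature to
# mtdata.signature.txt in that directory.
def _example_get_data():
    get_data(langs=('deu', 'eng'), out_dir=Path('data-out'),
             train_dids=['<train-dataset-id>'], test_dids=['<test-dataset-id>'],
             merge_train=True, compress=True)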