def add_train_entries(self, entries, merge_train=False):
    self.add_parts(self.train_parts_dir, entries, drop_noise=self.drop_train_noise)
    if not merge_train:
        return
    # merge
    l1, l2 = self.langs
    l1_files = list(self.train_parts_dir.glob(f"*.{l1}"))
    assert l1_files and len(l1_files) >= len(entries)
    l2_files = [l1_f.with_suffix(f".{l2}") for l1_f in l1_files]
    assert all(l2_f.exists() for l2_f in l2_files)
    log.info(f"Going to merge {len(l1_files)} files as one train file")
    counts = coll.defaultdict(int)
    of1 = self.dir / f'train.{l1}'
    of2 = self.dir / f'train.{l2}'
    of3 = self.dir / f'train.meta.gz'
    with IO.writer(of1) as w1, IO.writer(of2) as w2, IO.writer(of3) as w3:
        for if1, if2 in zip(l1_files, l2_files):
            # drop the trailing '.<lang>' extension; str.rstrip would strip a character set, not the suffix
            name = if1.name[: -len(f'.{l1}')]
            for seg1, seg2 in self.read_parallel(if1, if2):
                w1.write(seg1 + '\n')
                w2.write(seg2 + '\n')
                w3.write(name + '\n')
                counts[name] += 1
    total = sum(counts.values())
    counts = {'total': total, 'parts': counts}
    counts_msg = json.dumps(counts, indent=2)
    log.info('Train stats:\n' + counts_msg)
    IO.write_lines(self.dir / 'train.stats.json', counts_msg)
    return counts
@classmethod
def read_parallel(cls, file1: Path, file2: Path):
    with IO.reader(file1) as r1, IO.reader(file2) as r2:
        for seg1, seg2 in zip_longest(r1, r2):
            if seg1 is None or seg2 is None:
                raise Exception(f'{file1} {file2} have an unequal number of lines. That is an error.')
            yield seg1.strip(), seg2.strip()
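# A minimal, standalone sketch (not part of the library) of the same zip_longest pattern
# that read_parallel above uses: stream two plain-text files in lockstep and fail fast
# when their line counts differ. The file names are hypothetical, and the real method
# reads through IO.reader rather than Path.open.
from itertools import zip_longest
from pathlib import Path

def iter_parallel_plain(file1: Path, file2: Path):
    with file1.open(encoding='utf-8') as r1, file2.open(encoding='utf-8') as r2:
        for seg1, seg2 in zip_longest(r1, r2):
            if seg1 is None or seg2 is None:
                raise ValueError(f'{file1} and {file2} have an unequal number of lines')
            yield seg1.strip(), seg2.strip()

# usage (hypothetical file names):
# for src, tgt in iter_parallel_plain(Path('train.en'), Path('train.de')):
#     ...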
def add_part(self, dir_path: Path, entry: Entry, drop_noise=False, compress=False):
    flag_file = dir_path / f'.valid.{entry.did}'
    if flag_file.exists():
        log.info(f"{flag_file} exists. Skipping")
        return -1, -1
    path = self.cache.get_entry(entry)
    # swap = entry.is_swap(self.langs)
    parser = Parser(path, ext=entry.in_ext or None, ent=entry)
    # langs = '_'.join(str(lang) for lang in self.langs)
    # Check that files are written in correct order
    l1, l2 = self.get_paths(dir_path, entry, compress=compress)
    io_args = dict(encoding='utf-8', errors='ignore')
    with IO.writer(l1, **io_args) as f1, IO.writer(l2, **io_args) as f2:
        count, skips, noise = 0, 0, 0
        for rec in parser.read_segs():
            rec = rec[:2]  # keep only the first two fields
            if len(rec) != 2:
                skips += 1
                continue
            if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                skips += 1
                noise += 1
                continue
            sent1, sent2 = [s.strip() for s in rec]
            if not sent1 or not sent2:
                skips += 1
                continue
            sent1 = sent1.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
            sent2 = sent2.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
            f1.write(f'{sent1}\n')
            f2.write(f'{sent2}\n')
            count += 1
        msg = f'Looks like an error. {count} segs are valid; {skips} are invalid: {entry}'
        assert count > 0, msg
        if skips > count:
            log.warning(msg)
        if noise > 0:
            log.info(f"{entry}: Noise: {noise:,}/{count:,} => {100 * noise / count:.4f}%")
        log.info(f"wrote {count} lines to {l1} == {l2}")
    flag_file.touch()
    return count, skips
def cat_bitexts(self, in_paths: List[Tuple[Path, Path]], out_paths: Tuple[Path, Path]):
    of1, of2 = out_paths
    of1.parent.mkdir(exist_ok=True)
    of2.parent.mkdir(exist_ok=True)
    with pbar_man.counter(color='green', total=len(in_paths), unit='it', desc="Merging") as pbar, \
            IO.writer(of1) as w1, IO.writer(of2) as w2:
        for if1, if2 in in_paths:
            assert if1.exists()
            assert if2.exists()
            for seg1, seg2 in self.read_parallel(if1, if2):
                w1.write(seg1 + '\n')
                w2.write(seg2 + '\n')
            pbar.update()
@classmethod
def read_alignments(cls, align_file: Path):
    assert align_file.is_file(), f'{align_file} not found'
    with IO.reader(align_file) as reader:
        context = ET.iterparse(reader, events=['end'])
        docs = (el for event, el in context if el.tag == 'linkGrp')
        count = 0
        skip_count = 0
        for doc in docs:
            algn = []
            doc_parse = {
                'src_doc': doc.attrib['fromDoc'],
                'tgt_doc': doc.attrib['toDoc'],
                'align': algn,
            }
            for seg in doc.findall('.//link'):
                parts = seg.attrib.get('xtargets', ';').strip().split(';')
                if len(parts) != 2:
                    skip_count += 1
                    continue
                src_ids, tgt_ids = parts
                src_ids, tgt_ids = src_ids.split(), tgt_ids.split()
                confidence = float(seg.attrib.get('certainty', '-inf'))
                algn.append((confidence, src_ids, tgt_ids))
            yield doc_parse
            doc.clear()
            count += 1
        log.info(f"read {count} docs from {align_file}")
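# A small, self-contained illustration (not from the library) of the OPUS-style alignment
# XML that read_alignments above walks: one <linkGrp> per document pair, each <link>
# carrying xtargets="srcIds;tgtIds" and an optional certainty score. The XML literal
# below is invented purely for illustration.
import xml.etree.ElementTree as ET
from io import BytesIO

sample = b"""<cesAlign>
  <linkGrp fromDoc="src/doc1.xml.gz" toDoc="tgt/doc1.xml.gz">
    <link xtargets="s1 s2;t1" certainty="0.9"/>
    <link xtargets="s3;t2 t3"/>
  </linkGrp>
</cesAlign>"""

for _event, el in ET.iterparse(BytesIO(sample), events=['end']):
    if el.tag != 'linkGrp':
        continue
    for link in el.findall('.//link'):
        src_ids, tgt_ids = link.attrib.get('xtargets', ';').split(';')
        confidence = float(link.attrib.get('certainty', '-inf'))
        print(confidence, src_ids.split(), tgt_ids.split())
    el.clear()
# prints: 0.9 ['s1', 's2'] ['t1']   then   -inf ['s3'] ['t2', 't3']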
def read_tmx(path: Union[Path, str], langs=None):
    """
    reads a TMX file as records
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    passes = 0
    fails = 0
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning("langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
            if langs[0] in lang_seg and langs[1] in lang_seg:
                yield lang_seg[langs[0]], lang_seg[langs[1]]
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}--{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
def main(inp, out):
    segs = read_sgm(inp)
    with IO.writer(out) as out:
        count = 0
        for seg in segs:
            seg = seg.replace('\t', ' ')
            out.write(seg + '\n')
            count += 1
        log.warning(f"Wrote {count} lines to {out}")
def main(inp, out, langs):
    recs = read_tmx(inp, langs=langs)
    with IO.writer(out) as out:
        count = 0
        for rec in recs:
            rec = [l.replace('\t', ' ') for l in rec]
            out.write('\t'.join(rec) + '\n')
            count += 1
        log.warning(f"Wrote {count} lines to {out}")
def main(inp, out, wmt21xml=False):
    parser = read_wmt21_xml if wmt21xml else read_sgm
    stream = parser(inp)
    with IO.writer(out) as out:
        count = 0
        for rec in stream:
            if isinstance(rec, str):
                rec = (rec,)
            line = '\t'.join(rec) + '\n'
            out.write(line)
            count += 1
        log.info(f"Wrote {count} lines to {out}")
def read_sgm_xml(data: Path) -> Iterator[str]:
    """Extract sgm using an XML parse.
    This one breaks if there is any error in the XML, e.g. an & that is not escaped;
    see newstest2019-frde-ref.de.sgm for an example!
    """
    with IO.reader(data) as data:
        context = ET.iterparse(data, events=['end'])
        segs = (el for event, el in context if el.tag == 'seg')
        count = 0
        for seg in segs:
            yield seg.text
            seg.clear()
            count += 1
        log.info(f"read {count} segments from {data}")
def read_tsv(self, path, delim='\t', cols=None):
    """
    Read data from a TSV file
    :param path: path to TSV file
    :param delim: delimiter; default is \\t
    :param cols: indices of columns to extract; default is None, which returns all columns
    :return: stream of rows, each a list of column values
    """
    with IO.reader(path) as stream:
        for line in stream:
            row = [x.strip() for x in line.rstrip('\n').split(delim)]
            if cols:
                row = [row[idx] for idx in cols]
            yield row
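# A tiny, standalone sketch (illustration only) of the column selection read_tsv above
# performs; the row content and indices are made up.
line = "en-text\tde-text\t0.87"
cols = [1, 0]   # e.g. reorder: target column first, then source
row = [x.strip() for x in line.rstrip('\n').split('\t')]
print([row[idx] for idx in cols])   # -> ['de-text', 'en-text']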
def get_data(args):
    assert args.train_names or args.test_names, 'Required --train or --test or both'
    dataset = Dataset.prepare(args.langs, train_names=args.train_names, test_names=args.test_names,
                              out_dir=args.out, cache_dir=args.cache, merge_train=args.merge)
    cli_sig = f'-l {"-".join(args.langs)}'
    cli_sig += f' -tr {" ".join(args.train_names)}' if args.train_names else ''
    cli_sig += f' -ts {" ".join(args.test_names)}' if args.test_names else ''
    sig = f'mtdata get {cli_sig} -o <out-dir>\nmtdata version {mtdata.__version__}\n'
    log.info(f'Dataset is ready at {dataset.dir}')
    log.info(f'mtdata args for reproducing this dataset:\n {sig}')
    with IO.writer(args.out / 'mtdata.signature.txt', append=True) as w:
        w.write(sig)
def read_tmx(path: Union[Path, str], langs=None):
    """
    reads a TMX file as records
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for rec in recs:
            if langs is None:
                langs = [name for name, val in rec]
            (l1, t1), (l2, t2) = rec
            if l1 == langs[0] and l2 == langs[1]:
                yield t1, t2
            else:
                yield t2, t1
def read_tmx(path: Union[Path, str], langs=None):
    """
    reads a TMX file as records
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    passes = 0
    fails = 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(lang) for lang in langs]
        assert not BCP47Tag.are_compatible(*langs), f'{langs} expected to be different (/unambiguous)'
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning("langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
            seg1, seg2 = None, None
            for lang, seg in lang_seg.items():
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
                # else ignore
            if seg1 and seg2:  # both segs are found
                yield seg1, seg2
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
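# read_tmx above depends on parse_tmx (defined elsewhere) yielding one {lang: text} dict
# per TMX translation unit. This standalone sketch, built on an invented TMX fragment,
# shows roughly what that lang_seg structure looks like; it is an approximation, not the
# library's parse_tmx.
import xml.etree.ElementTree as ET
from io import BytesIO

tmx = b"""<tmx version="1.4"><body>
  <tu>
    <tuv xml:lang="en"><seg>Hello</seg></tuv>
    <tuv xml:lang="de"><seg>Hallo</seg></tuv>
  </tu>
</body></tmx>"""

XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'   # ElementTree's name for xml:lang
for tu in ET.parse(BytesIO(tmx)).getroot().iter('tu'):
    lang_seg = {tuv.attrib.get(XML_LANG, '').lower(): (tuv.findtext('seg') or '').strip()
                for tuv in tu.iter('tuv')}
    print(lang_seg)   # -> {'en': 'Hello', 'de': 'Hallo'}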
def read_sgm_regex(data: Path) -> Iterator[str]:
    """
    Extract sgm using regex. Assumes each seg is on its own line, of the form
    <seg id="xx">yy</seg>, with line breaks between segs.
    :param data:
    :return:
    """
    patt = re.compile(r'<seg id="(.*)">(.*)</seg>')
    count = 0
    with IO.reader(data) as data:
        for line in data:
            line = line.strip()
            match = patt.search(line)
            if match:
                yield unescape(match.group(2))
                count += 1
    log.info(f"read {count} segments from {data}")
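# A quick, self-contained check of the regex used by read_sgm_regex above on a made-up
# SGML line; like the function's docstring, it assumes each <seg> sits on its own line.
import re
from html import unescape

patt = re.compile(r'<seg id="(.*)">(.*)</seg>')
line = '<seg id="3">Tom &amp; Jerry</seg>'
match = patt.search(line)
if match:
    print(match.group(1), unescape(match.group(2)))   # -> 3 Tom & Jerry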
def get_data(langs, out_dir, train_dids=None, test_dids=None, dev_dids=None, merge_train=False,
             compress=False, drop_dupes=False, drop_tests=False, fail_on_error=False,
             n_jobs=DEF_N_JOBS, **kwargs):
    if kwargs:
        log.warning(f"Args are ignored: {kwargs}")
    from mtdata.data import Dataset
    assert train_dids or test_dids, 'Required --train or --test or both'
    dataset = Dataset.prepare(langs, train_dids=train_dids, test_dids=test_dids, out_dir=out_dir,
                              dev_dids=dev_dids, cache_dir=CACHE_DIR, merge_train=merge_train,
                              compress=compress, drop_dupes=drop_dupes, drop_tests=drop_tests,
                              fail_on_error=fail_on_error, n_jobs=n_jobs)
    cli_sig = f'-l {"-".join(str(l) for l in langs)}'
    for flag, dids in [('-tr', train_dids), ('-ts', test_dids), ('-dv', dev_dids)]:
        if dids:
            cli_sig += f' {flag} {" ".join(map(str, dids))}'
    for flag, val in [('--merge', merge_train), ('--compress', compress),
                      ('-dd', drop_dupes), ('-dt', drop_tests)]:
        if val:
            cli_sig += ' ' + flag
    sig = f'mtdata get {cli_sig} -o <out-dir>\nmtdata version {mtdata.__version__}\n'
    log.info(f'Dataset is ready at {dataset.dir}')
    log.info(f'mtdata args for reproducing this dataset:\n {sig}')
    with IO.writer(out_dir / 'mtdata.signature.txt', append=True) as w:
        w.write(sig)
def add_train_entries(self, entries, merge_train=False, compress=False, drop_hashes=None):
    self.add_parts(self.train_parts_dir, entries, drop_noise=self.drop_train_noise,
                   compress=compress, desc='Training sets', fail_on_error=self.fail_on_error)
    if not merge_train:
        return
    lang1, lang2 = self.langs
    # paired_files = self.find_bitext_pairs(self.train_parts_dir, lang1, lang2)
    paired_files = {}
    for ent in entries:
        e1, e2 = self.get_paths(self.train_parts_dir, ent)
        _, swapped = BCP47Tag.check_compat_swap(self.langs, ent.did.langs, fail_on_incompat=True)
        if swapped:
            e1, e2 = e2, e1
        paired_files[str(ent.did)] = e1, e2
    log.info(f"Going to merge {len(paired_files)} files as one train file")
    compress_ext = f'.{DEF_COMPRESS}' if compress else ''
    l1_ext = f'{lang1}{compress_ext}'
    l2_ext = f'{lang2}{compress_ext}'
    of1 = self.dir / f'train.{l1_ext}'
    of2 = self.dir / f'train.{l2_ext}'
    of3 = self.dir / f'train.meta.{DEF_COMPRESS}'
    counts = dict(total=coll.defaultdict(int),
                  dupes_skips=coll.defaultdict(int),
                  test_overlap_skips=coll.defaultdict(int),
                  selected=coll.defaultdict(int))
    train_hashes = set()
    with IO.writer(of1) as w1, IO.writer(of2) as w2, IO.writer(of3) as w3:
        with pbar_man.counter(color='green', total=len(paired_files), unit='it', desc="Merging",
                              autorefresh=False) as pbar:
            for name, (if1, if2) in paired_files.items():
                for seg1, seg2 in self.read_parallel(if1, if2):
                    counts['total'][name] += 1
                    if self.drop_dupes or self.drop_tests:
                        hash_val = hash((seg1, seg2))
                        if drop_hashes and (hash_val in drop_hashes or hash(seg1) in drop_hashes
                                            or hash(seg2) in drop_hashes):
                            counts['test_overlap_skips'][name] += 1
                            continue
                        if self.drop_dupes:
                            if hash_val in train_hashes:
                                counts['dupes_skips'][name] += 1
                                continue
                            train_hashes.add(hash_val)
                    w1.write(seg1 + '\n')
                    w2.write(seg2 + '\n')
                    w3.write(name + '\n')
                    counts['selected'][name] += 1
                pbar.update()
    stats = dict(selected=sum(counts['selected'].values()),
                 total=sum(counts['total'].values()),
                 counts=counts)
    stats_msg = json.dumps(stats, indent=2)
    log.info('Train stats:\n' + stats_msg)
    IO.write_lines(self.dir / 'train.stats.json', stats_msg)
    return counts
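# A minimal, standalone sketch (illustrative data only) of the hash-based filtering
# applied while merging above: pairs whose hash (of the pair, or of either side) appears
# in drop_hashes are treated as test overlap, and exact duplicate pairs are dropped when
# deduplication is on.
drop_hashes = {hash('held-out test sentence')}
seen, kept = set(), []
pairs = [('hello', 'hallo'), ('hello', 'hallo'), ('held-out test sentence', 'x')]
for seg1, seg2 in pairs:
    h = hash((seg1, seg2))
    if h in drop_hashes or hash(seg1) in drop_hashes or hash(seg2) in drop_hashes:
        continue   # overlaps with held-out test data
    if h in seen:
        continue   # exact duplicate pair
    seen.add(h)
    kept.append((seg1, seg2))
print(kept)   # -> [('hello', 'hallo')]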
def read_plain(self, path):
    with IO.reader(path) as stream:
        for line in stream:
            yield line.strip()