def read_parallel(cls, file1: Path, file2: Path):
    """
    Reads two files in lockstep, one line from each at a time.

    :param file1: path to the first file
    :param file2: path to the second file
    :return: stream of (line1, line2), each stripped of surrounding whitespace
    :raises Exception: when one file runs out of lines before the other
    """
    with IO.reader(file1) as r1, IO.reader(file2) as r2:
        # zip_longest pads the shorter stream with None, which is how a
        # difference in line counts becomes detectable here
        for pair in zip_longest(r1, r2):
            seg1, seg2 = pair
            if not (seg1 is not None and seg2 is not None):
                raise Exception(
                    f'{file1} {file2} have unequal num of lines. Thats an error'
                )
            yield seg1.strip(), seg2.strip()
def read_tmx(path: Union[Path, str], langs=None):
    """
    Streams segment pairs out of a TMX file.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when None, the two keys of
      the first record are used (that record must hold exactly two languages)
    :return: stream of (text1, text2)
    :raises Exception: if auto detection sees a record without exactly two
      languages, if the TMX has no records, or if no record has both languages
    """
    passes, fails = 0, 0
    with IO.reader(path) as stream:
        for lang_seg in parse_tmx(stream):
            if langs is None:
                # reached at most once: langs is fixed by the first record
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) != 2:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages, but provided with {lang_seg.keys()} in TMX {path}"
                    )
                langs = tuple(lang_seg.keys())
            if not (langs[0] in lang_seg and langs[1] in lang_seg):
                # record lacks at least one of the requested languages
                fails += 1
                continue
            yield lang_seg[langs[0]], lang_seg[langs[1]]
            passes += 1
    if not passes:
        if not fails:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}--{langs[1]} in TMX {path}")
    if fails:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
def read_alignments(cls, align_file: Path):
    """
    Streams per-document alignments from an XML alignment file.

    Every ``linkGrp`` element becomes one record. Each ``link`` element below it
    contributes one alignment, read from its ``xtargets`` attribute, which is
    expected to look like ``"<src ids>;<tgt ids>"`` with whitespace-separated ids.
    Links whose ``xtargets`` does not split into exactly two parts on ``;`` are
    skipped; the number of skipped links is reported as a warning at the end.

    :param align_file: path to the alignment XML file
    :return: stream of dicts with keys ``src_doc`` (``fromDoc`` attribute),
      ``tgt_doc`` (``toDoc`` attribute) and ``align``: a list of
      (confidence, src_ids, tgt_ids), where confidence is the ``certainty``
      attribute as float, or ``-inf`` when the attribute is absent
    :raises AssertionError: if align_file is not an existing file
    """
    assert align_file.is_file(), f'{align_file} not found'
    with IO.reader(align_file) as reader:
        # only 'end' events: an element is handed over once its subtree is complete
        context = ET.iterparse(reader, events=['end'])
        docs = (el for event, el in context if el.tag == 'linkGrp')
        count = 0
        skip_count = 0
        for doc in docs:
            algn = []
            doc_parse = {
                'src_doc': doc.attrib['fromDoc'],
                'tgt_doc': doc.attrib['toDoc'],
                'align': algn
            }
            for seg in doc.findall('.//link'):
                parts = seg.attrib.get('xtargets', ';').strip().split(';')
                if len(parts) != 2:
                    skip_count += 1
                    continue
                src_ids, tgt_ids = parts
                src_ids, tgt_ids = src_ids.split(), tgt_ids.split()
                confidence = float(seg.attrib.get('certainty', '-inf'))
                algn.append((confidence, src_ids, tgt_ids))
            yield doc_parse
            # everything needed was copied into doc_parse; drop the parsed subtree
            doc.clear()
            count += 1
        if skip_count:
            # previously counted but never surfaced, so bad links vanished silently
            log.warning(
                f"skipped {skip_count} links with malformed xtargets in {align_file}")
        log.info(f"read {count} docs from {align_file}")
def read_sgm_xml(data: Path) -> Iterator[str]:
    """Extract sgm using XML parse

    This one breaks if there is any error in XML e.g. an & is not escaped ;
    see newstest2019-frde-ref.de.sgm for example!

    :param data: path to the sgm file
    :return: stream with the text of every ``seg`` element, in document order
    """
    # The stream gets its own name: rebinding `data` would make the final log
    # message print the reader object instead of the file path.
    with IO.reader(data) as stream:
        context = ET.iterparse(stream, events=['end'])
        segs = (el for event, el in context if el.tag == 'seg')
        count = 0
        for seg in segs:
            yield seg.text
            seg.clear()  # the text was already handed out; drop the element content
            count += 1
    log.info(f"read {count} segments from {data}")
def read_tsv(self, path, delim='\t', cols=None):
    """
    Streams the rows of a delimiter-separated file.
    :param path: path to TSV file
    :param delim: delimiter default is \\t
    :param cols: indices of the columns to extract, in the order they should
      appear in the output; default is None, which returns all columns
    :return: stream of rows, each a list of whitespace-stripped cells
    """
    with IO.reader(path) as stream:
        for raw in stream:
            # only the newline is removed before splitting, so trailing empty cells survive
            cells = raw.rstrip('\n').split(delim)
            cells = [cell.strip() for cell in cells]
            yield [cells[idx] for idx in cols] if cols else cells
def read_tmx(path: Union[Path, str], langs=None):
    """
    Streams segment pairs out of a TMX file, matching languages with
    BCP47Tag.are_compatible instead of exact key equality.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); each code is passed through
      bcp47() and the two must not be compatible with each other. When None,
      the two keys of the first record are used (that record must hold exactly
      two languages)
    :return: stream of (text1, text2)
    :raises Exception: if auto detection sees a record without exactly two
      languages, if the TMX has no records, or if no record yields both segments
    """
    passes, fails = 0, 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(lang) for lang in langs]
        assert not BCP47Tag.are_compatible(
            *langs), f'{langs} expected to be different (/unambiguous)'
    with IO.reader(path) as stream:
        for lang_seg in parse_tmx(stream):
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) != 2:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
                langs = tuple(lang_seg.keys())
            seg1 = seg2 = None
            for lang, seg in lang_seg.items():
                # the first side is checked first; a later compatible entry
                # overwrites an earlier one; unrelated languages are ignored
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
            if not (seg1 and seg2):
                # a missing (or empty) segment on either side counts as a miss
                fails += 1
                continue
            yield seg1, seg2
            passes += 1
    if not passes:
        if not fails:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
def read_tmx(path: Union[Path], langs=None):
    """
    Streams text pairs out of a TMX file whose records are pairs of
    (language, text) entries.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when None, the language
      order of the first record is used
    :return: stream of (text1, text2)
    """
    with IO.reader(path) as stream:
        for rec in parse_tmx(stream):
            if langs is None:
                langs = [code for code, _ in rec]
            (l1, t1), (l2, t2) = rec
            in_order = l1 == langs[0] and l2 == langs[1]
            # NOTE: any record that is not in the requested order is emitted
            # swapped; it is not verified that it is exactly the reverse order.
            yield (t1, t2) if in_order else (t2, t1)
def read_sgm_regex(data: Path) -> Iterator[str]:
    """
    Extract sgm using regex.
    assumes each segment is on its own line, of the form <seg id="xx"> yy</seg>,
    and line breaks are used between segments
    :param data: path to the sgm file
    :return: stream with the unescaped text of every matching line
    """
    patt = re.compile(r'<seg id="(.*)">(.*)</seg>')
    count = 0
    # The stream gets its own name: rebinding `data` would make the final log
    # message print the reader object instead of the file path.
    with IO.reader(data) as lines:
        for line in lines:
            match = patt.search(line.strip())
            if match:
                # group(2) is the text between the opening and closing seg tags
                yield unescape(match.group(2))
                count += 1
    log.info(f"read {count} segments from {data}")
def read_plain(self, path):
    """
    Streams the lines of a plain text file.
    :param path: path to the file
    :return: stream of lines with surrounding whitespace stripped
    """
    with IO.reader(path) as stream:
        yield from (line.strip() for line in stream)