Example #1
0
File: data.py Project: kpu/mtdata
 def read_parallel(cls, file1: Path, file2: Path):
     """Stream whitespace-stripped line pairs from two line-aligned files.

     :param file1: file holding the first side of each pair
     :param file2: file holding the second side of each pair
     :return: generator of (line1, line2) tuples
     :raises Exception: when one file runs out of lines before the other
     """
     with IO.reader(file1) as left, IO.reader(file2) as right:
         # zip_longest pads the shorter stream with None, exposing a length mismatch
         for left_line, right_line in zip_longest(left, right):
             if left_line is not None and right_line is not None:
                 yield left_line.strip(), right_line.strip()
                 continue
             raise Exception(
                 f'{file1} {file2} have unequal num of lines. Thats an error'
             )
Example #2
0
File: tmx.py Project: kpu/mtdata
def read_tmx(path: Union[Path, str], langs=None):
    """
    Stream aligned segment pairs out of a TMX file.

    :param path: location of the .tmx file
    :param langs: (lang1, lang2) codes, e.g. (de, en); when None, the pair is
        taken from the first record, which must hold exactly two languages
    :return: generator of (text1, text2)
    :raises Exception: if auto-detection sees a record without exactly two
        languages, if the file has no records, or if no record has both languages
    """
    n_pairs, n_skipped = 0, 0
    with IO.reader(path) as stream:
        for entry in parse_tmx(stream):
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(entry) != 2:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages, but provided with {entry.keys()} in TMX {path}"
                    )
                langs = tuple(entry.keys())
            # records lacking either requested language are counted and dropped
            if not (langs[0] in entry and langs[1] in entry):
                n_skipped += 1
                continue
            yield entry[langs[0]], entry[langs[1]]
            n_pairs += 1
    if n_pairs == 0:
        if n_skipped != 0:
            raise Exception(f"Nothing for {langs[0]}--{langs[1]} in TMX {path}")
        raise Exception(f"Empty TMX {path}")
    if n_skipped != 0:
        log.warning(
            f"Skipped {n_skipped} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {n_pairs} pairs from TMX {path}")
Example #3
0
 def read_alignments(cls, align_file: Path):
     """Stream per-document sentence alignments from an XML alignment file.

     Each ``linkGrp`` element yields one dict with keys ``src_doc`` and
     ``tgt_doc`` (its ``fromDoc`` / ``toDoc`` attributes) and ``align``, a list
     of ``(confidence, src_ids, tgt_ids)`` tuples built from its ``link``
     descendants. A link whose ``xtargets`` attribute does not split into
     exactly two ``;``-separated parts is skipped; a missing ``certainty``
     attribute gives a confidence of ``-inf``.

     :param align_file: path to the alignment file; must exist
     :return: generator of document alignment dicts
     """
     assert align_file.is_file(), f'{align_file} not found'
     with IO.reader(align_file) as reader:
         context = ET.iterparse(reader, events=['end'])
         docs = (el for event, el in context if el.tag == 'linkGrp')
         count = 0
         skip_count = 0
         for doc in docs:
             algn = []
             doc_parse = {
                 'src_doc': doc.attrib['fromDoc'],
                 'tgt_doc': doc.attrib['toDoc'],
                 'align': algn
             }
             for seg in doc.findall('.//link'):
                 parts = seg.attrib.get('xtargets', ';').strip().split(';')
                 if len(parts) != 2:
                     skip_count += 1
                     continue
                 src_ids, tgt_ids = (side.split() for side in parts)
                 confidence = float(seg.attrib.get('certainty', '-inf'))
                 algn.append((confidence, src_ids, tgt_ids))
             yield doc_parse
             # drop the element's children once the consumer is done with this doc
             doc.clear()
             count += 1
         # previously the skip tally was collected but never surfaced
         if skip_count:
             log.warning(
                 f"Skipped {skip_count} malformed links in {align_file}")
         log.info(f"read {count} docs from {align_file}")
Example #4
0
def read_sgm_xml(data: Path) -> Iterator[str]:
    """Extract the text of every ``seg`` element from an SGM file using an XML parser.

    This one breaks if there is any error in XML e.g. an & is not escaped ;
      see newstest2019-frde-ref.de.sgm for example!

    :param data: path to the .sgm file
    :return: generator over the ``text`` of each ``seg`` element, in document order
    """
    # use a separate name for the stream so the log line below still reports the path
    with IO.reader(data) as stream:
        context = ET.iterparse(stream, events=['end'])
        segs = (el for event, el in context if el.tag == 'seg')
        count = 0
        for seg in segs:
            yield seg.text
            # release the element's content once it has been consumed
            seg.clear()
            count += 1
        log.info(f"read {count} segments from {data}")
Example #5
0
 def read_tsv(self, path, delim='\t', cols=None):
     """
     Stream the rows of a delimited text file.
     :param path: path to TSV file
     :param delim: column delimiter; default is \\t
     :param cols: indexes of the columns to keep, in the order given;
         when empty or None every column is returned
     :return: generator of rows, each a list of whitespace-stripped cells
     """
     with IO.reader(path) as lines:
         for raw in lines:
             # remove only the trailing newline before splitting; each cell is stripped afterwards
             cells = [cell.strip() for cell in raw.rstrip('\n').split(delim)]
             yield [cells[pos] for pos in cols] if cols else cells
Example #6
0
def read_tmx(path: Union[Path, str], langs=None):
    """
    Stream aligned segment pairs out of a TMX file, matching record languages
    to the requested ones with BCP47Tag.are_compatible.

    :param path: location of the .tmx file
    :param langs: (lang1, lang2) codes, e.g. (de, en); when None, the pair is
        taken from the first record, which must hold exactly two languages
    :return: generator of (text1, text2)
    :raises Exception: if auto-detection sees a record without exactly two
        languages, if the file has no records, or if no record yields a pair
    """
    n_pairs, n_skipped = 0, 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(code) for code in langs]
        assert not BCP47Tag.are_compatible(
            *langs), f'{langs} expected to be different (/unambiguous)'
    with IO.reader(path) as stream:
        for entry in parse_tmx(stream):
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(entry) != 2:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {entry.keys()} in TMX {path}")
                langs = tuple(entry.keys())
            first, second = None, None
            for code, text in entry.items():
                if BCP47Tag.are_compatible(langs[0], code):
                    first = text
                elif BCP47Tag.are_compatible(langs[1], code):
                    second = text
                # languages compatible with neither side are ignored
            if not (first and second):  # at least one side missing or empty
                n_skipped += 1
                continue
            yield first, second
            n_pairs += 1
    if n_pairs == 0:
        if n_skipped != 0:
            raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
        raise Exception(f"Empty TMX {path}")
    if n_skipped != 0:
        log.warning(
            f"Skipped {n_skipped} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {n_pairs} pairs from TMX {path}")
Example #7
0
def read_tmx(path: Union[Path], langs=None):
    """
    Stream text pairs from the two-language records of a TMX file.

    :param path: location of the .tmx file
    :param langs: (lang1, lang2) codes, e.g. (de, en); when None, the language
        order of the first record is adopted
    :return: generator of (text1, text2); a record whose languages are not in
        the same order as langs is yielded with its two texts swapped
    """
    with IO.reader(path) as stream:
        for entry in parse_tmx(stream):
            if langs is None:
                langs = [code for code, _ in entry]
            (code1, text1), (code2, text2) = entry
            # NOTE(review): the swapped case is not checked to be the exact reverse of langs
            same_order = code1 == langs[0] and code2 == langs[1]
            yield (text1, text2) if same_order else (text2, text1)
Example #8
0
def read_sgm_regex(data: Path) -> Iterator[str]:
    """
    Extract sgm segments using a regex.
    Assumes each segment is on its own line, of the form <seg id="xx"> yy</seg>,
    with line breaks between segments. Lines that do not match are ignored.
    :param data: path to the .sgm file
    :return: generator over the unescaped text of each segment
    """
    patt = re.compile(r'<seg id="(.*)">(.*)</seg>')
    count = 0
    # use a separate name for the stream so the log line below still reports the path
    with IO.reader(data) as stream:
        for line in stream:
            match = patt.search(line.strip())
            if match:
                yield unescape(match.group(2))
                count += 1
    log.info(f"read {count} segments from {data}")
Example #9
0
 def read_plain(self, path):
     """Yield each line of the file at path with leading and trailing whitespace removed."""
     with IO.reader(path) as lines:
         yield from (text.strip() for text in lines)