Python IO.reader Examples

Programming Language: Python

Namespace/Package Name: mtdata.utils

Class/Type: IO

Method/Function: reader

Examples at hotexamples.com: 9

Python IO.reader - 9 examples found. These are the top-rated real-world Python examples of mtdata.utils.IO.reader, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

reader(9)

writer(9)

write_lines(2)

Frequently Used Methods

reader (9)

writer (9)

write_lines (2)

Example #1

Show file

File: data.py Project: kpu/mtdata

 def read_parallel(cls, file1: Path, file2: Path):
     """Yield stripped line pairs read in lockstep from two inputs.

     Both inputs are opened through ``IO.reader`` and iterated together.

     :param file1: first input, passed to ``IO.reader``
     :param file2: second input, passed to ``IO.reader``
     :return: stream of (seg1, seg2), each with surrounding whitespace stripped
     :raises Exception: when one input runs out of lines before the other
     """
     with IO.reader(file1) as left, IO.reader(file2) as right:
         # zip_longest pads the shorter stream with None, which is how a
         # line-count mismatch becomes visible here.
         for pair in zip_longest(left, right):
             if any(seg is None for seg in pair):
                 raise Exception(
                     f'{file1} {file2} have unequal num of lines. Thats an error'
                 )
             first, second = pair
             yield first.strip(), second.strip()

Example #2

Show file

File: tmx.py Project: kpu/mtdata

def read_tmx(path: Union[Path, str], langs=None):
    """
    Stream (text1, text2) pairs out of a TMX file.

    Records come from ``parse_tmx``; each record is used as a mapping from
    language code to segment text. A record is emitted only when it has an
    entry for both requested languages, otherwise it is counted as skipped.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None the keys
        of the first record are used, which must contain exactly 2 languages
    :return: stream of (text1, text2)
    :raises Exception: if autodetect sees a record without exactly 2 languages,
        or (once the stream is exhausted) if no pair was produced at all
    """
    passes, fails = 0, 0
    with IO.reader(path) as stream:
        for lang_seg in parse_tmx(stream):
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) != 2:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages, but provided with {lang_seg.keys()} in TMX {path}"
                    )
                langs = tuple(lang_seg.keys())
            # Guard clause: skip records missing either side of the pair.
            if langs[0] not in lang_seg or langs[1] not in lang_seg:
                fails += 1
                continue
            yield lang_seg[langs[0]], lang_seg[langs[1]]
            passes += 1
    if not passes:
        # Nothing was skipped either, so the file had no records at all.
        if not fails:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}--{langs[1]} in TMX {path}")
    if fails:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")

Example #3

Show file

 def read_alignments(cls, align_file: Path):
     """Stream one alignment record per ``linkGrp`` element of an XML file.

     Each yielded dict has the keys ``src_doc`` (the ``fromDoc`` attribute),
     ``tgt_doc`` (the ``toDoc`` attribute) and ``align``, a list of
     ``(certainty, src_ids, tgt_ids)`` tuples built from the nested ``link``
     elements. Links whose ``xtargets`` attribute does not split into exactly
     two ``;``-separated parts are skipped; a missing ``certainty`` becomes
     ``-inf``.

     :param align_file: path to the alignment file; must be an existing file
     :return: stream of per-document alignment dicts
     """
     assert align_file.is_file(), f'{align_file} not found'
     with IO.reader(align_file) as reader:
         count = 0
         skip_count = 0  # tallied but not reported anywhere in this method
         for _event, elem in ET.iterparse(reader, events=['end']):
             if elem.tag != 'linkGrp':
                 continue
             links = []
             record = {
                 'src_doc': elem.attrib['fromDoc'],
                 'tgt_doc': elem.attrib['toDoc'],
                 'align': links
             }
             for link in elem.findall('.//link'):
                 pieces = link.attrib.get('xtargets', ';').strip().split(';')
                 if len(pieces) != 2:
                     skip_count += 1
                     continue
                 src_ids = pieces[0].split()
                 tgt_ids = pieces[1].split()
                 certainty = float(link.attrib.get('certainty', '-inf'))
                 links.append((certainty, src_ids, tgt_ids))
             yield record
             # Clear the element once the consumer has moved on.
             elem.clear()
             count += 1
         log.info(f"read {count} docs from {align_file}")

Example #4

Show file

def read_sgm_xml(data: Path) -> Iterator[str]:
    """Extract sgm using XML parse
    This one breaks if there is any error in XML e.g. an & is not escaped ;
      see newstest2019-frde-ref.de.sgm for example!

    :param data: path to the .sgm file, opened through ``IO.reader``
    :return: stream with the ``text`` of every ``seg`` element
    """
    # Bind the stream to its own name: reusing ``data`` here would rebind the
    # parameter, and the log message below would then print the stream object
    # instead of the path.
    with IO.reader(data) as stream:
        context = ET.iterparse(stream, events=['end'])
        segs = (el for event, el in context if el.tag == 'seg')
        count = 0
        for seg in segs:
            yield seg.text
            # Clear the element once the consumer has moved on.
            seg.clear()
            count += 1
        log.info(f"read {count} segments from {data}")

Example #5

Show file

 def read_tsv(self, path, delim='\t', cols=None):
     """
     Stream rows of a delimiter-separated text file.

     Every line loses its trailing newline, is split on ``delim``, and each
     resulting cell is stripped of surrounding whitespace.

     :param path: path to TSV file
     :param delim: delimiter default is \\t
     :param cols: indexes of the columns to keep, in the order given;
         default is None, which returns all columns
     :return: stream of rows, each a list of cell strings
     """
     with IO.reader(path) as lines:
         for raw in lines:
             cells = raw.rstrip('\n').split(delim)
             row = [cell.strip() for cell in cells]
             yield [row[idx] for idx in cols] if cols else row

Example #6

Show file

def read_tmx(path: Union[Path, str], langs=None):
    """
    Stream (text1, text2) pairs out of a TMX file, matching languages with
    ``BCP47Tag.are_compatible`` rather than by exact key.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when given they are
        converted with ``bcp47`` and must not be compatible with each other.
        When it is None the keys of the first record are used, which must
        contain exactly 2 languages
    :return: stream of (text1, text2)
    :raises Exception: if autodetect sees a record without exactly 2 languages,
        or (once the stream is exhausted) if no pair was produced at all
    """
    passes, fails = 0, 0
    if langs:
        assert len(langs) == 2
        langs = list(map(bcp47, langs))
        assert not BCP47Tag.are_compatible(
            *langs), f'{langs} expected to be different (/unambiguous)'
    with IO.reader(path) as stream:
        for lang_seg in parse_tmx(stream):
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) != 2:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
                langs = tuple(lang_seg.keys())
            seg1 = seg2 = None
            for lang, seg in lang_seg.items():
                # The first language is tested first; when several entries
                # match the same side, the last one seen wins.
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
                # anything else is ignored
            if not (seg1 and seg2):
                # A missing or empty segment on either side counts as a skip.
                fails += 1
                continue
            yield seg1, seg2
            passes += 1
    if not passes:
        # Nothing was skipped either, so the file had no records at all.
        if not fails:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")

Example #7

Show file

def read_tmx(path: Union[Path], langs=None):
    """
    Stream (text1, text2) pairs out of a TMX file.

    Each record from ``parse_tmx`` is unpacked as exactly two
    (language, text) pairs. The texts are emitted in record order when the
    record's languages equal ``langs`` in that order, and swapped otherwise.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None the
        languages of the first record are used
    :return: stream of (text1, text2)
    """
    with IO.reader(path) as stream:
        for rec in parse_tmx(stream):
            if langs is None:
                langs = [code for code, _text in rec]
            (l1, t1), (l2, t2) = rec
            in_order = l1 == langs[0] and l2 == langs[1]
            # NOTE(review): any record not in the expected order is swapped,
            # including one whose languages match neither entry - confirm.
            yield (t1, t2) if in_order else (t2, t1)

Example #8

Show file

def read_sgm_regex(data: Path) -> Iterator[str]:
    """
    Extract sgm using regex.
    assumes each segment is on its own line of form <seg id="xx"> yy</seg>
    and line breaks are used between
    :param data: path to the .sgm file, opened through ``IO.reader``
    :return: stream of segment texts, passed through ``unescape``
    """
    # The id group is non-greedy so it stops at the first `">`; a greedy
    # group would swallow the start of any segment text that itself
    # contains `">`.
    patt = re.compile(r'<seg id="(.*?)">(.*)</seg>')
    count = 0
    # Bind the stream to its own name: reusing ``data`` here would rebind the
    # parameter, and the log message below would then print the closed stream
    # object instead of the path.
    with IO.reader(data) as stream:
        for line in stream:
            match = patt.search(line.strip())
            if match:
                yield unescape(match.group(2))
                count += 1
    log.info(f"read {count} segments from {data}")

Example #9

Show file

 def read_plain(self, path):
     """Stream the lines of ``path`` with surrounding whitespace stripped.

     :param path: input to open through ``IO.reader``
     :return: stream of stripped lines
     """
     with IO.reader(path) as lines:
         yield from (text.strip() for text in lines)