Exemple #1
0
    def load(
        self,
        iso_lang_map: Dict[str, str],
        min_len: int = 25,
    ) -> List[Tuple[str, str]]:
        """
        Read the ``x_*.txt`` / ``y_*.txt`` file pairs under ``self.data_dir``
        and return the examples that pass the language and length filters.

        Args:
            iso_lang_map: Mapping from the language labels found in the
                ``y_*.txt`` files to the labels used in the returned examples.
                Examples whose label is not a key of this mapping are dropped.
            min_len: Minimum number of *alphanumeric* chars a text must contain
                for its example to be included.

        Returns:
            List of (text, lang) examples, where ``lang`` is the mapped label.
        """
        examples = []
        # the per-file train/test split is deliberately ignored: both subsets are
        # pooled here so that splitting can happen later on the aggregated data
        for subset in ("train", "test"):
            raw_texts = tio.read_text(
                self.data_dir.joinpath(f"x_{subset}.txt"), lines=True
            )
            raw_langs = tio.read_text(
                self.data_dir.joinpath(f"y_{subset}.txt"), lines=True
            )
            for raw_text, raw_lang in zip(raw_texts, raw_langs):
                text = raw_text.strip()
                lang = raw_lang.strip()
                # cheap membership test first; only count chars for known langs
                if lang not in iso_lang_map:
                    continue
                n_alnum = itertoolz.count(char for char in text if char.isalnum())
                if n_alnum >= min_len:
                    examples.append((text, iso_lang_map[lang]))
        LOGGER.info("loaded Wili2018Dataset data:\n%s ...", examples[:3])
        return examples
Exemple #2
0
 def test_read_write_unicode(self, tmpdir):
     """
     Write ``TEXT`` in text mode to a file with each extension, read it back,
     and check that the first item yielded by ``io.read_text`` equals ``TEXT``.
     """
     for suffix in (".txt", ".gz", ".bz2", ".xz"):
         target = str(tmpdir.join("test_read_write_file_unicode" + suffix))
         io.write_text(TEXT, target, mode="wt", make_dirs=True)
         reader = io.read_text(target, mode="rt")
         first_item = next(reader)
         assert first_item == TEXT
Exemple #3
0
 def test_read_write_bytes(self, tmpdir):
     """
     Write the bytes form of ``TEXT`` in binary mode to a file with each
     extension, read it back, and check that the first item yielded by
     ``io.read_text`` equals what was written.
     """
     payload = utils.to_bytes(TEXT)
     for suffix in (".txt", ".gz", ".bz2", ".xz"):
         target = str(tmpdir.join("test_read_write_file_bytes" + suffix))
         io.write_text(payload, target, mode="wb", make_dirs=True)
         reader = io.read_text(target, mode="rb")
         first_item = next(reader)
         assert first_item == payload
Exemple #4
0
    def load(self, langs: Set[str], min_len: int = 25) -> List[Tuple[str, str]]:
        """
        Read the DSLCC train/dev files under ``self.data_dir`` and return the
        de-duplicated examples that pass the language and length filters.

        Each non-blank line is expected to hold exactly two tab-separated
        fields, text then language label. Only the first two chars of the label
        are used as the language. Lines that don't split into exactly two
        fields are logged at debug level and skipped.

        Args:
            langs: Two-char language codes to keep; examples in any other
                language are dropped.
            min_len: Minimum number of *alphanumeric* chars a text must contain
                for its example to be included.

        Returns:
            List of unique (text, lang) examples, sorted by lang. Within a
            language, examples keep the order in which they were first read.
        """
        data = []
        fstubs = [
            "dslcc3/train/task1-train.txt",
            "dslcc3/train/task1-dev.txt",
            "dslcc4/DSL-TRAIN.txt",
            "dslcc4/DSL-DEV.txt",
        ]
        for fstub in fstubs:
            filepath = self.data_dir.joinpath(fstub)
            lines = tio.read_text(filepath, mode="rt", encoding="utf-8", lines=True)
            for line in lines:
                if not line.strip():
                    continue
                # only the 2-way unpack can fail here, so guard just that and
                # catch just ValueError rather than hiding unrelated errors
                try:
                    text, label = line.split("\t")
                except ValueError:
                    LOGGER.debug("bad line in data")
                    continue
                lang = label[:2]
                if (
                    lang in langs
                    and itertoolz.count(c for c in text if c.isalnum()) >= min_len
                ):
                    data.append((text, lang))
        # dict.fromkeys de-duplicates while keeping first-seen order, unlike set(),
        # whose iteration order for strings changes from run to run; the stable
        # sort then groups by lang, so the result is reproducible
        data = sorted(dict.fromkeys(data), key=operator.itemgetter(1))
        LOGGER.info("loaded DSLCCDataset data:\n%s ...", data[:3])
        return data
Exemple #5
0
 def test_read_write_unicode(self, tmpdir):
     """
     Write ``TEXT`` in text mode to a file with each extension, read it back,
     and check the round trip.

     When ``compat.PY2`` is True, every extension other than ".txt" is instead
     expected to make ``io.open_sesame`` raise ``ValueError`` in text mode.
     """
     for suffix in (".txt", ".gz", ".bz2", ".xz"):
         target = str(tmpdir.join("test_read_write_file_unicode" + suffix))
         if compat.PY2 is True and suffix != ".txt":
             with pytest.raises(ValueError):
                 io.open_sesame(target, mode="wt", encoding="utf-8", make_dirs=True)
             continue
         io.write_text(TEXT, target, mode="wt", make_dirs=True)
         first_item = next(io.read_text(target, mode="rt"))
         assert first_item == TEXT
Exemple #6
0
def test_read_write_text_unicode(tmpdir):
    """
    Write ``TEXT`` in text mode to a file with each extension, read it back,
    and check the round trip.

    When ``compat.is_python2`` is True, every extension other than '.txt' is
    instead expected to make ``io.open_sesame`` raise ``ValueError`` in text mode.
    """
    for suffix in ('.txt', '.gz', '.bz2', '.xz'):
        target = str(tmpdir.join('test_read_write_file_unicode' + suffix))
        if compat.is_python2 is True and suffix != '.txt':
            with pytest.raises(ValueError):
                io.open_sesame(
                    target, mode='wt', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(TEXT, target, mode='wt', make_dirs=True)
        first_item = next(io.read_text(target, mode='rt'))
        assert first_item == TEXT
Exemple #7
0
 def test_read_write_unicode_lines(self, tmpdir, spacy_doc):
     """
     Write the sentence texts of ``spacy_doc`` as lines in text mode to a file
     with each extension, read the lines back, and check that the stripped
     lines equal the sentences that were written.

     When ``compat.PY2`` is True, every extension other than ".txt" is instead
     expected to make ``io.open_sesame`` raise ``ValueError`` in text mode.
     """
     sentences = [sent.text for sent in spacy_doc.sents]
     for suffix in (".txt", ".gz", ".bz2", ".xz"):
         target = str(tmpdir.join("test_read_write_file_lines_unicode" + suffix))
         if compat.PY2 is True and suffix != ".txt":
             with pytest.raises(ValueError):
                 io.open_sesame(target, mode="wt", encoding=None, make_dirs=True)
             continue
         io.write_text(sentences, target, mode="wt", make_dirs=True, lines=True)
         # lines come back with surrounding whitespace, so strip before comparing
         read_back = []
         for line in io.read_text(target, mode="rt", lines=True):
             read_back.append(line.strip())
         assert read_back == sentences
Exemple #8
0
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """
    Write the sentence texts of ``spacy_doc``, converted to bytes, as lines in
    binary mode to a file with each extension, read the lines back, and check
    that the stripped lines equal what was written.

    When ``compat.is_python2`` is True, the '.xz' extension is instead expected
    to make ``io.open_sesame`` raise ``ValueError``.
    """
    sentences = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in ('.txt', '.gz', '.bz2', '.xz'):
        target = str(tmpdir.join('test_read_write_file_lines_bytes' + suffix))
        if compat.is_python2 is True and suffix == '.xz':
            with pytest.raises(ValueError):
                io.open_sesame(
                    target, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(sentences, target, mode='wb', make_dirs=True, lines=True)
        # lines come back with surrounding whitespace, so strip before comparing
        read_back = []
        for line in io.read_text(target, mode='rb', lines=True):
            read_back.append(line.strip())
        assert read_back == sentences
Exemple #9
0
def test_read_write_text_bytes(tmpdir):
    """
    Write the bytes form of ``TEXT`` in binary mode to a file with each
    extension, read it back, and check the round trip.

    When ``compat.is_python2`` is True, the ".xz" extension is instead expected
    to make ``io.open_sesame`` raise ``ValueError``.
    """
    payload = compat.unicode_to_bytes(TEXT)
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join("test_read_write_file_bytes" + suffix))
        if compat.is_python2 is True and suffix == ".xz":
            with pytest.raises(ValueError):
                io.open_sesame(
                    target, mode="wb", encoding="utf-8", make_dirs=True
                )
            continue
        io.write_text(payload, target, mode="wb", make_dirs=True)
        first_item = next(io.read_text(target, mode="rb"))
        assert first_item == payload
Exemple #10
0
 def test_read_write_unicode_lines(self, tmpdir, spacy_doc):
     """
     Write the sentence texts of ``spacy_doc`` as lines in text mode to a file
     with each extension, read the lines back, and check that the stripped
     lines equal the sentences that were written.
     """
     sentences = [sent.text for sent in spacy_doc.sents]
     for suffix in (".txt", ".gz", ".bz2", ".xz"):
         target = str(tmpdir.join("test_read_write_file_lines_unicode" + suffix))
         io.write_text(sentences, target, mode="wt", make_dirs=True, lines=True)
         # lines come back with surrounding whitespace, so strip before comparing
         read_back = []
         for line in io.read_text(target, mode="rt", lines=True):
             read_back.append(line.strip())
         assert read_back == sentences
Exemple #11
0
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """
    Write the sentence texts of ``spacy_doc``, converted to bytes, as lines in
    binary mode to a file with each extension, read the lines back, and check
    that the stripped lines equal what was written.

    When ``compat.is_python2`` is True, the ".xz" extension is instead expected
    to make ``io.open_sesame`` raise ``ValueError``.
    """
    sentences = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join("test_read_write_file_lines_bytes" + suffix))
        if compat.is_python2 is True and suffix == ".xz":
            with pytest.raises(ValueError):
                io.open_sesame(
                    target, mode="wb", encoding="utf-8", make_dirs=True
                )
            continue
        io.write_text(sentences, target, mode="wb", make_dirs=True, lines=True)
        # lines come back with surrounding whitespace, so strip before comparing
        read_back = []
        for line in io.read_text(target, mode="rb", lines=True):
            read_back.append(line.strip())
        assert read_back == sentences