def load(self, langs: Set[str], min_len: int = 25) -> List[Tuple[str, str]]:
    """
    Load (text, lang) examples from the ``ud-(train|test|dev).txt`` files
    found recursively under ``self.data_dir``.

    Args:
        langs: Languages to include. A file's language is the part of its
            name before the first underscore; files whose language is not
            in ``langs`` are skipped.
        min_len: Minimum text length in *chars* for a given example
            to be included.

    Returns:
        Sequence of (text, lang) examples.
    """
    data = []
    match_regex = r"ud-(train|test|dev)\.txt"
    for fpath in tio.get_filepaths(
        self.data_dir, match_regex=match_regex, recursive=True
    ):
        fname = pathlib.Path(fpath).name
        # partition() never raises: a file name without an underscore yields
        # the whole name as "lang", which then fails the membership check
        # below and is skipped instead of crashing on tuple unpacking.
        lang, _, _ = fname.partition("_")
        if lang not in langs:
            continue
        # Decode explicitly rather than relying on the platform's locale
        # default. NOTE(review): assumes the UD text files are UTF-8 -- confirm.
        with open(fpath, mode="rt", encoding="utf-8") as f:
            text = f.read()
        if "\n" in text:
            # One example per run of newline-separated text.
            segments = re.split(r"\n+", text)
        else:
            # No line structure to split on, so cut the text into random pieces.
            segments = _randomly_segment_text(text, (50, 1000))
        data.extend(
            (text_segment, lang)
            for text_segment in segments
            if len(text_segment) >= min_len
        )
    # NOTE(review): the message says "TatoebaDataset" although this loader
    # reads "ud-*" files -- looks like a copy/paste leftover; confirm and rename.
    LOGGER.info("loaded TatoebaDataset data:\n%s ...", data[:3])
    return data
def test_get_filepaths(self):
    """A non-recursive listing that keeps invisible files should equal the
    sorted regular files sitting directly in TESTS_DIR."""
    candidates = (
        os.path.join(TESTS_DIR, entry) for entry in os.listdir(TESTS_DIR)
    )
    expected = sorted(path for path in candidates if os.path.isfile(path))
    found = io.get_filepaths(TESTS_DIR, ignore_invisible=False, recursive=False)
    observed = sorted(found)
    assert observed == expected
def test_get_filepaths_ignore_regex(self):
    """With ``ignore_regex="test_"`` and invisible files ignored, no file
    paths should be yielded for TESTS_DIR."""
    remaining = list(
        io.get_filepaths(TESTS_DIR, ignore_regex="test_", ignore_invisible=True)
    )
    assert not remaining
def test_get_filepaths_match_regex(self):
    """Exactly one ``.py`` file in TESTS_DIR should match the regex "io"."""
    # NOTE(review): another method with this exact name appears later in this
    # view; if both live in the same class, the later definition replaces
    # this one and this test never runs.
    matched = list(io.get_filepaths(TESTS_DIR, match_regex="io", extension=".py"))
    assert len(matched) == 1
def test_get_filepaths_ignore_invisible(self):
    """Ignoring invisible files must never yield more paths than keeping them."""
    dirpath = os.path.dirname(os.path.abspath(__file__))
    n_without_invisible = len(list(io.get_filepaths(dirpath, ignore_invisible=True)))
    n_with_invisible = len(list(io.get_filepaths(dirpath, ignore_invisible=False)))
    assert n_without_invisible <= n_with_invisible
def test_get_filepaths_match_regex(self):
    """Exactly one ``.py`` file in TESTS_DIR should match the regex "_io"."""
    # NOTE(review): an earlier method in this view has the same name; if both
    # live in the same class, this definition shadows the earlier one.
    assert (
        len(list(io.get_filepaths(TESTS_DIR, match_regex="_io", extension=".py")))
        == 1
    )