def test_hash3(self):
    """A source and the sources piped from it must have pairwise distinct hashes."""
    base = from_items(1, 2, 3, 4, 5)
    via_split = base | split()
    via_select = base | select("key")

    # Compare every pair once, in the same order as three explicit assertions.
    pairs = (
        (base, via_split),
        (base, via_select),
        (via_split, via_select),
    )
    for left, right in pairs:
        self.assertNotEqual(left.hash, right.hash)
def test_lines_splits(self):
    """Exercise `lines`, `split` and a per-column converter tuple on sample_text.txt.

    The sample file sits next to this test module; the assertions expect it
    to hold four lines, the first of which is "hello 10".
    """
    sample_path = Path(__file__).parent / "sample_text.txt"

    direct = lines(sample_path)
    piped = from_items(sample_path) | lines
    piped_flat = from_items(sample_path) | lines | flatten

    self.assertEqual(4, len(direct))
    self.assertEqual("hello 10", direct[0])
    # Piping a single path through `lines` gives one nested element holding
    # that file's lines; adding `flatten` gives the lines themselves.
    self.assertEqual(list(direct), list(piped[0]))
    self.assertEqual(list(direct), list(piped_flat))

    rows = direct | split()
    self.assertEqual(4, len(rows))
    self.assertEqual(["hello", "10"], rows[0])

    # After piping through (None, int) the first row compares equal to
    # ("hello", 10): the second column has become an int.
    rows |= (None, int)
    self.assertEqual(4, len(rows))
    self.assertEqual(("hello", 10), rows[0])
# Zip the two sources with `*`; the result must be as long as `ja`.
zipped = ja * en
assert len(zipped) == len(ja)
for pair in zipped:
    # Every zipped element is a 2-tuple of strings.
    assert isinstance(pair, tuple)
    assert len(pair) == 2
    ja_text, en_text = pair
    assert isinstance(ja_text, str)
    assert isinstance(en_text, str)
    break  # only the first pair is inspected

# NOTE(review): `ja >> v` presumably feeds the source into the vocabulary
# builder before its numericalizer is used -- confirm against VocabBuilder.
v = VocabBuilder("ja")
ja >> v
ja |= split() | v.numericalizer
en |= split()

dataset = (ja * en) | to_dict("ja", "en")
for record in dataset:
    assert isinstance(record, dict)
    assert "ja" in record
    ja_field = record["ja"]
    assert isinstance(ja_field, list)
    assert isinstance(ja_field[0], int), "converted to word index(numericalize)"
    en_field = record["en"]
    assert isinstance(en_field, list)
    assert isinstance(en_field[0], str), "disable numericalize"

# Rebuild `ja` / `en` from columns 3 and 4 of the "|||"-delimited file.
special_delimiter_text = lines("data/special_delimiter.txt") | split("|||")
ja = special_delimiter_text | select(3)
en = special_delimiter_text | select(4)
# Zip the two sources with `*`; the result must be as long as `ja`.
zipped = ja * en
assert len(zipped) == len(ja)
for pair in zipped:
    # Every zipped element is a 2-tuple of strings.
    assert isinstance(pair, tuple)
    assert len(pair) == 2
    ja_text, en_text = pair
    assert isinstance(ja_text, str)
    assert isinstance(en_text, str)
    break  # only the first pair is inspected

# Turn each zipped tuple into a {"ja": ..., "en": ...} dict.
dataset = (ja * en) | mapped(lambda t: {"ja": t[0], "en": t[1]})
for record in dataset:
    assert isinstance(record, dict)
    assert "ja" in record
    assert isinstance(record["ja"], str)
    assert isinstance(record["en"], str)

# Split each line on "|||" and pick out a single column.
special_delimiter_text = lines("data/special_delimiter.txt") | split("|||")
for column_value in special_delimiter_text | select(3):
    assert isinstance(column_value, str)
    break

dataset = special_delimiter_text | select(3)
# Walk the whole source once, then start a second pass and inspect its
# first element.
for ja_column in dataset:
    pass
for ja_column in dataset:
    assert isinstance(ja_column, str)
    assert "現在" in ja_column
    break
import pathlib
from flowder.pipes import split, select
from flowder.source.base import mapped
from flowder.utils import lines

# The source built by `lines` has a length and yields plain `str` items.
ls = lines("data/kftt.ja")
assert len(ls) == 10, "there should be 10 lines"
for line in ls:
    assert isinstance(line, str), "Source iterate the raw values"
    break  # only the first item is inspected

# Piping through `mapped` yields the function's result for each item.
for length in ls | mapped(lambda x: len(x)):
    assert isinstance(length, int), "Source iterate the raw values"
    break

# Piping through `split()` yields a list of strings per item.
for tokens in ls | split():
    assert isinstance(tokens, list)
    assert isinstance(tokens[0], str)
    break

# Split on a custom delimiter, then pick a single column with `select`.
delimiter = "|||"
special_delimiter_text = lines("data/special_delimiter.txt") | split(delimiter)
for column_value in special_delimiter_text | select(3):
    assert isinstance(column_value, str)
    break