Example #1
    def test_serialization(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]

        field_pickle_filename = "char_field.pl"
        field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
        torch.save(field, field_pickle_path)

        loaded_field = torch.load(field_pickle_path)
        assert loaded_field == field

        original_numericalization = field.numericalize(examples_data)
        pickled_numericalization = loaded_field.numericalize(examples_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
Example #2
from typing import Any, Dict, Iterable, Optional, Tuple

from torchtext import data as textdata


def gen_dataset(
    self,
    data: Iterable[Dict[str, Any]],
    include_label_fields: bool = True,
    shard_range: Optional[Tuple[int, int]] = None,
) -> textdata.Dataset:
    """
    Generate a torchtext Dataset from raw in-memory data.

    Returns:
        dataset (torchtext.data.Dataset)
    """
    to_process = {}
    to_process.update(self.features)
    to_process.update(self.extra_fields)
    if include_label_fields:
        to_process.update(self.labels)
    else:
        # `Target` is assumed to be the constants class (e.g. from pytext)
        # that names the label field.
        to_process.pop(Target.TARGET_LABEL_FIELD, None)
    fields = {name: (name, field) for name, field in to_process.items()}
    # Build one Example per preprocessed row, optionally keeping only rows
    # whose index falls inside `shard_range` (inclusive on both ends).
    examples = [
        textdata.Example.fromdict(row, fields)
        for idx, row in enumerate(self.preprocess(data))
        if not shard_range or shard_range[0] <= idx <= shard_range[1]
    ]
    return textdata.Dataset(examples, to_process)
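
A self-contained sketch of how gen_dataset might be driven, assuming a minimal
host object whose preprocess() is a pass-through (ToyTensorizer and the field
names below are illustrative, not from the original source):

from torchtext import data as textdata

class ToyTensorizer:
    def __init__(self):
        self.features = {"text": textdata.Field(sequential=True)}
        self.labels = {"label": textdata.Field(sequential=False)}
        self.extra_fields = {}

    def preprocess(self, rows):
        # The real implementation may tokenize or normalize here.
        return rows

rows = [{"text": "hello world", "label": "pos"},
        {"text": "goodbye", "label": "neg"}]
# Call the module-level function above directly; shard_range=(0, 0) keeps row 0.
dataset = gen_dataset(ToyTensorizer(), rows, shard_range=(0, 0))
print(len(dataset.examples))  # 1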
Example #3

from itertools import chain
from torchtext import data
from tqdm.contrib import tmap  # assumed source of `tmap` (progress-bar map)

def make_dataset(self, files, dset):
    # Flatten the raw dicts of every requested file and wrap each one in an
    # Example, with a progress bar refreshed at most every 0.5 s.
    dataset = data.Dataset(
        list(
            tmap(lambda d: data.Example.fromdict(d, self.example_fields),
                 chain(*map(lambda f: dset[f], files)),
                 mininterval=0.5)), self.dataset_fields)
    # Free the consumed raw data.
    for file in files:
        del dset[file]
    return dataset
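
For a dependency-free variant of the same pattern (no progress bar), the core
expression can be unrolled as below; the function and field-map parameters are
illustrative:

from itertools import chain
from torchtext import data

def make_dataset_plain(files, dset, example_fields, dataset_fields):
    # Flatten the per-file lists of raw dicts, then build one Example per dict.
    raw_rows = chain.from_iterable(dset[f] for f in files)
    examples = [data.Example.fromdict(d, example_fields) for d in raw_rows]
    # Drop the consumed raw data so it can be garbage-collected.
    for f in files:
        del dset[f]
    return data.Dataset(examples, dataset_fields)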
Example #4
def sentencelist2iterator(self, sentences):
    # Wrap each raw sentence in an Example, then expose a batch-size-1
    # iterator over the dataset, sorted by source length.
    examples = [self.sent2example(sentence) for sentence in sentences]
    dataset = data.Dataset(examples,
                           fields=[('src', self.SRC), ('rsrc', self.rSRC)])
    self.iterator = data.Iterator(dataset,
                                  batch_size=1,
                                  sort_key=lambda x: len(x.src),
                                  sort=True,
                                  sort_within_batch=True,
                                  device=self.device)
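
A standalone sketch of the same pattern with the fields made explicit (the
reversed-source reading of rsrc is an assumption, and the device handling
follows the legacy torchtext API):

from torchtext import data

SRC = data.Field(tokenize=str.split)
rSRC = data.Field(tokenize=lambda s: s.split()[::-1])  # reversed source tokens
fields = [('src', SRC), ('rsrc', rSRC)]

sentences = ["john loves mary", "mary cries"]
examples = [data.Example.fromlist([s, s], fields) for s in sentences]
dataset = data.Dataset(examples, fields)
SRC.build_vocab(dataset)
rSRC.build_vocab(dataset)

iterator = data.Iterator(dataset,
                         batch_size=1,
                         sort_key=lambda x: len(x.src),
                         sort=True,
                         sort_within_batch=True,
                         device="cpu")  # older versions use device=-1 for CPU
for batch in iterator:
    print(batch.src.shape)  # (seq_len, 1): sequence-major by default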
Example #5
def filter_init(ex_val1, ex_val2, ex_val3):
    text_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    fields = [("text1", text_field), ("text2", text_field),
              ("label", label_field)]

    example1 = data.Example.fromlist(ex_val1, fields)
    example2 = data.Example.fromlist(ex_val2, fields)
    example3 = data.Example.fromlist(ex_val3, fields)
    examples = [example1, example2, example3]

    dataset = data.Dataset(examples, fields)
    text_field.build_vocab(dataset)

    return dataset, text_field
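
This helper looks like a fixture for exercising Dataset.filter_examples; a
sketch of one way to use it (the example values are illustrative):

dataset, text_field = filter_init(
    ["a b c", "d e", "pos"],
    ["a b", "c d e f", "neg"],
    ["a", "b c", "pos"],
)
# Drop every token of text1/text2 that is missing from text_field's vocab;
# since the vocab was built on the whole dataset, nothing changes here unless
# entries are first removed from text_field.vocab.stoi.
dataset.filter_examples(["text1", "text2"])
print(len(dataset.examples))  # still 3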
Example #6
def read_data(corpus_file, datafields):
    # Reads a column-formatted corpus: one token per line, with the word in
    # column 1 and its label in column 2; a line containing only '*' ends a
    # sentence (an assumption based on this corpus's format).
    with open(corpus_file, encoding='utf-8') as f:
        examples = []
        words = []
        labels = []
        for line in f:
            line = line.strip()
            if line == '*':
                examples.append(data.Example.fromlist([words, labels], datafields))
                words = []
                labels = []
            else:
                columns = line.split()
                words.append(columns[1])
                labels.append(columns[2])
        return data.Dataset(examples, datafields)
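
A sketch of the setup this reader expects; the field names, special tokens, and
file name are illustrative:

from torchtext import data

WORD = data.Field(init_token='<bos>', eos_token='<eos>')
LABEL = data.Field(init_token='<bos>', eos_token='<eos>', unk_token=None)
datafields = [('word', WORD), ('label', LABEL)]

train_data = read_data('train.txt', datafields)  # hypothetical file name
WORD.build_vocab(train_data)
LABEL.build_vocab(train_data)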
Example #7
    def test_build_vocab_from_dataset(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        ex1 = data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)])
        ex2 = data.Example.fromlist(["bbb aaa"], [("chars", CHARS)])
        dataset = data.Dataset([ex1, ex2], [("chars", CHARS)])

        CHARS.build_vocab(dataset, min_freq=2)

        expected = "a b <w> </w> <s> </s> <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi

        expected_freqs = Counter({"a": 6, "b": 6, "c": 1})
        assert CHARS.vocab.freqs == CHARS.nesting_field.vocab.freqs == expected_freqs
Example #8
def split(table,
          path,
          train_prefix,
          validation_prefix,
          test_prefix,
          split_ratio=[0.6, 0.2, 0.2],
          stratified=False,
          strata_field='label'):
    """Split a pandas dataframe or CSV file into train / validation / test data sets.

    Args:
        table (pandas.DataFrame or string): The pandas dataframe or CSV file to split.
        path (string): The directory to save the train, validation and test CSV files to.
        train_prefix (string): File name appended to `path` for the training set.
        validation_prefix (string): File name appended to `path` for the validation set.
        test_prefix (string): File name appended to `path` for the test set.
        split_ratio (List of floats): a list of 3 numbers denoting the relative sizes of
            train, test and valid splits respectively. Default is [0.6, 0.2, 0.2].
        stratified (bool): whether the sampling should be stratified.
            Default is False.
        strata_field (str): name of the examples Field stratified over.
            Default is 'label' for the conventional label field.
    """
    assert len(split_ratio) == 3

    if not isinstance(table, pd.DataFrame):
        table = pd.read_csv(table)
    if table.index.name is not None:
        table = table.reset_index()

    examples = list(table.itertuples(index=False))
    fields = [(col, None) for col in list(table)]
    dataset = data.Dataset(examples, fields)
    train, valid, test = dataset.split(split_ratio, stratified, strata_field)

    tables = (pd.DataFrame(train.examples), pd.DataFrame(valid.examples),
              pd.DataFrame(test.examples))
    prefixes = (train_prefix, validation_prefix, test_prefix)

    for i in range(len(tables)):
        tables[i].columns = table.columns
        tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
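
A hypothetical invocation, assuming a CSV with a 'label' column (all file names
are illustrative):

split('reviews.csv', 'data',
      train_prefix='train.csv',
      validation_prefix='valid.csv',
      test_prefix='test.csv',
      split_ratio=[0.7, 0.15, 0.15],
      stratified=True)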
Example #9
    def test_numericalize(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]
        numericalized = field.numericalize(examples_data)

        assert numericalized.dim() == 3
        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data, numericalized):
            verify_numericalized_example(
                field, example, numericalized_example, batch_first=True)

        # test include_lengths
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field, include_lengths=True)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]

        numericalized, seq_len, word_len = field.numericalize(
            (examples_data, [5, 4], [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]]))

        assert numericalized.dim() == 3
        assert len(seq_len) == 2
        assert len(word_len) == 2

        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data, numericalized):
            verify_numericalized_example(
                field, example, numericalized_example, batch_first=True)