    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 10,
    ):

        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(tokenizer="vanilla"),
            "char_tokens": CharacterTokenizer(),
        }
        self.namespace_vocab_options = namespace_vocab_options or {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            }
        }
        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }
        self.namespace_numericalizer_map["seq_label"] = Numericalizer()

        self.batch_size = batch_size

        self.train_dataset = SeqLabellingDataset(
            filename=self.train_filename, tokenizers=self.tokenizers
        )

        self.dev_dataset = SeqLabellingDataset(
            filename=self.dev_filename, tokenizers=self.tokenizers
        )

        self.test_dataset = SeqLabellingDataset(
            filename=self.test_filename, tokenizers=self.tokenizers
        )

        super(SeqLabellingDatasetManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
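A minimal usage sketch, assuming the class above is in scope; the file paths and batch size are hypothetical:

manager = SeqLabellingDatasetManager(
    train_filename="data/seq_label.train",  # hypothetical path
    dev_filename="data/seq_label.dev",      # hypothetical path
    test_filename="data/seq_label.test",    # hypothetical path
    batch_size=32,
)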
def get_numericalized_instances(get_preprocessed_instances):
    instances, labels = get_preprocessed_instances
    MAX_NUM_WORDS = 3000
    MAX_LENGTH = 15
    vocab = Vocab(instances=instances, max_num_tokens=MAX_NUM_WORDS)
    vocab.build_vocab()
    numericalizer = Numericalizer(vocabulary=vocab)
    numericalized_instances = numericalizer.numericalize_batch_instances(
        instances[:32])
    return {
        "numericalized_instances": numericalized_instances,
        "labels": labels,
        "max_length": MAX_LENGTH,
        "max_num_words": MAX_NUM_WORDS,
        "vocab": vocab,
    }
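A hedged sketch of a test consuming this fixture, assuming it is registered with pytest; it relies only on the dictionary keys returned above:

def test_numericalized_instances(get_numericalized_instances):
    setup = get_numericalized_instances
    numericalized = setup["numericalized_instances"]
    # Each numericalized instance is a sequence of integer token ids.
    for instance in numericalized:
        assert all(isinstance(token_id, int) for token_id in instance)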
Example #4
def single_instance_setup(instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100

    vocabulary = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)

    numericalizer = Numericalizer(vocabulary=vocabulary)

    return single_instance, numericalizer, vocabulary
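A sketch of consuming this fixture; build_vocab() and numericalize_batch_instances() are the calls used in the previous example, the rest is assumed:

def test_single_instance_numericalization(single_instance_setup):
    single_instance, numericalizer, vocabulary = single_instance_setup
    vocabulary.build_vocab()  # build the vocab first, as in the fixture above
    ids = numericalizer.numericalize_batch_instances(single_instance)
    # One numericalized sequence per input instance.
    assert len(ids) == len(single_instance)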
Example #5
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 10,
        column_names: List[str] = None,
        train_only: Optional[str] = None,
    ):

        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(tokenizer="vanilla"),
            "char_tokens": CharacterTokenizer(),
        }

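        # Merge user-passed vocab options over the per-namespace defaults
        # defined below; namespaces from either mapping are kept.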
        if namespace_vocab_options is None:
            namespace_vocab_options = {}

        namespace_vocab_options_defaults = {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            }
        }
        self.namespace_vocab_options = {}

        vocab_namespaces = set(namespace_vocab_options.keys()).union(
            namespace_vocab_options_defaults.keys())

        for namespace in vocab_namespaces:
            user_passed = namespace_vocab_options.get(namespace, {})
            defaults = namespace_vocab_options_defaults.get(namespace, {})
            self.namespace_vocab_options[namespace] = {
                **defaults,
                **user_passed
            }

        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }

        self.batch_size = batch_size

        if column_names is None:
            column_names = ["label_1"]

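        # Only the first column name is mapped to a numericalizer here;
        # all column names are still passed through to the datasets below.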
        valid_column_names = [column_names[0]]

        for column_name in valid_column_names:
            self.namespace_numericalizer_map[column_name] = Numericalizer()

        self.train_dataset = BioNerDataset(
            filename=self.train_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
            train_only=train_only,
        )

        self.dev_dataset = BioNerDataset(
            filename=self.dev_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
            train_only=train_only,
        )

        self.test_dataset = BioNerDataset(
            filename=self.test_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
            train_only=train_only,
        )

        super(BioNERDatasetManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
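A usage sketch with hypothetical paths; the files are expected in the column format that BioNerDataset reads:

manager = BioNERDatasetManager(
    train_filename="data/bio_ner.train",  # hypothetical path
    dev_filename="data/bio_ner.dev",      # hypothetical path
    test_filename="data/bio_ner.test",    # hypothetical path
    column_names=["label_1"],             # matches the default above
)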
Example #6
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 10,
        column_names: List[str] = None,
    ):
        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(tokenizer="vanilla"),
            "char_tokens": CharacterTokenizer(),
        }

        namespace_vocab_options_defaults = {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            }
        }

        if namespace_vocab_options is None:
            namespace_vocab_options = {}

        # Merge user-passed options over the defaults. Taking the union of
        # namespaces keeps options that the caller passes for namespaces
        # outside the defaults, which iterating over the defaults alone
        # would silently drop.
        self.namespace_vocab_options = {}
        vocab_namespaces = set(namespace_vocab_options.keys()).union(
            namespace_vocab_options_defaults.keys())

        for namespace in vocab_namespaces:
            defaults = namespace_vocab_options_defaults.get(namespace, {})
            user_passed = namespace_vocab_options.get(namespace, {})
            self.namespace_vocab_options[namespace] = {
                **defaults,
                **user_passed,
            }

        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }

        self.batch_size = batch_size

        if column_names is None:
            column_names = ["NER"]

        for column_name in column_names:
            self.namespace_numericalizer_map[column_name] = Numericalizer()

        self.train_dataset = ConllYagoDataset(
            filename=self.train_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
        )

        self.dev_dataset = ConllYagoDataset(
            filename=self.dev_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
        )

        self.test_dataset = ConllYagoDataset(
            filename=self.test_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
        )

        super(ConllYagoDatasetsManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
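A sketch showing the option merge in action; the paths are hypothetical, and include_special_vocab is an option that also appears in the classification example below:

manager = ConllYagoDatasetsManager(
    train_filename="data/conll_yago.train",  # hypothetical path
    dev_filename="data/conll_yago.dev",      # hypothetical path
    test_filename="data/conll_yago.test",    # hypothetical path
    # Merged over the char_tokens defaults defined in __init__.
    namespace_vocab_options={"NER": {"include_special_vocab": False}},
)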
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 10,
    ):
        """

        Parameters
        ----------
        train_filename: str
            The path where the train file is stored
        dev_filename: str
            The path where the dev file is stored
        test_filename: str
            The path where the test file is stored
        tokenizers: Dict[str, BaseTokenizer]
            A mapping from namespace to the tokenizer
        namespace_vocab_options: Dict[str, Dict[str, Any]]
            A mapping from the namespace to vocabulary options
        namespace_numericalizer_map: Dict[str, BaseNumericalizer]
            Every namespace can have a different numericalizer specified
        batch_size: int
            The batch size of the data returned
        """
        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(),
            "char_tokens": CharacterTokenizer(),
        }
        self.namespace_vocab_options = namespace_vocab_options or {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            },
            "label": {
                "include_special_vocab": False
            },
        }
        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }
        self.namespace_numericalizer_map["label"] = Numericalizer()
        self.batch_size = batch_size

        self.train_dataset = TextClassificationDataset(
            filename=self.train_filename, tokenizers=self.tokenizers)
        self.dev_dataset = TextClassificationDataset(
            filename=self.dev_filename, tokenizers=self.tokenizers)
        self.test_dataset = TextClassificationDataset(
            filename=self.test_filename, tokenizers=self.tokenizers)

        super(TextClassificationDatasetManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
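A minimal construction sketch; the paths are hypothetical and the keyword arguments follow the docstring above:

manager = TextClassificationDatasetManager(
    train_filename="data/classification.train",  # hypothetical path
    dev_filename="data/classification.dev",      # hypothetical path
    test_filename="data/classification.test",    # hypothetical path
    batch_size=64,
)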