Example #1
def test_word_tokenizer():
    tokenizer = WordTokenizer()

    dummy = "justo. Praesent luctus."
    assert tokenizer(dummy) == ["justo.", "Praesent", "luctus."]
    dummy = ""
    assert tokenizer(dummy) == []
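For intuition, the expected outputs above are exactly what plain whitespace splitting produces, with punctuation left attached to the tokens (a standalone check, not a claim about WordTokenizer's implementation):

assert "justo. Praesent luctus.".split() == ["justo.", "Praesent", "luctus."]
assert "".split() == []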
Example #2
def test_word_tokenizer():
    tokenizer = WordTokenizer()

    dummy = "justo. Praesent luctus."
    assert tokenizer(dummy) == ['justo.', 'Praesent', 'luctus.']
    dummy = ""
    assert tokenizer(dummy) == []
Example #3
    def __init__(
            self,  # nosec
            tokenizer: Optional[Tokenizer] = None,
            lower: bool = False,
            pad_token: Optional[str] = '<pad>',
            unk_token: Optional[str] = '<unk>',
            sos_token: Optional[str] = None,
            eos_token: Optional[str] = None,
            embeddings: Optional[str] = None,
            embeddings_format: str = 'glove',
            embeddings_binary: bool = False,
            unk_init_all: bool = False,
            drop_unknown: bool = False) -> None:
        """Initialize the TextField.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use, by default WordTokenizer()
        lower : bool, optional
            If given, lowercase the input, by default False
        pad_token : str, optional
            Reserved padding token. Note that this object does not
            perform padding. Padding is done on the fly, when sampling.
            (defaults to '<pad>')
        unk_token : str, optional
            The token to use for out of vocabulary tokens
            (defaults to '<unk>')
        sos_token : str, optional
            Start of sentence token to add to the start of
            each sequence (defaults to None)
        eos_token : str, optional
            End of sentence token to add to the end of each
            sequence (defaults to None)
        embeddings : Optional[str], optional
            Path to pretrained embeddings, by default None
        embeddings_format : str, optional
            The format of the input embeddings, should be one of:
            'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
            be used to download embeddings hosted on gensim on the fly.
            See https://github.com/RaRe-Technologies/gensim-data
            for the list of available embedding aliases.
        embeddings_binary : bool, optional
            Whether the input embeddings are provided in binary format,
            by default False
        unk_init_all : bool, optional
            If True, every token not provided in the input embeddings is
            given a random embedding from a normal distribution.
            Otherwise, all of them map to the '<unk>' token.
        drop_unknown : bool, optional
            Whether to drop tokens that don't have associated
            embeddings. Defaults to False.
            Important: this flag only takes effect when embeddings are used.

        """
        self.tokenizer = tokenizer or WordTokenizer()
        self.lower = lower

        self.pad = pad_token
        self.unk = unk_token
        self.sos = sos_token
        self.eos = eos_token

        self.embeddings = embeddings
        self.embeddings_format = embeddings_format
        self.embeddings_binary = embeddings_binary
        self.embedding_matrix: Optional[torch.Tensor] = None
        self.unk_init_all = unk_init_all
        self.drop_unknown = drop_unknown

        self.unk_numericals: Set[int] = set()

        self.vocab: Dict = odict()
        specials = [pad_token, unk_token, sos_token, eos_token]
        self.specials = [
            special for special in specials if special is not None
        ]

        # Assign consecutive indices (0, 1, 2, ...) to the special tokens
        index = -1
        for token in self.specials:
            self.vocab[token] = index = index + 1

        self.register_attrs('vocab')
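As a quick illustration of the vocabulary bootstrap above (a standalone sketch using collections.OrderedDict in place of the snippet's odict alias): when all four special tokens are provided, they occupy the first vocabulary indices in order.

from collections import OrderedDict

specials = ['<pad>', '<unk>', '<sos>', '<eos>']
vocab = OrderedDict((token, index) for index, token in enumerate(specials))
assert vocab == OrderedDict([('<pad>', 0), ('<unk>', 1), ('<sos>', 2), ('<eos>', 3)])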
Example #4
def test_ngram_tokenizer_equivalence():
    t1 = NGramsTokenizer(1)
    t2 = WordTokenizer()

    # 'example' is assumed to be a sample text string defined elsewhere in the test module
    assert t1(example) == t2(example)
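For intuition, a standalone illustration of the equivalence this test asserts, independent of the library: extracting 1-grams from a whitespace tokenization leaves the word tokens unchanged.

words = "Praesent luctus justo".split()
unigrams = [" ".join(words[i:i + 1]) for i in range(len(words))]
assert unigrams == words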
Example #5
    def __init__(
            self,  # nosec
            tokenizer: Optional[Tokenizer] = None,
            lower: bool = False,
            pad_token: Optional[str] = '<pad>',
            unk_token: str = '<unk>',
            sos_token: Optional[str] = None,
            eos_token: Optional[str] = None,
            embeddings_info: Optional[EmbeddingsInformation] = None,
            embeddings: Optional[str] = None,
            embeddings_format: str = 'glove',
            embeddings_binary: bool = False,
            unk_init_all: bool = False,
            drop_unknown: bool = False,
            max_seq_len: Optional[int] = None,
            truncate_end: bool = False,
            setup_all_embeddings: bool = False) -> None:
        """Initialize the TextField.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use, by default WordTokenizer()
        lower : bool, optional
            If given, lowercase the input, by default False
        pad_token : str, optional
            Reserved padding token. Note that this object does not
            perform padding. Padding is done on the fly, when sampling.
            (defaults to '<pad>')
        unk_token : str, optional
            The token to use for out of vocabulary tokens
            (defaults to '<unk>')
        sos_token : str, optional
            Start of sentence token to add to the start of
            each sequence (defaults to None)
        eos_token : str, optional
            End of sentence token to add to the end of each
            sequence (defaults to None)
        embeddings_info : EmbeddingsInformation, optional
            The embeddings information. By default None
        embeddings : str, optional
            WILL BE DEPRECATED SOON. USE 'from_embeddings'
            FACTORY INSTEAD.
            Path to pretrained embeddings, or the embedding name
            when the format is 'gensim'.
        embeddings_format : str, optional
            WILL BE DEPRECATED SOON. USE 'from_embeddings'
            FACTORY INSTEAD.
            The format of the input embeddings, should be one of:
            'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
            be used to download embeddings hosted on gensim on the fly.
            See https://github.com/RaRe-Technologies/gensim-data
            for the list of available embedding aliases.
        embeddings_binary : bool, optional
            WILL BE DEPRECATED SOON. USE 'from_embeddings'
            FACTORY INSTEAD.
            Whether the input embeddings are provided in binary format,
            by default False
        unk_init_all : bool, optional
            If True, every token not provided in the input embeddings is
            given a random embedding from a normal distribution.
            Otherwise, all of them map to the '<unk>' token.
        drop_unknown : bool, optional
            WILL BE DEPRECATED SOON. USE 'from_embeddings'
            FACTORY INSTEAD.
            Whether to drop tokens that don't have associated
            embeddings. Defaults to False.
            Important: this flag only takes effect when embeddings are used.
        max_seq_len : int, optional
            The maximum length that process may output.
            If the input contains more tokens than this number,
            the output is truncated as a post-processing step.
        truncate_end : bool, optional
            Determines which window of the text process keeps when the
            input is longer than max_seq_len. If True, the window
            starts from the end of the utterance.
            Defaults to False.

            Example: max_seq_len=3, input_text=1 2 3 4 5
            truncate_end=False: output=1 2 3
            truncate_end=True: output=3 4 5
        setup_all_embeddings : bool, optional
            WILL BE DEPRECATED SOON. USE 'from_embeddings'
            FACTORY INSTEAD.
            Controls if all words from the optional provided
            embeddings will be added to the vocabulary and to the
            embedding matrix. Defaults to False.

        """
        if embeddings:
            if embeddings_info:
                raise ValueError(
                    "Cannot submit embeddings information and use the embeddings parameters"
                    +
                    "simultaneously. Use the 'from_embeddings' factory instead."
                )

            warnings.warn(
                "The embeddings-exclusive parameters " +
                "('embeddings', 'embeddings_format', 'embeddings_binary', " +
                "'setup_all_embeddings', 'drop_unknown', 'unk_init_all') " +
                "will be deprecated in a future release. " +
                "Please migrate to use the 'from_embeddings' factory.")

            embeddings_info = EmbeddingsInformation(
                embeddings=embeddings,
                embeddings_format=embeddings_format,
                embeddings_binary=embeddings_binary,
                build_vocab_from_embeddings=setup_all_embeddings,
                unk_init_all=unk_init_all,
                drop_unknown=drop_unknown)

        self.tokenizer = tokenizer or WordTokenizer()
        self.lower = lower

        self.pad = pad_token
        self.unk = unk_token
        self.sos = sos_token
        self.eos = eos_token

        self.embeddings_info = embeddings_info

        self.embedding_matrix: Optional[torch.Tensor] = None

        self.max_seq_len = max_seq_len
        self.truncate_end = truncate_end

        self.unk_numericals: Set[int] = set()

        self.vocab: Dict = odict()
        specials = [pad_token, unk_token, sos_token, eos_token]
        self.specials = [
            special for special in specials if special is not None
        ]

        self.register_attrs('vocab')
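The max_seq_len / truncate_end semantics described in the docstring can be pictured with a small standalone sketch; truncate is a hypothetical helper that illustrates the documented behaviour, not the library's actual process implementation.

from typing import List, Optional

def truncate(tokens: List[str], max_seq_len: Optional[int], truncate_end: bool) -> List[str]:
    # No limit set, or the input is already short enough: return it unchanged
    if max_seq_len is None or len(tokens) <= max_seq_len:
        return tokens
    # truncate_end=True keeps the last max_seq_len tokens, otherwise the first ones
    return tokens[-max_seq_len:] if truncate_end else tokens[:max_seq_len]

tokens = "1 2 3 4 5".split()
assert truncate(tokens, 3, truncate_end=False) == ["1", "2", "3"]
assert truncate(tokens, 3, truncate_end=True) == ["3", "4", "5"]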
Example #6
    def __init__(
            self,  # nosec
            tokenizer: Optional[Tokenizer] = None,
            lower: bool = False,
            pad_token: Optional[str] = '<pad>',
            unk_token: str = '<unk>',
            sos_token: Optional[str] = None,
            eos_token: Optional[str] = None,
            embeddings: Optional[str] = None,
            embeddings_format: str = 'glove',
            embeddings_binary: bool = False,
            model: Optional[KeyedVectors] = None,
            unk_init_all: bool = False,
            drop_unknown: bool = False,
            max_seq_len: Optional[int] = None,
            truncate_end: bool = False,
            setup_all_embeddings: bool = False) -> None:
        """Initialize the TextField.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use, by default WordTokenizer()
        lower : bool, optional
            If given, lowercase the input, by default False
        pad_token : str, optional
            Reserved padding token. Note that this object does not
            perform padding. Padding is done on the fly, when sampling.
            (defaults to '<pad>')
        unk_token : str, optional
            The token to use for out of vocabulary tokens
            (defaults to '<unk>')
        sos_token : str, optional
            Start of sentence token to add to the start of
            each sequence (defaults to None)
        eos_token : str, optional
            End of sentence token to add to the end of each
            sequence (defaults to None)
        model : KeyedVectors, optional
            The embeddings model used for retrieving text embeddings,
            by default None
        unk_init_all : bool, optional
            If True, every token not provided in the input embeddings is
            given a random embedding from a normal distribution.
            Otherwise, all of them map to the '<unk>' token.
        drop_unknown : bool, optional
            Whether to drop tokens that don't have associated
            embeddings. Defaults to False.
            Important: this flag only takes effect when embeddings are used.
        max_seq_len : int, optional
            The maximum length that process may output.
            If the input contains more tokens than this number,
            the output is truncated as a post-processing step.
        truncate_end : bool, optional
            Determines which window of the text process keeps when the
            input is longer than max_seq_len. If True, the window
            starts from the end of the utterance.
            Defaults to False.

            Example: max_seq_len=3, input_text=1 2 3 4 5
            truncate_end=False: output=1 2 3
            truncate_end=True: output=3 4 5
        setup_all_embeddings : bool, optional
            Controls if all words from the optional provided
            embeddings will be added to the vocabulary and to the
            embedding matrix. Defaults to False.

        """
        if embeddings:
            if model:
                raise ValueError(
                    "Cannot submit a model and use the embeddings parameters" +
                    "simultaneously. Use the 'from_embeddings' factory instead."
                )

            warnings.warn(
                "The embeddings-exclusive parameters " +
                "('embeddings', 'embeddings_format', 'embeddings_binary', " +
                "'setup_all_embeddings', 'drop_unknown', 'unk_init_all') will be "
                + "deprecated in a future release. " +
                "Please migrate to use the 'from_embeddings' factory.")

            model = get_embeddings(embeddings, embeddings_format,
                                   embeddings_binary)

        if setup_all_embeddings and not model:
            raise ValueError(
                "'setup_all_embeddings' cannot be enabled without passing embeddings."
            )

        self.tokenizer = tokenizer or WordTokenizer()
        self.lower = lower

        self.pad = pad_token
        self.unk = unk_token
        self.sos = sos_token
        self.eos = eos_token

        self.model = model
        self.embedding_matrix: Optional[torch.Tensor] = None
        self.unk_init_all = unk_init_all
        self.drop_unknown = drop_unknown
        self.setup_all_embeddings = setup_all_embeddings
        self.max_seq_len = max_seq_len
        self.truncate_end = truncate_end

        self.unk_numericals: Set[int] = set()

        self.vocab: Dict = odict()
        specials = [pad_token, unk_token, sos_token, eos_token]
        self.specials = [
            special for special in specials if special is not None
        ]

        self.register_attrs('vocab')
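The unk_init_all and drop_unknown options documented above can likewise be sketched in isolation. build_embedding_matrix and its arguments are hypothetical illustration names, not the class's actual API; 'pretrained' is assumed to map tokens to torch tensors.

import torch

def build_embedding_matrix(vocab, pretrained, dim, unk_init_all, drop_unknown):
    # 'pretrained' maps token -> torch.Tensor of shape (dim,)
    rows, kept = [], []
    unk_vector = torch.randn(dim)  # single shared vector for unknown tokens
    for token in vocab:
        if token in pretrained:
            rows.append(pretrained[token])
            kept.append(token)
        elif drop_unknown:
            # drop_unknown=True: skip tokens with no pretrained embedding
            continue
        else:
            # unk_init_all=True: each unknown token gets its own random vector drawn
            # from a normal distribution; otherwise all unknown tokens share the
            # single '<unk>' vector.
            rows.append(torch.randn(dim) if unk_init_all else unk_vector)
            kept.append(token)
    return kept, torch.stack(rows)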