Example #1
    def initialize_dictionary(self, namespace: str, unk_num: int,
                              mode: MappingMode):
        if mode == MappingMode.token2index:
            if any(
                    namespace_match(pattern, namespace)
                    for pattern in self._non_padded_namespaces):
                dict.__setitem__(self, namespace, {})
            else:
                init_namespace_dictionary = RandomHashDict(
                    unk_num=unk_num, oov_token=self.oov_token)
                init_namespace_dictionary.update({self.padding_token: 0})
                init_namespace_dictionary.add_unk_tokens()

                dict.__setitem__(self, namespace, init_namespace_dictionary)

        elif mode == MappingMode.index2token:
            if any(
                    namespace_match(pattern, namespace)
                    for pattern in self._non_padded_namespaces):
                dict.__setitem__(self, namespace, {})
            else:
                init_namespace_dictionary = {0: self.padding_token}
                for i in range(unk_num):
                    init_namespace_dictionary[
                        len(init_namespace_dictionary
                            )] = f"@@{self.oov_token}#{str(i)}@@"

                dict.__setitem__(self, namespace, init_namespace_dictionary)
Example #2
    def from_files(cls, directory: str) -> 'Vocabulary':
        """
        Loads a ``Vocabulary`` that was serialized using ``save_to_files``.

        Parameters
        ----------
        directory : ``str``
            The directory containing the serialized vocabulary.
        """
        logger.info("Loading token dictionary from %s.", directory)
        with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'r',
                         'utf-8') as namespace_file:
            non_padded_namespaces = [
                namespace_str.strip() for namespace_str in namespace_file
            ]

        vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)

        # Check every file in the directory.
        for namespace_filename in os.listdir(directory):
            if namespace_filename == NAMESPACE_PADDING_FILE:
                continue
            namespace = namespace_filename.replace('.txt', '')
            if any(
                    namespace_match(pattern, namespace)
                    for pattern in non_padded_namespaces):
                is_padded = False
            else:
                is_padded = True
            filename = os.path.join(directory, namespace_filename)
            vocab.set_from_file(filename, is_padded, namespace=namespace)

        return vocab
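A minimal usage sketch for the loader above (the directory path is hypothetical; it assumes a vocabulary previously written with `save_to_files`):

# Sketch: load a serialized vocabulary and inspect one namespace.
vocab = Vocabulary.from_files("/path/to/serialization_dir/vocabulary")
print(vocab.get_vocab_size("tokens"))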
Example #3
 def __missing__(self, key: str):
     if any(namespace_match(pattern, key) for pattern in self._non_padded_namespaces):
         value = self._non_padded_function()
     else:
         value = self._padded_function()
     dict.__setitem__(self, key, value)
     return value
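For context, a minimal sketch of the kind of dict subclass that would carry this `__missing__` hook. The class name and constructor shape are assumptions; only the three attributes the method reads are taken from the example above:

class _NamespaceDependentDefaultDict(dict):
    """Dict whose missing namespaces are initialized differently depending on
    whether they match one of the non-padded namespace patterns."""

    def __init__(self, non_padded_namespaces, padded_function, non_padded_function):
        self._non_padded_namespaces = set(non_padded_namespaces)
        self._padded_function = padded_function          # e.g. lambda: {"@@PADDING@@": 0, "@@UNKNOWN@@": 1}
        self._non_padded_function = non_padded_function  # e.g. lambda: {}
        super().__init__()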
Example #4
    def from_files(cls, directory: str) -> 'Vocabulary':
        """
        Loads a ``Vocabulary`` that was serialized using ``save_to_files``.

        Parameters
        ----------
        directory : ``str``
            The directory containing the serialized vocabulary.
        """
        logger.info("Loading token dictionary from %s.", directory)
        with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'r', 'utf-8') as namespace_file:
            non_padded_namespaces = [namespace_str.strip() for namespace_str in namespace_file]

        vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)

        # Check every file in the directory.
        for namespace_filename in os.listdir(directory):
            if namespace_filename == NAMESPACE_PADDING_FILE:
                continue
            namespace = namespace_filename.replace('.txt', '')
            if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
                is_padded = False
            else:
                is_padded = True
            filename = os.path.join(directory, namespace_filename)
            vocab.set_from_file(filename, is_padded, namespace=namespace)

        return vocab
Example #5
    def from_files(
        cls,
        directory: str,
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Loads a `Vocabulary` that was serialized either using `save_to_files` or inside
        a model archive file.

        # Parameters

        directory : `str`
            The directory or archive file containing the serialized vocabulary.
        """
        logger.info("Loading token dictionary from %s.", directory)
        padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
        oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

        if not os.path.isdir(directory):
            base_directory = cached_path(directory, extract_archive=True)
            # For convenience we'll check for a 'vocabulary' subdirectory of the archive.
            # That way you can use model archives directly.
            vocab_subdir = os.path.join(base_directory, "vocabulary")
            if os.path.isdir(vocab_subdir):
                directory = vocab_subdir
            elif os.path.isdir(base_directory):
                directory = base_directory
            else:
                raise ConfigurationError(f"{directory} is neither a directory nor an archive")

        # We use a lock file to avoid race conditions where multiple processes
        # might be reading/writing from/to the same vocab files at once.
        with FileLock(os.path.join(directory, ".lock")):
            with codecs.open(
                os.path.join(directory, NAMESPACE_PADDING_FILE), "r", "utf-8"
            ) as namespace_file:
                non_padded_namespaces = [namespace_str.strip() for namespace_str in namespace_file]

            vocab = cls(
                non_padded_namespaces=non_padded_namespaces,
                padding_token=padding_token,
                oov_token=oov_token,
            )

            # Check every file in the directory.
            for namespace_filename in os.listdir(directory):
                if namespace_filename == NAMESPACE_PADDING_FILE:
                    continue
                if namespace_filename.startswith("."):
                    continue
                namespace = namespace_filename.replace(".txt", "")
                if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
                    is_padded = False
                else:
                    is_padded = True
                filename = os.path.join(directory, namespace_filename)
                vocab.set_from_file(filename, is_padded, namespace=namespace, oov_token=oov_token)

        return vocab
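A usage sketch for the archive-aware variant above (the archive filename is hypothetical):

# Works with a plain vocabulary directory as well as a model archive that
# contains a 'vocabulary/' subdirectory.
vocab = Vocabulary.from_files("model.tar.gz")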
Example #6
    def from_files(
        cls,
        directory: str,
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Loads a `Vocabulary` that was serialized using `save_to_files`.

        # Parameters

        directory : `str`
            The directory containing the serialized vocabulary.
        """
        logger.info("Loading token dictionary from %s.", directory)
        padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
        oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

        # We use a lock file to avoid race conditions where multiple processes
        # might be reading/writing from/to the same vocab files at once.
        with FileLock(os.path.join(directory, ".lock")):
            with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE),
                             "r", "utf-8") as namespace_file:
                non_padded_namespaces = [
                    namespace_str.strip() for namespace_str in namespace_file
                ]

            vocab = cls(
                non_padded_namespaces=non_padded_namespaces,
                padding_token=padding_token,
                oov_token=oov_token,
            )

            # Check every file in the directory.
            for namespace_filename in os.listdir(directory):
                if namespace_filename == NAMESPACE_PADDING_FILE:
                    continue
                if namespace_filename.startswith("."):
                    continue
                namespace = namespace_filename.replace(".txt", "")
                if any(
                        namespace_match(pattern, namespace)
                        for pattern in non_padded_namespaces):
                    is_padded = False
                else:
                    is_padded = True
                filename = os.path.join(directory, namespace_filename)
                vocab.set_from_file(filename,
                                    is_padded,
                                    namespace=namespace,
                                    oov_token=oov_token)

        return vocab
Example #7
def load_vocab_from_directory(directory: str,
                              padding_token: str = "[PAD]",
                              oov_token: str = "[UNK]") -> Vocabulary:
    """
    Load a pre-trained vocabulary from a directory (since the original method does not work --> OOV problem)
    
    Args:
        directory (str)
        padding_token (str): default padding token symbol ("[PAD]" in our case, since we are using BERT)
        oov_token (str): default OOV token symbol ("[UNK]" in our case, since we are using BERT)

    Returns:
        Vocabulary
    """
    NAMESPACE_PADDING_FILE = 'non_padded_namespaces.txt'

    print("Loading token dictionary from", directory)
    with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'r',
                     'utf-8') as namespace_file:
        non_padded_namespaces = [
            namespace_str.strip() for namespace_str in namespace_file
        ]

    vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)

    # Check every file in the directory.
    for namespace_filename in os.listdir(directory):
        if namespace_filename == NAMESPACE_PADDING_FILE:
            continue
        if namespace_filename.startswith("."):
            continue
        namespace = namespace_filename.replace('.txt', '')
        if any(
                namespace_match(pattern, namespace)
                for pattern in non_padded_namespaces):
            is_padded = False
        else:
            is_padded = True
        filename = os.path.join(directory, namespace_filename)
        vocab.set_from_file(filename,
                            is_padded,
                            oov_token=oov_token,
                            namespace=namespace)
        vocab._padding_token = padding_token

    return vocab
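A usage sketch under the assumptions stated in the docstring (BERT-style special tokens; the directory path is hypothetical):

vocab = load_vocab_from_directory("saved/vocabulary",
                                  padding_token="[PAD]",
                                  oov_token="[UNK]")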
Example #8
    def from_files(
        cls,
        directory: str,
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Loads a ``Vocabulary`` that was serialized using ``save_to_files``.

        Parameters
        ----------
        directory : ``str``
            The directory containing the serialized vocabulary.
        """
        logger.info("Loading token dictionary from %s.", directory)
        padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
        oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
        with codecs.open(
            os.path.join(directory, NAMESPACE_PADDING_FILE), "r", "utf-8"
        ) as namespace_file:
            non_padded_namespaces = [namespace_str.strip() for namespace_str in namespace_file]

        vocab = cls(
            non_padded_namespaces=non_padded_namespaces,
            padding_token=padding_token,
            oov_token=oov_token,
        )

        # Check every file in the directory.
        for namespace_filename in os.listdir(directory):
            if namespace_filename == NAMESPACE_PADDING_FILE:
                continue
            if namespace_filename.startswith("."):
                continue
            namespace = namespace_filename.replace(".txt", "")
            if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
                is_padded = False
            else:
                is_padded = True
            filename = os.path.join(directory, namespace_filename)
            vocab.set_from_file(filename, is_padded, namespace=namespace, oov_token=oov_token)

        return vocab
Example #9
def set_vocab_from_filename(vocab: Vocabulary, namespace_filename: str,
                            load_dir: str, non_padded_namespaces: str):
    """Set up the vocabulary from a file

    Arguments:
        vocab: The vocabulary
        namespace_filename: The file containing all the namespaces
            to be loaded
        load_dir: The directory to load the vocab from
        non_padded_namespaces: The namespaces that are not padded
            (like labels etc)
    Returns:
        ``Vocabulary``: The loaded vocabulary
    """
    namespace = namespace_filename.replace('.txt', '')
    if any(
            namespace_match(pattern, namespace)
            for pattern in non_padded_namespaces):
        is_padded = False
    else:
        is_padded = True
    filename = os.path.join(load_dir, namespace_filename)
    vocab.set_from_file(filename, is_padded, namespace=namespace)
    return vocab
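A sketch of how this helper might be driven, mirroring the directory scan in the `from_files` examples above (the skip rules and variable names are assumptions):

for namespace_filename in os.listdir(load_dir):
    # Skip the bookkeeping file and hidden files such as a lock file.
    if namespace_filename == 'non_padded_namespaces.txt' or namespace_filename.startswith('.'):
        continue
    vocab = set_vocab_from_filename(vocab, namespace_filename, load_dir, non_padded_namespaces)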
Example #10
 def test_namespace_match(self):
     assert util.namespace_match("*tags", "tags")
     assert util.namespace_match("*tags", "passage_tags")
     assert util.namespace_match("*tags", "question_tags")
     assert util.namespace_match("tokens", "tokens")
     assert not util.namespace_match("tokens", "stemmed_tokens")
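The assertions above pin down the wildcard semantics; a minimal matcher with that behaviour could look like the sketch below (not necessarily the library's exact source):

def namespace_match(pattern: str, namespace: str) -> bool:
    # A leading '*' matches any namespace that ends with the rest of the pattern;
    # otherwise the pattern must equal the namespace exactly.
    if pattern.startswith("*") and namespace.endswith(pattern[1:]):
        return True
    return pattern == namespace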
Example #11
    def _extend(self,
                counter: Dict[str, Dict[str, int]] = None,
                min_count: Dict[str, int] = None,
                max_vocab_size: Union[int, Dict[str, int]] = None,
                non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                pretrained_files: Optional[Dict[str, str]] = None,
                only_include_pretrained_words: bool = False,
                tokens_to_add: Dict[str, List[str]] = None,
                min_pretrained_embeddings: Dict[str, int] = None) -> None:
        """
        This method can be used for extending an already generated vocabulary.
        It takes the same parameters as the Vocabulary initializer. The token-to-index
        and index-to-token mappings of the calling vocabulary will be retained.
        It is an in-place operation, so None is returned.
        """
        if not isinstance(max_vocab_size, dict):
            int_max_vocab_size = max_vocab_size
            max_vocab_size = defaultdict(lambda: int_max_vocab_size)  # type: ignore
        min_count = min_count or {}
        pretrained_files = pretrained_files or {}
        min_pretrained_embeddings = min_pretrained_embeddings or {}
        non_padded_namespaces = set(non_padded_namespaces)
        counter = counter or {}
        tokens_to_add = tokens_to_add or {}

        self._retained_counter = counter
        # Make sure vocabulary extension is safe.
        current_namespaces = {*self._token_to_index}
        extension_namespaces = {*counter, *tokens_to_add}

        for namespace in current_namespaces & extension_namespaces:
            # if new namespace was already present
            # Either both should be padded or none should be.
            original_padded = not any(namespace_match(pattern, namespace)
                                      for pattern in self._non_padded_namespaces)
            extension_padded = not any(namespace_match(pattern, namespace)
                                       for pattern in non_padded_namespaces)
            if original_padded != extension_padded:
                raise ConfigurationError("Common namespace {} has conflicting ".format(namespace)+
                                         "setting of padded = True/False. "+
                                         "Hence extension cannot be done.")

        # Add new non-padded namespaces for extension
        self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
        self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
        self._non_padded_namespaces.update(non_padded_namespaces)

        for namespace in counter:
            if namespace in pretrained_files:
                pretrained_list = _read_pretrained_tokens(pretrained_files[namespace])
                min_embeddings = min_pretrained_embeddings.get(namespace, 0)
                if min_embeddings > 0:
                    tokens_old = tokens_to_add.get(namespace, [])
                    tokens_new = pretrained_list[:min_embeddings]
                    tokens_to_add[namespace] = tokens_old + tokens_new
                pretrained_set = set(pretrained_list)
            else:
                pretrained_set = None
            token_counts = list(counter[namespace].items())
            token_counts.sort(key=lambda x: x[1], reverse=True)
            try:
                max_vocab = max_vocab_size[namespace]
            except KeyError:
                max_vocab = None
            if max_vocab:
                token_counts = token_counts[:max_vocab]
            for token, count in token_counts:
                if pretrained_set is not None:
                    if only_include_pretrained_words:
                        if token in pretrained_set and count >= min_count.get(namespace, 1):
                            self.add_token_to_namespace(token, namespace)
                    elif token in pretrained_set or count >= min_count.get(namespace, 1):
                        self.add_token_to_namespace(token, namespace)
                elif count >= min_count.get(namespace, 1):
                    self.add_token_to_namespace(token, namespace)

        for namespace, tokens in tokens_to_add.items():
            for token in tokens:
                self.add_token_to_namespace(token, namespace)
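A usage sketch for extending a vocabulary from new counts (the counter values are made up; `_extend` is called directly here only for illustration, it is normally reached through higher-level helpers):

counter = {"tokens": {"hello": 3, "world": 1}}
vocab._extend(counter=counter, min_count={"tokens": 2})  # keeps "hello" (3 >= 2), drops "world" (1 < 2)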
Example #12
def debug_vocab(parameter_filename: str,
                serialization_dir: str,
                overrides: str = "",
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> None:
    """
    A debugging helper which loads the params from a file, builds the vocabulary and model in the same way as :func:`train_model`, and prints information about the vocabulary's namespaces.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    """
    # Load the experiment config from a file and pass it to ``train_model``.
    params = Params.from_file(parameter_filename, overrides)

    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    vocab = model.vocab
    vocab_namespace_dict = vocab._token_to_index
    vocab_oov_token = vocab._oov_token
    vocab_non_padded_namespaces = vocab._non_padded_namespaces  # this is a set

    vocab_tokens_dict = vocab_namespace_dict['tokens']
    vocab_labels_dict = vocab_namespace_dict['labels']

    print()
    print("Vocab's OOV token: " + vocab_oov_token)
    print("Non-padded namespaces in vocab: " +
          str(list(vocab_non_padded_namespaces)))
    print()

    print("Number of words in vocab's tokens dict: " +
          str(len(vocab_tokens_dict)))
    if any(
            namespace_match(pattern, 'tokens')
            for pattern in vocab_non_padded_namespaces):
        is_padded = False
    else:
        is_padded = True
    print("tokens will return True for is_padded: " + str(is_padded))
    print("Vocab's OOV token is in its tokens dict (should be True): " +
          str(vocab_oov_token in vocab_tokens_dict))
    print()

    print("Number of words in vocab's labels dict: " +
          str(len(vocab_labels_dict)))
    if any(
            namespace_match(pattern, 'labels')
            for pattern in vocab_non_padded_namespaces):
        is_padded = False
    else:
        is_padded = True
    print("labels will return True for is_padded: " + str(is_padded))
    print("Vocab's OOV token is in its labels dict (should be False): " +
          str(vocab_oov_token in vocab_labels_dict))
Example #13
    def _extend(self,
                counter=None,
                min_count=None,
                max_vocab_size=None,
                non_padded_namespaces=DEFAULT_NON_PADDED_NAMESPACES,
                pretrained_files=None,
                only_include_pretrained_words=False,
                tokens_to_add=None):
        u"""
        This method can be used for extending an already generated vocabulary.
        It takes the same parameters as the Vocabulary initializer. The token-to-index
        and index-to-token mappings of the calling vocabulary will be retained.
        It is an in-place operation, so None is returned.
        """
        if not isinstance(max_vocab_size, dict):
            int_max_vocab_size = max_vocab_size
            max_vocab_size = defaultdict(
                lambda: int_max_vocab_size)  # type: ignore
        min_count = min_count or {}
        pretrained_files = pretrained_files or {}
        non_padded_namespaces = set(non_padded_namespaces)
        counter = counter or {}
        tokens_to_add = tokens_to_add or {}

        self._retained_counter = counter
        # Make sure vocabulary extension is safe.
        current_namespaces = set(list(self._token_to_index))
        extension_namespaces = set(list(counter) + list(tokens_to_add))

        for namespace in current_namespaces & extension_namespaces:
            # if new namespace was already present
            # Either both should be padded or none should be.
            original_padded = not any(
                namespace_match(pattern, namespace)
                for pattern in self._non_padded_namespaces)
            extension_padded = not any(
                namespace_match(pattern, namespace)
                for pattern in non_padded_namespaces)
            if original_padded != extension_padded:
                raise ConfigurationError(
                    u"Common namespace {} has conflicting ".format(namespace) +
                    u"setting of padded = True/False. " +
                    u"Hence extension cannot be done.")

        # Add new non-padded namespaces for extension
        self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
        self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
        self._non_padded_namespaces.update(non_padded_namespaces)

        for namespace in counter:
            if namespace in pretrained_files:
                pretrained_list = _read_pretrained_tokens(
                    pretrained_files[namespace])
            else:
                pretrained_list = None
            token_counts = list(counter[namespace].items())
            token_counts.sort(key=lambda x: x[1], reverse=True)
            max_vocab = max_vocab_size[namespace]
            if max_vocab:
                token_counts = token_counts[:max_vocab]
            for token, count in token_counts:
                if pretrained_list is not None:
                    if only_include_pretrained_words:
                        if token in pretrained_list and count >= min_count.get(
                                namespace, 1):
                            self.add_token_to_namespace(token, namespace)
                    elif token in pretrained_list or count >= min_count.get(
                            namespace, 1):
                        self.add_token_to_namespace(token, namespace)
                elif count >= min_count.get(namespace, 1):
                    self.add_token_to_namespace(token, namespace)

        for namespace, tokens in list(tokens_to_add.items()):
            for token in tokens:
                self.add_token_to_namespace(token, namespace)