Example #1
    def test_read_pretrained_words(self):
        # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
        words = set(u"If you think you are too small to make a difference "
                    u"try to sleeping with a mosquito àèìòù".split(u' '))

        # Reading from a single (compressed) file or a single-file archive
        base_path = unicode(self.FIXTURES_ROOT /
                            u"embeddings/fake_embeddings.5d.txt")
        for ext in [u'', u'.gz', u'.lzma', u'.bz2', u'.zip', u'.tar.gz']:
            file_path = base_path + ext
            words_read = set(_read_pretrained_tokens(file_path))
            assert words_read == words, (
                u"Wrong words for file {}\n"
                u"   Read: {}\n"
                u"Correct: {}".format(file_path, sorted(words_read),
                                      sorted(words)))

        # Reading from a multi-file archive
        base_path = unicode(self.FIXTURES_ROOT /
                            u"embeddings/multi-file-archive")
        file_path = u'folder/fake_embeddings.5d.txt'
        for ext in [u'.zip', u'.tar.gz']:
            archive_path = base_path + ext
            embeddings_file_uri = format_embeddings_file_uri(
                archive_path, file_path)
            words_read = set(_read_pretrained_tokens(embeddings_file_uri))
            assert words_read == words, (
                u"Wrong words for file {}\n"
                u"   Read: {}\n"
                u"Correct: {}".format(archive_path, sorted(words_read),
                                      sorted(words)))
Example #2
    def test_read_pretrained_words(self):
        # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
        words = set("If you think you are too small to make a difference "
                    "try to sleeping with a mosquito àèìòù".split(" "))

        # Reading from a single (compressed) file or a single-file archive
        base_path = str(self.FIXTURES_ROOT /
                        "embeddings/fake_embeddings.5d.txt")
        for ext in ["", ".gz", ".lzma", ".bz2", ".zip", ".tar.gz"]:
            file_path = base_path + ext
            words_read = set(_read_pretrained_tokens(file_path))
            assert words_read == words, (f"Wrong words for file {file_path}\n"
                                         f"   Read: {sorted(words_read)}\n"
                                         f"Correct: {sorted(words)}")

        # Reading from a multi-file archive
        base_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive")
        file_path = "folder/fake_embeddings.5d.txt"
        for ext in [".zip", ".tar.gz"]:
            archive_path = base_path + ext
            embeddings_file_uri = format_embeddings_file_uri(
                archive_path, file_path)
            words_read = set(_read_pretrained_tokens(embeddings_file_uri))
            assert words_read == words, (
                f"Wrong words for file {archive_path}\n"
                f"   Read: {sorted(words_read)}\n"
                f"Correct: {sorted(words)}")
Example #3
import json
import random

import numpy as np
from tqdm import tqdm

# WORD2VECS, Vocabulary, read_weight, EmbeddingNbrUtil and
# _read_pretrained_tokens come from the surrounding project.


# Build a {token: [neighbour tokens]} map in embedding space; if file_name is
# given, dump it to external_data/<file_name> as JSON.
def generate_neighbours(vocab, file_name, measure='euc', topk=8, rho=0.6):
    if vocab is None:
        tokens = _read_pretrained_tokens(WORD2VECS['counter'])
        vocab = Vocabulary(tokens_to_add={"tokens": tokens})

    embed = read_weight(vocab, "counter", None)
    emb_util = EmbeddingNbrUtil(embed, vocab.get_token_index,
                                vocab.get_token_from_index)
    if rho is None:
        emb_util.pre_search(measure, topk + 1, None)

    nbr_num = []
    ret = {}
    tokens = list(vocab.get_token_to_index_vocabulary("tokens").keys())
    if file_name is None:
        tokens = random.choices(tokens, k=100)
    for ele in tqdm(tokens):
        nbrs = emb_util.find_neighbours(ele,
                                        measure,
                                        topk + 1,
                                        rho,
                                        return_words=True)
        if ele in nbrs:
            nbrs.remove(ele)
        ret[ele] = nbrs
        nbr_num.append(len(nbrs))
    print(nbr_num)
    print('Average neighbour num:', np.mean(nbr_num))
    if file_name is None:
        return
    with open(f"external_data/{file_name}", "w") as fout:
        json.dump(ret, fout)
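
In the example above, topk + 1 neighbours are requested because a token is usually its own nearest neighbour under the chosen measure and is removed from the result afterwards. A hypothetical invocation, with vocab=None so the function builds a vocabulary from the 'counter' pretrained tokens itself (the output file name below is a placeholder, and the external_data/ directory must already exist):

generate_neighbours(vocab=None,
                    file_name="counter_nbrs.json",  # placeholder name
                    measure="euc",
                    topk=8,
                    rho=0.6)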
Example #4
def build_vocab(instances: Iterable[Instance],
                pretrained_files: Optional[Dict[str, str]] = None,
                include_full_pretrained_words: bool = False
                ) -> Vocabulary:
    print("Building the vocabulary")
    vocab = Vocabulary.from_instances(instances, min_count={"tokens": 1})
    if pretrained_files and include_full_pretrained_words:
        pretrained_tokens = _read_pretrained_tokens(pretrained_files["tokens"])
        from collections import Counter
        c = Counter(pretrained_tokens)
        counter = {"tokens": dict(c)}
        vocab._extend(counter=counter)
    print("Vocab size: ", vocab.get_vocab_size("tokens"))
    return vocab
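
Example #4 builds the vocabulary from the dataset and then, optionally, extends it with every token found in a pretrained embeddings file by passing the pretrained token counts to Vocabulary._extend. A hedged usage sketch; the reader object and both file paths below are placeholders, not part of the original snippet:

# Sketch only: "my_reader" and both paths are placeholders.
instances = list(my_reader.read("data/train.jsonl"))
vocab = build_vocab(
    instances,
    pretrained_files={"tokens": "embeddings/glove.6B.100d.txt.gz"},
    include_full_pretrained_words=True,
)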
Example #5
    def test_read_pretrained_words(self):
        # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
        words = set("If you think you are too small to make a difference "
                    "try to sleeping with a mosquito àèìòù".split(' '))

        # Reading from a single (compressed) file or a single-file archive
        base_path = str(self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt")
        for ext in ['', '.gz', '.lzma', '.bz2', '.zip', '.tar.gz']:
            file_path = base_path + ext
            words_read = set(_read_pretrained_tokens(file_path))
            assert words_read == words, f"Wrong words for file {file_path}\n" \
                                        f"   Read: {sorted(words_read)}\n" \
                                        f"Correct: {sorted(words)}"

        # Reading from a multi-file archive
        base_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive")
        file_path = 'folder/fake_embeddings.5d.txt'
        for ext in ['.zip', '.tar.gz']:
            archive_path = base_path + ext
            embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
            words_read = set(_read_pretrained_tokens(embeddings_file_uri))
            assert words_read == words, f"Wrong words for file {archive_path}\n" \
                                        f"   Read: {sorted(words_read)}\n" \
                                        f"Correct: {sorted(words)}"
Example #6
    @classmethod
    def from_instances_extended(
        cls,
        instances: Iterable[allen_data.Instance],
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = vocabulary.DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        min_pretrained_embeddings: Dict[str, int] = None,
        padding_token: Optional[str] = vocabulary.DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = vocabulary.DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Extension of ``from_instances`` that manually fills in missing 'feats_labels'.
        """
        # Manually load tokens from the pretrained file, using a different
        # strategy: add every word from the embeddings file, without checking
        # whether it was seen in any dataset.
        tokens_to_add = None
        if pretrained_files and "tokens" in pretrained_files:
            pretrained_set = set(
                vocabulary._read_pretrained_tokens(pretrained_files["tokens"]))
            tokens_to_add = {"tokens": list(pretrained_set)}
            pretrained_files = None

        vocab = super().from_instances(
            instances=instances,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add,
            min_pretrained_embeddings=min_pretrained_embeddings,
            padding_token=padding_token,
            oov_token=oov_token)
        # Extend the vocab with features that do not show up explicitly.
        # To know all the features, the full dataset would have to be read first.
        # An auxiliary '=None' feature is added for each category because it
        # is needed to perform classification.
        get_slices_if_not_provided(vocab)
        return vocab

    def _extend(self,
                counter: Dict[str, Dict[str, int]] = None,
                min_count: Dict[str, int] = None,
                max_vocab_size: Union[int, Dict[str, int]] = None,
                non_padded_namespaces: Iterable[
                    str] = EXTENDED_NON_PADDED_NAMESPACES,
                pretrained_files: Optional[Dict[str, str]] = None,
                only_include_pretrained_words: bool = False,
                tokens_to_add: Dict[str, List[str]] = None,
                min_pretrained_embeddings: Dict[str, int] = None) -> None:
        """
        Modifies the default ``Vocabulary._extend`` method so that tokens which would be
        eliminated are instead added to "*unk" namespaces.
        """
        if not isinstance(max_vocab_size, dict):
            int_max_vocab_size = max_vocab_size
            max_vocab_size = defaultdict(
                lambda: int_max_vocab_size)  # type: ignore
        min_count = min_count or {}
        pretrained_files = pretrained_files or {}
        min_pretrained_embeddings = min_pretrained_embeddings or {}
        non_padded_namespaces = set(non_padded_namespaces)
        counter = counter or {}
        tokens_to_add = tokens_to_add or {}

        self._retained_counter = counter
        # Make sure vocabulary extension is safe.
        current_namespaces = {*self._token_to_index}
        extension_namespaces = {*counter, *tokens_to_add}

        for namespace in current_namespaces & extension_namespaces:
            # If the namespace is already present, the existing namespace and
            # the extension must agree on padding: either both are padded or
            # neither is.
            original_padded = not any(
                namespace_match(pattern, namespace)
                for pattern in self._non_padded_namespaces)
            extension_padded = not any(
                namespace_match(pattern, namespace)
                for pattern in non_padded_namespaces)
            if original_padded != extension_padded:
                raise ConfigurationError(
                    "Common namespace {} has conflicting ".format(namespace) +
                    "setting of padded = True/False. " +
                    "Hence extension cannot be done.")

        # Add new non-padded namespaces for extension
        self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
        self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
        self._non_padded_namespaces.update(non_padded_namespaces)

        for namespace in counter:  # pylint: disable=too-many-nested-blocks
            if namespace in pretrained_files:
                pretrained_list = _read_pretrained_tokens(
                    pretrained_files[namespace])
                min_embeddings = min_pretrained_embeddings.get(namespace, 0)
                if min_embeddings > 0:
                    tokens_old = tokens_to_add.get(namespace, [])
                    tokens_new = pretrained_list[:min_embeddings]
                    tokens_to_add[namespace] = tokens_old + tokens_new
                pretrained_set = set(pretrained_list)
            else:
                pretrained_set = set()
            token_counts = list(counter[namespace].items())
            token_counts.sort(key=lambda x: x[1], reverse=True)
            try:
                max_vocab = max_vocab_size[namespace]
                if max_vocab is not None:
                    # Tokens beyond max_vocab are later added to the *_unk
                    # namespace instead of being dropped.
                    unk_counts = token_counts[max_vocab:]
                    token_counts = token_counts[:max_vocab]
                else:
                    unk_counts = []
            except KeyError:
                unk_counts = []
            for token, count in token_counts:
                if pretrained_set is not None:
                    if only_include_pretrained_words:
                        if token in pretrained_set:
                            if count >= min_count.get(namespace, 1):
                                self.add_token_to_namespace(token, namespace)
                            else:
                                self.add_token_to_namespace(
                                    token, namespace + '_unk')
                    elif token in pretrained_set or count >= min_count.get(
                            namespace, 1):
                        self.add_token_to_namespace(token, namespace)
                    else:
                        self.add_token_to_namespace(token, namespace + '_unk')
                elif count >= min_count.get(namespace, 1):
                    self.add_token_to_namespace(token, namespace)
                else:
                    self.add_token_to_namespace(token, namespace + '_unk')
            for token, count in unk_counts:
                self.add_token_to_namespace(token, namespace + '_unk')

        for namespace, tokens in tokens_to_add.items():
            for token in tokens:
                self.add_token_to_namespace(token, namespace)
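
The overridden _extend in Example #6 routes tokens that would otherwise be discarded (below min_count, beyond max_vocab_size, or outside the pretrained set when only_include_pretrained_words is on) into a parallel "<namespace>_unk" namespace instead of dropping them. A minimal sketch of that effect, assuming the subclass is importable as ExtendedVocabulary (a hypothetical name; _extend is normally invoked for you by the constructor or from_instances):

# Hypothetical class name; counts are made up for the illustration.
counter = {"tokens": {"common": 10, "rare": 1}}
vocab = ExtendedVocabulary()
vocab._extend(counter=counter, min_count={"tokens": 2})

# "common" clears min_count and lands in the regular namespace, while "rare"
# is routed to the parallel "tokens_unk" namespace instead of vanishing.
assert "common" in vocab.get_token_to_index_vocabulary("tokens")
assert "rare" in vocab.get_token_to_index_vocabulary("tokens_unk")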