def test_read_pretrained_words(self):
    # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
    words = set(u"If you think you are too small to make a difference "
                u"try to sleeping with a mosquito àèìòù".split(u' '))

    # Reading from a single (compressed) file or a single-file archive
    base_path = unicode(self.FIXTURES_ROOT / u"embeddings/fake_embeddings.5d.txt")
    for ext in [u'', u'.gz', u'.lzma', u'.bz2', u'.zip', u'.tar.gz']:
        file_path = base_path + ext
        words_read = set(_read_pretrained_tokens(file_path))
        assert words_read == words, (
            u"Wrong words for file {}\n"
            u"   Read: {}\n"
            u"Correct: {}".format(file_path, sorted(words_read), sorted(words)))

    # Reading from a multi-file archive
    base_path = unicode(self.FIXTURES_ROOT / u"embeddings/multi-file-archive")
    file_path = u'folder/fake_embeddings.5d.txt'
    for ext in [u'.zip', u'.tar.gz']:
        archive_path = base_path + ext
        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        words_read = set(_read_pretrained_tokens(embeddings_file_uri))
        assert words_read == words, (
            u"Wrong words for file {}\n"
            u"   Read: {}\n"
            u"Correct: {}".format(archive_path, sorted(words_read), sorted(words)))
def test_read_pretrained_words(self):
    # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
    words = set("If you think you are too small to make a difference "
                "try to sleeping with a mosquito àèìòù".split(" "))

    # Reading from a single (compressed) file or a single-file archive
    base_path = str(self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt")
    for ext in ["", ".gz", ".lzma", ".bz2", ".zip", ".tar.gz"]:
        file_path = base_path + ext
        words_read = set(_read_pretrained_tokens(file_path))
        assert words_read == words, (f"Wrong words for file {file_path}\n"
                                     f"   Read: {sorted(words_read)}\n"
                                     f"Correct: {sorted(words)}")

    # Reading from a multi-file archive
    base_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive")
    file_path = "folder/fake_embeddings.5d.txt"
    for ext in [".zip", ".tar.gz"]:
        archive_path = base_path + ext
        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        words_read = set(_read_pretrained_tokens(embeddings_file_uri))
        assert words_read == words, (f"Wrong words for file {archive_path}\n"
                                     f"   Read: {sorted(words_read)}\n"
                                     f"Correct: {sorted(words)}")
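The tests above only compare the set of words read back against the quote used to generate the fixture, so a stdlib-only sketch of the kind of reader they exercise may be useful. This is not AllenNLP's _read_pretrained_tokens; it is a hypothetical stand-in that assumes the usual GloVe/word2vec text layout (one token per line, followed by its vector components) and handles only plain and gzip-compressed files, not the archive URIs built with format_embeddings_file_uri.

import gzip
from typing import List

def read_embedding_tokens(path: str) -> List[str]:
    """Return the first whitespace-separated field of every non-empty line."""
    opener = gzip.open if path.endswith(".gz") else open
    tokens = []
    with opener(path, "rt", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if fields:
                tokens.append(fields[0])
    return tokens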
def generate_neighbours(vocab, file_name, measure='euc', topk=8, rho=0.6):
    if vocab is None:
        tokens = _read_pretrained_tokens(WORD2VECS['counter'])
        vocab = Vocabulary(tokens_to_add={"tokens": tokens})
    embed = read_weight(vocab, "counter", None)
    emb_util = EmbeddingNbrUtil(embed, vocab.get_token_index,
                                vocab.get_token_from_index)
    if rho is None:
        emb_util.pre_search(measure, topk + 1, None)
    nbr_num = []
    ret = {}
    tokens = list(vocab.get_token_to_index_vocabulary("tokens").keys())
    if file_name is None:
        # With no output file, only spot-check 100 random tokens.
        tokens = random.choices(tokens, k=100)
    for ele in tqdm(tokens):
        nbrs = emb_util.find_neighbours(ele, measure, topk + 1, rho,
                                        return_words=True)
        if ele in nbrs:
            nbrs.remove(ele)
        ret[ele] = nbrs
        nbr_num.append(len(nbrs))
    print(nbr_num)
    print('Average neighbour num:', np.mean(nbr_num))
    if file_name is None:
        # Nothing to write when no output file was given.
        return
    json.dump(ret, open(f"external_data/{file_name}", "w"))
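generate_neighbours leans on repo-specific helpers (WORD2VECS, read_weight, EmbeddingNbrUtil), so it cannot run on its own. Below is a minimal, self-contained sketch of the kind of search find_neighbours with measure='euc' presumably performs: the top-k nearest rows of an embedding matrix, restricted to a radius rho. The function name, the random matrix, and the rho value are all made up for illustration.

import numpy as np

def euclidean_neighbours(embed: np.ndarray, idx: int, topk: int = 8,
                         rho: float = 0.6) -> np.ndarray:
    """Indices of the topk rows nearest to row `idx` (excluding `idx` itself),
    keeping only those within Euclidean distance `rho`."""
    dists = np.linalg.norm(embed - embed[idx], axis=1)
    order = np.argsort(dists)
    order = order[order != idx][:topk]
    return order[dists[order] <= rho]

rng = np.random.default_rng(0)
embed = rng.normal(size=(1000, 50)).astype(np.float32)
print(euclidean_neighbours(embed, idx=3, topk=8, rho=12.0))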
def build_vocab(instances: Iterable[Instance],
                pretrained_files: Optional[Dict[str, str]] = None,
                include_full_pretrained_words: bool = False) -> Vocabulary:
    print("Building the vocabulary")
    vocab = Vocabulary.from_instances(instances, min_count={"tokens": 1})
    if pretrained_files and include_full_pretrained_words:
        pretrained_tokens = _read_pretrained_tokens(pretrained_files["tokens"])
        from collections import Counter
        c = Counter(pretrained_tokens)
        counter = {"tokens": dict(c)}
        vocab._extend(counter=counter)
    print("Vocab size: ", vocab.get_vocab_size("tokens"))
    return vocab
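The Counter step in build_vocab gives every token from the embeddings file a count of at least one, so the vocab._extend call keeps all of them even if they never occur in the training instances. A stdlib-only illustration of just that counter construction, with a made-up token list standing in for the output of _read_pretrained_tokens:

from collections import Counter

# Hypothetical token list standing in for _read_pretrained_tokens(...) output.
pretrained_tokens = ["the", "mosquito", "mosquito", "difference"]
counter = {"tokens": dict(Counter(pretrained_tokens))}
print(counter)  # {'tokens': {'the': 1, 'mosquito': 2, 'difference': 1}}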
def from_instances_extended(
        cls,
        instances: Iterable[allen_data.Instance],
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = vocabulary.DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        min_pretrained_embeddings: Dict[str, int] = None,
        padding_token: Optional[str] = vocabulary.DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = vocabulary.DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
    """
    Extension to manually fill gaps in missing 'feats_labels'.
    """
    # Manually load tokens from the pretrained file using a different strategy:
    # add every word in the embeddings file, without checking whether it was
    # seen in any dataset.
    tokens_to_add = None
    if pretrained_files and "tokens" in pretrained_files:
        pretrained_set = set(
            vocabulary._read_pretrained_tokens(pretrained_files["tokens"]))
        tokens_to_add = {"tokens": list(pretrained_set)}
        pretrained_files = None

    vocab = super().from_instances(
        instances=instances,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        non_padded_namespaces=non_padded_namespaces,
        pretrained_files=pretrained_files,
        only_include_pretrained_words=only_include_pretrained_words,
        tokens_to_add=tokens_to_add,
        min_pretrained_embeddings=min_pretrained_embeddings,
        padding_token=padding_token,
        oov_token=oov_token)

    # Extend the vocab with features that do not show up explicitly.
    # To know all features we need to read the full dataset first.
    # Adding an auxiliary '=None' feature for each category is needed
    # to perform classification.
    get_slices_if_not_provided(vocab)
    return vocab
def _extend(self,
            counter: Dict[str, Dict[str, int]] = None,
            min_count: Dict[str, int] = None,
            max_vocab_size: Union[int, Dict[str, int]] = None,
            non_padded_namespaces: Iterable[str] = EXTENDED_NON_PADDED_NAMESPACES,
            pretrained_files: Optional[Dict[str, str]] = None,
            only_include_pretrained_words: bool = False,
            tokens_to_add: Dict[str, List[str]] = None,
            min_pretrained_embeddings: Dict[str, int] = None) -> None:
    """
    Modifies the default ``Vocabulary._extend`` method so that tokens which
    would be eliminated are instead added to "*unk" namespaces.
    """
    if not isinstance(max_vocab_size, dict):
        int_max_vocab_size = max_vocab_size
        max_vocab_size = defaultdict(lambda: int_max_vocab_size)  # type: ignore
    min_count = min_count or {}
    pretrained_files = pretrained_files or {}
    min_pretrained_embeddings = min_pretrained_embeddings or {}
    non_padded_namespaces = set(non_padded_namespaces)
    counter = counter or {}
    tokens_to_add = tokens_to_add or {}

    self._retained_counter = counter
    # Make sure vocabulary extension is safe.
    current_namespaces = {*self._token_to_index}
    extension_namespaces = {*counter, *tokens_to_add}

    for namespace in current_namespaces & extension_namespaces:
        # If the new namespace was already present,
        # either both should be padded or none should be.
        original_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in self._non_padded_namespaces)
        extension_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in non_padded_namespaces)
        if original_padded != extension_padded:
            raise ConfigurationError(
                "Common namespace {} has conflicting ".format(namespace) +
                "setting of padded = True/False. " +
                "Hence extension cannot be done.")

    # Add new non-padded namespaces for extension
    self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
    self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
    self._non_padded_namespaces.update(non_padded_namespaces)

    for namespace in counter:  # pylint: disable=too-many-nested-blocks
        if namespace in pretrained_files:
            pretrained_list = _read_pretrained_tokens(pretrained_files[namespace])
            min_embeddings = min_pretrained_embeddings.get(namespace, 0)
            if min_embeddings > 0:
                tokens_old = tokens_to_add.get(namespace, [])
                tokens_new = pretrained_list[:min_embeddings]
                tokens_to_add[namespace] = tokens_old + tokens_new
            pretrained_set = set(pretrained_list)
        else:
            pretrained_set = set()
        token_counts = list(counter[namespace].items())
        token_counts.sort(key=lambda x: x[1], reverse=True)
        try:
            max_vocab = max_vocab_size[namespace]
            if max_vocab is not None:
                # Add these to the *unk namespace
                unk_counts = token_counts[max_vocab:]
                token_counts = token_counts[:max_vocab]
            else:
                unk_counts = []
        except KeyError:
            unk_counts = []
        for token, count in token_counts:
            if pretrained_set is not None:
                if only_include_pretrained_words:
                    if token in pretrained_set:
                        if count >= min_count.get(namespace, 1):
                            self.add_token_to_namespace(token, namespace)
                        else:
                            self.add_token_to_namespace(token, namespace + '_unk')
                elif token in pretrained_set or count >= min_count.get(namespace, 1):
                    self.add_token_to_namespace(token, namespace)
                else:
                    self.add_token_to_namespace(token, namespace + '_unk')
            elif count >= min_count.get(namespace, 1):
                self.add_token_to_namespace(token, namespace)
            else:
                self.add_token_to_namespace(token, namespace + '_unk')
        for token, count in unk_counts:
            self.add_token_to_namespace(token, namespace + '_unk')

    for namespace, tokens in tokens_to_add.items():
        for token in tokens:
            self.add_token_to_namespace(token, namespace)
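The nested conditions in the token loop above are the heart of this _extend variant, so here is a small self-contained sketch of just that routing decision: given raw counts, the set of pretrained tokens, and a minimum count, each token ends up either in the main namespace or in the corresponding '_unk' namespace (or, with only_include_pretrained_words, is dropped when it is not pretrained). All names and numbers below are invented for illustration; no Vocabulary object is involved.

from typing import Dict, List, Set, Tuple

def route_tokens(counts: Dict[str, int], pretrained: Set[str],
                 min_count: int = 1,
                 only_include_pretrained_words: bool = False
                 ) -> Tuple[List[str], List[str]]:
    """Split tokens into (kept, unk) the same way the token loop above does."""
    kept, unk = [], []
    for token, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        if only_include_pretrained_words:
            # Non-pretrained tokens are dropped entirely in this mode.
            if token in pretrained:
                (kept if count >= min_count else unk).append(token)
        elif token in pretrained or count >= min_count:
            kept.append(token)
        else:
            unk.append(token)
    return kept, unk

counts = {"cat": 5, "dog": 1, "zyzzyva": 1}
print(route_tokens(counts, pretrained={"cat", "zyzzyva"}, min_count=2))
# (['cat', 'zyzzyva'], ['dog'])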