def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a single embedding or a set of embeddings.

    Arguments:
        query: A single string or a list of strings

    Returns:
        An instance of [Embedding][whatlies.embedding.Embedding] (when `query` is a string)
        or [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] (when `query` is a list of
        strings). The embedding vector is computed as the sum of hidden-state
        representations of tokens (excluding special tokens, e.g. [CLS]).

    **Usage**

    ```python
    > from whatlies.language import HFTransformersLanguage
    > lang = HFTransformersLanguage('bert-base-cased')
    > lang['today is a nice day']
    > lang = HFTransformersLanguage('gpt2')
    > lang[['day and night', 'it is as clear as day', 'today the sky is clear']]
    ```
    """
    if isinstance(query, str):
        return self._get_embedding(query)
    return EmbeddingSet(*[self._get_embedding(q) for q in query])
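
# A minimal sketch of the summation described in the Returns section above,
# using Hugging Face transformers directly. `hf_sentence_embedding` is a
# hypothetical stand-in; the internal `_get_embedding` may differ.
import numpy as np
from transformers import AutoModel, AutoTokenizer

def hf_sentence_embedding(name: str, text: str) -> np.ndarray:
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name)
    enc = tokenizer(text, return_tensors="pt")
    hidden = model(**enc).last_hidden_state[0].detach().numpy()  # (seq_len, dim)
    # Mask out special tokens such as [CLS]/[SEP] before summing.
    special = tokenizer.get_special_tokens_mask(
        enc["input_ids"][0].tolist(), already_has_special_tokens=True
    )
    return hidden[np.array(special) == 0].sum(axis=0)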
def embset_similar(
    self,
    emb: Union[str, Embedding],
    n: int = 10,
    prob_limit=-15,
    lower=True,
    metric="cosine",
):
    """
    Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the
    embeddings that are most similar to the passed query.

    Arguments:
        emb: query to use
        n: the number of items you'd like to see returned
        prob_limit: likelihood limit that sets the subset of words to search
        metric: metric to use to calculate distance, must be scipy or sklearn compatible
        lower: only fetch lower case tokens

    Returns:
        An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
    """
    embs = [w[0] for w in self.score_similar(emb, n, prob_limit, lower, metric)]
    return EmbeddingSet({w.name: w for w in embs})
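
# Hypothetical usage sketch, assuming a SpacyLanguage instance as in the other
# snippets here; the returned names depend on the loaded vectors.
from whatlies.language import SpacyLanguage

lang = SpacyLanguage("en_core_web_md")
emb_set = lang.embset_similar("king", n=5, metric="cosine")
print(list(emb_set.embeddings.keys()))  # the five most similar tokens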
def __getitem__(self, item):
    """
    Retrieve a single embedding or a set of embeddings. We retrieve the sentence
    encoding that belongs to the entire utterance.

    Arguments:
        item: single string or list of strings

    **Usage**

    ```python
    from whatlies.language import DIETLanguage

    lang = DIETLanguage("path/to/model.tar.gz")
    lang[['hi', 'hello', 'greetings']]
    ```
    """
    if isinstance(item, str):
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            msg = Message({"text": item})
            for p in self.pipeline:
                p.process(msg)
            diagnostic_data = msg.as_dict_nlu()["diagnostic_data"]
            key_of_interest = [k for k in diagnostic_data.keys() if "DIET" in k][0]
            # It's assumed that the final token in the array here represents
            # the __CLS__ token. These are also known as the "sentence embeddings".
            tensors = diagnostic_data[key_of_interest]["text_transformed"]
            return Embedding(item, tensors[-1][-1])
    if isinstance(item, list):
        return EmbeddingSet(*[self[i] for i in item])
    raise ValueError(f"Item must be a string or a list of strings, got {item}.")
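
# Hypothetical single-utterance usage; as noted in the comment above, the
# sentence vector is read from the __CLS__ position of the DIET output.
lang = DIETLanguage("path/to/model.tar.gz")
emb = lang["hi"]
print(emb.vector.shape)  # dimensionality of the DIET sentence embedding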
def embset_proximity(
    self,
    emb: Union[str, Embedding],
    max_proximity: float = 0.1,
    prob_limit=-15,
    lower=True,
    metric="cosine",
):
    """
    Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] of embeddings
    that are within a proximity.

    Arguments:
        emb: query to use
        max_proximity: the maximum distance allowed between the query and the returned embeddings
        prob_limit: likelihood limit that sets the subset of words to search
        metric: metric to use to calculate distance, must be scipy or sklearn compatible
        lower: only fetch lower case tokens

    Returns:
        An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
    """
    if isinstance(emb, str):
        emb = self[emb]
    queries = self._prepare_queries(prob_limit, lower)
    distances = self._calculate_distances(emb, queries, metric)
    return EmbeddingSet(
        {w: self[w] for w, d in zip(queries, distances) if d <= max_proximity}
    )
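
# Hypothetical usage contrasting the two retrieval styles, again assuming a
# SpacyLanguage instance: `embset_similar` returns a fixed number of
# neighbours, while `embset_proximity` returns every embedding within the
# distance threshold, so its size depends on the query.
lang = SpacyLanguage("en_core_web_md")
close_by = lang.embset_proximity("king", max_proximity=0.2)
top_ten = lang.embset_similar("king", n=10)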
def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a set of embeddings.

    Arguments:
        query: list of strings

    **Usage**

    ```python
    from whatlies.language import CountVectorLanguage

    lang = CountVectorLanguage(n_components=2, ngram_range=(1, 2), analyzer="char")
    lang[['pizza', 'pizzas', 'firehouse', 'firehydrant']]
    ```
    """
    orig_str = isinstance(query, str)
    if orig_str:
        query = [query]
    if any(len(q) == 0 for q in query):
        raise ValueError(
            "You've passed an empty string to the language model which is not allowed."
        )
    if self.fitted_manual:
        X = self.cv.transform(query)
        X_vec = self.svd.transform(X)
    else:
        X = self.cv.fit_transform(query)
        X_vec = self.svd.fit_transform(X)
    if orig_str:
        return Embedding(name=query[0], vector=X_vec[0])
    return EmbeddingSet(
        *[Embedding(name=n, vector=v) for n, v in zip(query, X_vec)]
    )
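
# Hypothetical sketch of the `fitted_manual` branch above, assuming a
# `fit_manual` step sets that flag: once the vectorizer and SVD are fitted on
# a fixed corpus, later lookups reuse that space instead of refitting per query.
lang = CountVectorLanguage(n_components=2, ngram_range=(1, 2), analyzer="char")
lang.fit_manual(["pizza", "pizzas", "firehouse", "firehydrant"])
emb = lang["pizza"]  # transformed with the already-fitted cv/svd pair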
def __getitem__(
    self, query: Union[str, List[str]]
) -> Union[Embedding, EmbeddingSet]:
    """
    Retrieve a single embedding or a set of embeddings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > from whatlies.language import ConveRTLanguage
    > lang = ConveRTLanguage()
    > lang['bank']
    > lang = ConveRTLanguage()
    > lang[['bank of the river', 'money on the bank', 'bank']]
    ```
    """
    if isinstance(query, str):
        query_tensor = tf.convert_to_tensor([query])
        encoding = self.model(query_tensor)
        if self.signature == "encode_sequence":
            vec = encoding["sequence_encoding"].numpy().sum(axis=1)[0]
        else:
            vec = encoding["default"].numpy()[0]
        return Embedding(query, vec)
    return EmbeddingSet(*[self[tok] for tok in query])
def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a single embedding or a set of embeddings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > from whatlies.language import GensimLanguage
    > lang = GensimLanguage("wordvectors.kv")
    > lang['computer']
    > lang = GensimLanguage("wordvectors.kv")
    > lang[['computer', 'human', 'dog']]
    ```
    """
    if isinstance(query, str):
        if " " in query:
            return Embedding(
                query, np.sum([self[q].vector for q in query.split(" ")], axis=0)
            )
        try:
            vec = np.sum([self.kv[q] for q in query.split(" ")], axis=0)
        except KeyError:
            vec = np.zeros(self.kv.vector_size)
        return Embedding(query, vec)
    return EmbeddingSet(*[self[tok] for tok in query])
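
# Hypothetical demonstration of the behaviour above, assuming "wordvectors.kv"
# exists: a multi-word query is embedded as the sum of its word vectors, and a
# token missing from the KeyedVectors falls back to a zero vector.
lang = GensimLanguage("wordvectors.kv")
v_sum = lang["computer computer"].vector
assert (v_sum == 2 * lang["computer"].vector).all()
assert not lang["definitely-not-in-vocab"].vector.any()  # zero-vector fallback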
def embset_similar(
    self,
    emb: Union[str, Embedding],
    n: int = 10,
    lower=False,
    metric="cosine",
) -> EmbeddingSet:
    """
    Retrieve an [EmbeddingSet][bulk_labelling.custom_whatlies.embeddingset.EmbeddingSet]
    containing the embeddings that are most similar to the passed query. Note that we
    will only consider words that were passed in the `.fit_manual()` step.

    Arguments:
        emb: query to use
        n: the number of items you'd like to see returned
        metric: metric to use to calculate distance, must be scipy or sklearn compatible
        lower: only fetch lower case tokens

    Returns:
        An [EmbeddingSet][bulk_labelling.custom_whatlies.embeddingset.EmbeddingSet]
        containing the similar embeddings.
    """
    embs = [
        w[0] for w in self.score_similar(emb=emb, n=n, lower=lower, metric=metric)
    ]
    return EmbeddingSet({w.name: w for w in embs})
def get_embeddingset(veclist, textlist):
    """Builds a whatlies EmbeddingSet from the encodings produced by the language model.

    Args:
        veclist (numpy.ndarray): ndarray holding one encoding per text
        textlist (list): list of the encoded texts

    Returns:
        whatlies.EmbeddingSet: whatlies EmbeddingSet for easier transformation
    """
    return EmbeddingSet(
        *[get_embedding(vec, text) for vec, text in zip(veclist, textlist)]
    )
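
# Hypothetical usage, assuming an encoder that maps a list of texts to a 2-D
# array of vectors (here sentence-transformers, purely as an illustration):
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
texts = ["first sentence", "second sentence"]
embset = get_embeddingset(model.encode(texts), texts)  # one Embedding per text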
def embset_similar(self, query, n=10):
    """
    Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the
    embeddings that are most similar to the passed query.

    Arguments:
        query: query to use
        n: the number of items you'd like to see returned

    Returns:
        An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
    """
    return EmbeddingSet(
        *[self[tok] for tok, sim in self.s2v.most_similar(query, n=n)],
        name=f"Embset[s2v similar_{n}:{query}]",
    )
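
# Hypothetical usage, assuming a sense2vec-backed language with a model on
# disk: sense2vec queries use the "word|SENSE" form, so similarity lookups
# are done per part-of-speech sense.
lang = Sense2VecLanguage("path/to/s2v")
emb_set = lang.embset_similar("duck|NOUN", n=3)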
def __getitem__(self, query):
    """
    Retrieve a single embedding or a set of embeddings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > lang = Sense2VecLanguage("path/to/s2v")
    > lang['duck|NOUN']
    > lang[['duck|NOUN', 'duck|VERB']]
    ```
    """
    if isinstance(query, str):
        vec = self.s2v[query]
        return Embedding(query, vec)
    return EmbeddingSet(*[self[tok] for tok in query])
def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a single embedding or a set of embeddings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > lang = FasttextLanguage("cc.en.300.bin")
    > lang['python']
    > lang[['python', 'snake']]
    > lang[['nobody expects', 'the spanish inquisition']]
    ```
    """
    if isinstance(query, str):
        self._input_str_legal(query)
        vec = self.model.get_word_vector(query)
        return Embedding(query, vec)
    return EmbeddingSet(*[self[tok] for tok in query])
def __getitem__(
    self, query: Union[str, List[str]]
) -> Union[Embedding, EmbeddingSet]:
    """
    Retrieve a single embedding or a set of embeddings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > from whatlies.language import TFHubLanguage
    > lang = TFHubLanguage("https://tfhub.dev/google/nnlm-en-dim50/2")
    > lang['today is a gift']
    > lang = TFHubLanguage("https://tfhub.dev/google/nnlm-en-dim50/2")
    > lang[['withdraw some money', 'take out cash', 'cash out funds']]
    ```
    """
    if isinstance(query, str):
        return self._get_embedding(query)
    return EmbeddingSet(*[self._get_embedding(q) for q in query])
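
# A minimal sketch of what the lookup could look like for a TensorFlow Hub
# sentence encoder; the internal `_get_embedding` may differ.
import tensorflow_hub as hub
from whatlies import Embedding

def tfhub_embedding(url: str, text: str) -> Embedding:
    model = hub.load(url)           # e.g. the nnlm-en-dim50 module above
    vec = model([text]).numpy()[0]  # module maps a batch of strings to vectors
    return Embedding(text, vec)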
def __getitem__(
    self, query: Union[str, List[str]]
) -> Union[Embedding, EmbeddingSet]:
    """
    Retrieve a single embedding or a set of embeddings. Depending on the spaCy model
    the strings can support multiple tokens of text but they can also use the Bert DSL.
    See the Language Options documentation:
    https://rasahq.github.io/whatlies/tutorial/languages/#bert-style.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > lang = SpacyLanguage("en_core_web_md")
    > lang['python']
    > lang[['python', 'snake']]
    > lang[['nobody expects', 'the spanish inquisition']]
    > lang = SpacyLanguage("en_trf_robertabase_lg")
    > lang['programming in [python]']
    ```
    """
    if isinstance(query, str):
        return self._get_embedding(query)
    return EmbeddingSet(*[self._get_embedding(q) for q in query])
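
# A minimal sketch of a plain (non-DSL) lookup, assuming the spaCy document
# vector is used for multi-token strings; the real `_get_embedding` also
# handles the bracketed Bert-style DSL shown above, which this sketch ignores.
import spacy
from whatlies import Embedding

nlp = spacy.load("en_core_web_md")

def spacy_embedding(text: str) -> Embedding:
    return Embedding(text, nlp(text).vector)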
        emb: query to use
        max_proximity: the maximum distance allowed between the query and the returned embeddings
        top_n: consider only the `top_n` most frequent words when searching
        metric: metric to use to calculate distance, must be scipy or sklearn compatible
        lower: only fetch lower case tokens

    Returns:
        An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
    """
    if isinstance(emb, str):
        emb = self[emb]
    queries = self._prepare_queries(top_n, lower)
    distances = self._calculate_distances(emb, queries, metric)
    return EmbeddingSet(
        {w: self[w] for w, d in zip(queries, distances) if d <= max_proximity}
    )

def embset_similar(
    self,
    emb: Union[str, Embedding],
    n: int = 10,
    top_n=20_000,
    lower=False,
    metric="cosine",
):
    """
    Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the
    embeddings that are most similar to the passed query.

    Arguments:
        emb: query to use