Code Example #1
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve a single embedding or a set of embeddings.

        Arguments:
            query: A single string or a list of strings

        Returns:
            An instance of [Embedding][whatlies.embedding.Embedding] (when `query` is a string)
            or [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] (when `query` is a list of strings).
            The embedding vector is computed as the sum of hidden-state representations of tokens
            (excluding special tokens, e.g. [CLS]).

        **Usage**

        ```python
        > from whatlies.language import HFTransformersLanguage
        > lang = HFTransformersLanguage('bert-base-cased')
        > lang['today is a nice day']
        > lang = HFTransformersLanguage('gpt2')
        > lang[['day and night', 'it is as clear as day', 'today the sky is clear']]
        ```
        """
        if isinstance(query, str):
            return self._get_embedding(query)
        return EmbeddingSet(*[self._get_embedding(q) for q in query])
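The docstring above spells out how the vector is built: a sum of per-token hidden states with special tokens excluded. As a rough illustration, here is a minimal sketch of what a `_get_embedding` helper could look like; it is an assumption, not the library's actual internals, and it presumes the class holds a Hugging Face `self.tokenizer`/`self.model` pair.

```python
import numpy as np
from whatlies.embedding import Embedding

def _get_embedding(self, query: str) -> Embedding:
    # Assumed attributes: self.tokenizer / self.model from Hugging Face transformers.
    tokens = self.tokenizer(query, return_tensors="pt")
    hidden = self.model(**tokens).last_hidden_state[0].detach().numpy()
    # Drop special tokens such as [CLS] and [SEP] before summing.
    special = self.tokenizer.get_special_tokens_mask(
        tokens["input_ids"][0].tolist(), already_has_special_tokens=True
    )
    keep = np.array(special) == 0
    return Embedding(query, hidden[keep].sum(axis=0))
```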
Code Example #2
    def embset_similar(
        self,
        emb: Union[str, Embedding],
        n: int = 10,
        prob_limit=-15,
        lower=True,
        metric="cosine",
    ):
        """
        Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the embeddings most similar to the passed query.

        Arguments:
            emb: query to use
            n: the number of items you'd like to see returned
            prob_limit: log-probability threshold that restricts the subset of words to search
            lower: only fetch lower-case tokens
            metric: metric used to calculate distance; must be scipy- or sklearn-compatible

        Returns:
            An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
        """
        embs = [
            w[0] for w in self.score_similar(emb, n, prob_limit, lower, metric)
        ]
        return EmbeddingSet({w.name: w for w in embs})
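The method above carries no usage block, so here is a hedged sketch of how it might be called. The `SpacyLanguage` backend is an assumption, suggested by the spaCy-style `prob_limit` argument.

```python
> lang = SpacyLanguage("en_core_web_md")  # assumed backend
> # The 5 tokens nearest to "king" among lower-cased words whose
> # log-probability is above -15.
> lang.embset_similar("king", n=5, prob_limit=-15, lower=True, metric="cosine")
```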
Code Example #3
    def __getitem__(self, item):
        """
        Retrieve a single embedding or a set of embeddings. We retrieve the sentence encoding that
        belongs to the entire utterance.

        Arguments:
            item: single string or list of strings

        **Usage**
        ```python
        from whatlies.language import DIETLanguage

        lang = DIETLanguage("path/to/model.tar.gz")
        lang[['hi', 'hello', 'greetings']]
        ```
        """
        if isinstance(item, str):
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                msg = Message({"text": item})
                for p in self.pipeline:
                    p.process(msg)
                diagnostic_data = msg.as_dict_nlu()["diagnostic_data"]
                key_of_interest = [
                    k for k in diagnostic_data.keys() if "DIET" in k
                ][0]
                # It's assumed that the final token in the array here represents the __CLS__ token.
                # These are also known as the "sentence embeddings"
                tensors = diagnostic_data[key_of_interest]["text_transformed"]
                return Embedding(item, tensors[-1][-1])
        if isinstance(item, list):
            return EmbeddingSet(*[self[i] for i in item])
        raise ValueError(f"Item must be a string or a list of strings, got {item}.")
Code Example #4
    def embset_proximity(
        self,
        emb: Union[str, Embedding],
        max_proximity: float = 0.1,
        prob_limit=-15,
        lower=True,
        metric="cosine",
    ):
        """
        Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] of embeddings that lie within a given proximity of the query.

        Arguments:
            emb: query to use
            max_proximity: the maximum distance, under the given metric, for an embedding to be included
            prob_limit: log-probability threshold that restricts the subset of words to search
            lower: only fetch lower-case tokens
            metric: metric used to calculate distance; must be scipy- or sklearn-compatible

        Returns:
            An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the embeddings within the given proximity.
        """
        if isinstance(emb, str):
            emb = self[emb]

        queries = self._prepare_queries(prob_limit, lower)
        distances = self._calculate_distances(emb, queries, metric)
        return EmbeddingSet({
            w: self[w]
            for w, d in zip(queries, distances) if d <= max_proximity
        })
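As above, a hedged usage sketch under the same assumption about the backend class:

```python
> lang = SpacyLanguage("en_core_web_md")  # assumed backend
> # Every candidate token whose cosine distance to "cat" is at most 0.3.
> lang.embset_proximity("cat", max_proximity=0.3)
```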
Code Example #5
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve a single embedding or a set of embeddings.

        Arguments:
            query: a single string or a list of strings

        **Usage**

        ```python
        from whatlies.language import CountVectorLanguage
        lang = CountVectorLanguage(n_components=2, ngram_range=(1, 2), analyzer="char")
        lang[['pizza', 'pizzas', 'firehouse', 'firehydrant']]
        ```
        """
        orig_str = isinstance(query, str)
        if orig_str:
            query = [query]
        if any([len(q) == 0 for q in query]):
            raise ValueError(
                "You've passed an empty string to the language model which is not allowed."
            )
        if self.fitted_manual:
            X = self.cv.transform(query)
            X_vec = self.svd.transform(X)
        else:
            X = self.cv.fit_transform(query)
            X_vec = self.svd.fit_transform(X)
        if orig_str:
            return Embedding(name=query[0], vector=X_vec[0])
        return EmbeddingSet(
            *[Embedding(name=n, vector=v) for n, v in zip(query, X_vec)]
        )
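The `fitted_manual` branch is worth noting: if the vectorizer and SVD were fitted beforehand, a lookup only transforms; otherwise each lookup refits on the query itself, so separate lookups do not share one space. A hedged sketch of pre-fitting, where the `fit_manual` method name is inferred from the `self.fitted_manual` flag:

```python
lang = CountVectorLanguage(n_components=2, ngram_range=(1, 2), analyzer="char")
# Inferred method name: fit once on a reference corpus so that later
# lookups are projected into the same space instead of being refit.
lang.fit_manual(["pizza", "pizzas", "firehouse", "firehydrant"])
lang[["pizza", "firehouse"]]  # transform only, no refit
```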
Code Example #6
    def __getitem__(self, query: Union[str, List[str]]) -> Union[Embedding, EmbeddingSet]:
        """
        Retrieve a single embedding or a set of embeddings.

        Arguments:
            query: single string or list of strings

        **Usage**

        ```python
        > from whatlies.language import ConveRTLanguage
        > lang = ConveRTLanguage()
        > lang['bank']
        > lang = ConveRTLanguage()
        > lang[['bank of the river', 'money on the bank', 'bank']]
        ```
        """
        if isinstance(query, str):
            query_tensor = tf.convert_to_tensor([query])
            encoding = self.model(query_tensor)
            if self.signature == "encode_sequence":
                vec = encoding["sequence_encoding"].numpy().sum(axis=1)[0]
            else:
                vec = encoding["default"].numpy()[0]
            return Embedding(query, vec)
        return EmbeddingSet(*[self[tok] for tok in query])
Code Example #7
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve a single embedding or a set of embeddings.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > from whatlies.language import GensimLanguage
        > lang = GensimLanguage("wordvectors.kv")
        > lang['computer']
        > lang = GensimLanguage("wordvectors.kv")
        > lang[['computer', 'human', 'dog']]
        ```
        """
        if isinstance(query, str):
            if " " in query:
                return Embedding(
                    query, np.sum([self[q].vector for q in query.split(" ")], axis=0)
                )
            try:
                vec = np.sum([self.kv[q] for q in query.split(" ")], axis=0)
            except KeyError:
                vec = np.zeros(self.kv.vector_size)
            return Embedding(query, vec)
        return EmbeddingSet(*[self[tok] for tok in query])
Code Example #8
    def embset_similar(
        self,
        emb: Union[str, Embedding],
        n: int = 10,
        lower=False,
        metric="cosine",
    ) -> EmbeddingSet:
        """
        Retrieve an [EmbeddingSet][bulk_labelling.custom_whatlies.embeddingset.EmbeddingSet] containing the embeddings most similar to the passed query.
        Note that we will only consider words that were passed in the `.fit_manual()` step.

        Arguments:
            emb: query to use
            n: the number of items you'd like to see returned
            lower: only fetch lower-case tokens
            metric: metric used to calculate distance; must be scipy- or sklearn-compatible

        Returns:
            An [EmbeddingSet][bulk_labelling.custom_whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
        """
        embs = [
            w[0] for w in self.score_similar(
                emb=emb, n=n, lower=lower, metric=metric)
        ]
        return EmbeddingSet({w.name: w for w in embs})
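A hedged usage sketch: since only words passed in the `.fit_manual()` step are considered, the vocabulary is fitted first. The backend class is an assumption based on the `bulk_labelling.custom_whatlies` references above.

```python
> lang = CountVectorLanguage(n_components=2)  # assumed backend with .fit_manual()
> lang.fit_manual(["day", "night", "dawn", "dusk"])
> # The 2 fitted words closest to "day" under cosine distance.
> lang.embset_similar("day", n=2, metric="cosine")
```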
Code Example #9
def get_embeddingset(veclist, textlist):
    """gets a whatlies.embeddingset from the encoding given by the language model

    Args:
        veclist (numpy.ndarray): ndarray of all encodings
        textlist (list): vector of encoded texts

    Returns:
        whatlies.EmbeddingSet: whatlies EmbeddingSet for easier transformation
    """

    return EmbeddingSet(
        *[get_embedding(veclist[q], textlist[q]) for q in range(len(textlist))]
    )
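A short usage sketch with made-up data:

```python
import numpy as np

# Three illustrative 2-d encodings, one row per text.
vecs = np.array([[0.0, 1.0], [1.0, 0.0], [0.5, 0.5]])
texts = ["hi", "hello", "greetings"]
embset = get_embeddingset(vecs, texts)
```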
Code Example #10
    def embset_similar(self, query, n=10):
        """
        Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the embeddings most similar to the passed query.

        Arguments:
            query: query to use
            n: the number of items you'd like to see returned

        Returns:
            An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
        """
        return EmbeddingSet(
            *[self[tok] for tok, sim in self.s2v.most_similar(query, n=n)],
            name=f"Embset[s2v similar_{n}:{query}]",
        )
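A hedged usage sketch; the `Sense2VecLanguage` class and the model path are assumptions, suggested by the `self.s2v` attribute and the sense2vec-style keys:

```python
> lang = Sense2VecLanguage("path/to/s2v")  # assumed backend and path
> # The 3 senses most similar to "duck|NOUN" according to sense2vec.
> lang.embset_similar("duck|NOUN", n=3)
```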
Code Example #11
    def __getitem__(self, query):
        """
        Retrieve a single embedding or a set of embeddings.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > lang = Sense2VecLanguage("path/to/s2v")
        > lang['duck|NOUN']
        > lang[['duck|NOUN', 'duck|VERB']]
        ```
        """
        if isinstance(query, str):
            vec = self.s2v[query]
            return Embedding(query, vec)
        return EmbeddingSet(*[self[tok] for tok in query])
Code Example #12
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve a single embedding or a set of embeddings. The strings may contain
        multiple tokens of text.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > lang = FasttextLanguage("cc.en.300.bin")
        > lang['python']
        > lang[['python', 'snake']]
        > lang[['nobody expects', 'the spanish inquisition']]
        ```
        """
        if isinstance(query, str):
            self._input_str_legal(query)
            vec = self.model.get_word_vector(query)
            return Embedding(query, vec)
        return EmbeddingSet(*[self[tok] for tok in query])
Code Example #13
    def __getitem__(self, query: Union[str, List[str]]) -> Union[Embedding, EmbeddingSet]:
        """
        Retrieve a single embedding or a set of embeddings.

        Arguments:
            query: single string or list of strings

        **Usage**

        ```python
        > from whatlies.language import TFHubLanguage
        > lang = TFHubLanguage("https://tfhub.dev/google/nnlm-en-dim50/2")
        > lang['today is a gift']
        > lang = TFHubLanguage("https://tfhub.dev/google/nnlm-en-dim50/2")
        > lang[['withdraw some money', 'take out cash', 'cash out funds']]
        ```
        """
        if isinstance(query, str):
            return self._get_embedding(query)
        return EmbeddingSet(*[self._get_embedding(q) for q in query])
Code Example #14
    def __getitem__(self, query: Union[str, List[str]]) -> Union[Embedding, EmbeddingSet]:
        """
        Retrieve a single embedding or a set of embeddings. Depending on the spaCy model,
        the strings can contain multiple tokens of text and can also use the Bert DSL.
        See the Language Options documentation: https://rasahq.github.io/whatlies/tutorial/languages/#bert-style.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > lang = SpacyLanguage("en_core_web_md")
        > lang['python']
        > lang[['python', 'snake']]
        > lang[['nobody expects', 'the spanish inquisition']]
        > lang = SpacyLanguage("en_trf_robertabase_lg")
        > lang['programming in [python]']
        ```
        """
        if isinstance(query, str):
            return self._get_embedding(query)
        return EmbeddingSet(*[self._get_embedding(q) for q in query])
Code Example #15
            emb: query to use
            max_proximity: the maximum distance, under the given metric, for an embedding to be included
            top_n: only search among the `top_n` most likely words
            lower: only fetch lower-case tokens
            metric: metric used to calculate distance; must be scipy- or sklearn-compatible

        Returns:
            An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the embeddings within the given proximity.
        """
        if isinstance(emb, str):
            emb = self[emb]

        queries = self._prepare_queries(top_n, lower)
        distances = self._calculate_distances(emb, queries, metric)
        return EmbeddingSet({
            w: self[w]
            for w, d in zip(queries, distances) if d <= max_proximity
        })

    def embset_similar(
        self,
        emb: Union[str, Embedding],
        n: int = 10,
        top_n=20_000,
        lower=False,
        metric="cosine",
    ):
        """
        Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the embeddings most similar to the passed query.

        Arguments:
            emb: query to use