Ejemplo n.º 1
0
def _embedd_tokens(
    tokens: Collection[List[str]], language: str, progress_callback: Callable
) -> Tuple[np.ndarray, np.ndarray, Dict[str, Set[int]]]:
    """
    Embedd document and words and create a mapping dictionary between words and
    documents
    """
    # extract words
    word2doc = defaultdict(set)
    for i, doc_tokens in enumerate(tokens):
        for t in doc_tokens:
            word2doc[t].add(i)
    words = list(word2doc.keys())

    # TODO: currently embedding report success unify them to report progress float
    ticks = iter(np.linspace(0, 1, len(tokens) + len(words)))

    def emb_cb(sucess: bool):
        if sucess:
            progress_callback(next(ticks))

    # embedd documents
    embedder = DocumentEmbedder(language=language)
    # tokens is tranformedt to list in case it is np.ndarray
    doc_embs = np.array(embedder.transform(list(tokens), emb_cb))

    # embedd words
    word_embs = np.array(embedder.transform([[w] for w in words], emb_cb))

    return doc_embs, word_embs, word2doc
Ejemplo n.º 2
0
    def test_cache_for_different_aggregators(self):
        embedder = DocumentEmbedder(aggregator='max')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(aggregator='min')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)
Ejemplo n.º 3
0
    def test_cache_for_different_languages(self):
        embedder = DocumentEmbedder(language='sl')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(language='sl')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.clear_cache()
        self.embedder.clear_cache()
Ejemplo n.º 4
0
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)

    cb_part = len(corpus) / (len(corpus) + len(words))
    documet_embeddings, skipped = emb.transform(
        corpus, wrap_callback(callback, 0, cb_part))
    assert skipped is None

    words = [[w] for w in words]
    word_embeddings = np.array(
        emb.transform(words, wrap_callback(callback, cb_part, 1 - cb_part)))
    return cosine_similarity(documet_embeddings.X, word_embeddings)
Ejemplo n.º 5
0
class DocumentEmbedderTest(unittest.TestCase):

    def setUp(self):
        self.embedder = DocumentEmbedder()  # default params
        self.corpus = Corpus.from_file('deerwester')

    def tearDown(self):
        self.embedder.clear_cache()

    @patch(PATCH_METHOD)
    def test_with_empty_corpus(self, mock):
        self.assertIsNone(self.embedder.transform(self.corpus[:0])[0])
        self.assertIsNone(self.embedder.transform(self.corpus[:0])[1])
        mock.request.assert_not_called()
        mock.get_response.assert_not_called()
        self.assertEqual(self.embedder._embedder._cache._cache_dict, dict())

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_subset(self):
        res, skipped = self.embedder.transform(self.corpus[[0]])
        assert_array_equal(res.X, [[0.3, 1]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.assertIsNone(skipped)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_shapes(self):
        res, skipped = self.embedder.transform(self.corpus)
        self.assertEqual(res.X.shape, (len(self.corpus), 2))
        self.assertEqual(len(res.domain.variables),
                         len(self.corpus.domain.variables) + 2)
        self.assertIsNone(skipped)

    @patch(PATCH_METHOD, make_dummy_post(b''))
    def test_empty_response(self):
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'str'))
    def test_invalid_response(self):
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}'))
    def test_invalid_json_key(self):
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_persistent_caching(self):
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder.transform(self.corpus[[0]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)

        self.embedder.clear_cache()
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_languages(self):
        embedder = DocumentEmbedder(language='sl')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(language='sl')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.clear_cache()
        self.embedder.clear_cache()

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_aggregators(self):
        embedder = DocumentEmbedder(aggregator='max')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(aggregator='min')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cancel(self):
        self.assertFalse(self.embedder._embedder._cancelled)
        self.embedder._embedder._cancelled = True
        with self.assertRaises(Exception):
            self.embedder.transform(self.corpus[[0]])

    @patch(PATCH_METHOD, side_effect=OSError)
    def test_connection_error(self, _):
        embedder = DocumentEmbedder()
        with self.assertRaises(ConnectionError):
            embedder.transform(self.corpus[[0]])

    def test_invalid_parameters(self):
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(language='eng')
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(aggregator='average')

    def test_remove_temporary_proxy_solution(self):
        """
        When it starts to fail:
        - remove this test
        - remove temporary implementation of get_proxy() function in text.__inint__
        - set minimum version of Orange on 3.33
        """
        import Orange
        self.assertGreater("3.34.0", Orange.__version__)
Ejemplo n.º 6
0
 def test_connection_error(self, _):
     embedder = DocumentEmbedder()
     with self.assertRaises(ConnectionError):
         embedder.transform(self.corpus[[0]])