Ejemplo n.º 1
0
def _embedd_tokens(
    tokens: Collection[List[str]], language: str, progress_callback: Callable
) -> Tuple[np.ndarray, np.ndarray, Dict[str, Set[int]]]:
    """
    Embed all documents and all distinct words found in ``tokens``.

    Parameters
    ----------
    tokens
        Tokenized documents, one list of tokens per document.
    language
        Language handed over to ``DocumentEmbedder``.
    progress_callback
        Called with a value between 0 and 1 each time the embedder reports
        a successfully processed item.

    Returns
    -------
    The array of document embeddings, the array of word embeddings and a
    dictionary that maps every word to the set of indices of the documents
    in which the word appears.
    """
    # collect the distinct words together with the documents they occur in
    docs_by_word = defaultdict(set)
    for doc_index, doc_tokens in enumerate(tokens):
        for token in doc_tokens:
            docs_by_word[token].add(doc_index)
    vocabulary = list(docs_by_word)

    # TODO: the embedder currently reports a success flag; unify it so that
    #  it reports progress as a float
    n_items = len(tokens) + len(vocabulary)
    progress_steps = iter(np.linspace(0, 1, n_items))

    def report_progress(success: bool):
        if not success:
            return
        progress_callback(next(progress_steps))

    embedder = DocumentEmbedder(language=language)

    # documents: tokens are converted to a list in case they are np.ndarray
    document_vectors = np.array(
        embedder.transform(list(tokens), report_progress)
    )

    # words: each word is embedded as a document with a single token
    single_word_docs = [[word] for word in vocabulary]
    word_vectors = np.array(
        embedder.transform(single_word_docs, report_progress)
    )

    return document_vectors, word_vectors, docs_by_word
Ejemplo n.º 2
0
    def test_persistent_caching(self):
        """Persisted cache entries are visible to newly created embedders."""
        def cached_items(embedder):
            return len(embedder._embedder._cache._cache_dict)

        # embedding one document adds one cache entry, which is then persisted
        self.assertEqual(cached_items(self.embedder), 0)
        self.embedder(self.corpus[[0]])
        self.assertEqual(cached_items(self.embedder), 1)
        self.embedder._embedder._cache.persist_cache()

        # a new instance starts with the persisted entry
        self.embedder = DocumentEmbedder()
        self.assertEqual(cached_items(self.embedder), 1)

        # once the cache is cleared, a new instance starts empty
        self.embedder.clear_cache()
        self.embedder = DocumentEmbedder()
        self.assertEqual(cached_items(self.embedder), 0)
Ejemplo n.º 3
0
    def test_cache_for_different_aggregators(self):
        """Embedders with different aggregators add separate cache entries."""
        def cached_items(emb):
            return len(emb._embedder._cache._cache_dict)

        # start from an empty cache and store one entry with 'max'
        emb = DocumentEmbedder(aggregator='max')
        emb.clear_cache()
        self.assertEqual(cached_items(emb), 0)
        emb.transform(self.corpus[[0]])
        self.assertEqual(cached_items(emb), 1)
        emb._embedder._cache.persist_cache()

        # the 'min' embedder sees the persisted entry and adds its own for
        # the same document
        emb = DocumentEmbedder(aggregator='min')
        self.assertEqual(cached_items(emb), 1)
        emb.transform(self.corpus[[0]])
        self.assertEqual(cached_items(emb), 2)
Ejemplo n.º 4
0
def run_pretrained_embedder(corpus: Corpus, language: str, aggregator: str,
                            state: TaskState) -> Tuple[Corpus, Corpus]:
    """Embed the documents of a corpus with DocumentEmbedder.

    Parameters
    ----------
    corpus : Corpus
        Corpus on which the embedding is performed.
    language : str
        ISO 639-1 (two-letter) code of desired language.
    aggregator : str
        Aggregator which creates document embedding (single
        vector) from word embeddings (multiple vectors).
        Allowed values are mean, sum, max, min.
    state : TaskState
        State object; it receives progress values between 0 and 100 and is
        asked whether an interruption was requested.

    Returns
    -------
    Tuple[Corpus, Corpus]
        The pair returned by the embedder: the new corpus and the corpus of
        skipped documents.
    """
    embedder = DocumentEmbedder(language=language, aggregator=aggregator)

    # one progress value per document, evenly spread over 0..100
    progress_values = iter(np.linspace(0., 100., len(corpus)))

    def on_processed(success=True):
        # forward a pending interruption request to the embedder
        if state.is_interruption_requested():
            embedder.set_cancelled()
        if not success:
            return
        state.set_progress_value(next(progress_values))

    embedded, skipped = embedder(corpus, processed_callback=on_processed)
    return embedded, skipped
Ejemplo n.º 5
0
    def test_cache_for_different_languages(self):
        """Entries cached for language 'sl' are not seen by the default one."""
        def cached_items(emb):
            return len(emb._embedder._cache._cache_dict)

        # store and persist a single entry for language 'sl'
        sl_embedder = DocumentEmbedder(language='sl')
        sl_embedder.clear_cache()
        self.assertEqual(cached_items(sl_embedder), 0)
        sl_embedder(self.corpus[[0]])
        self.assertEqual(cached_items(sl_embedder), 1)
        sl_embedder._embedder._cache.persist_cache()

        # the default-language embedder must not see that entry
        self.embedder = DocumentEmbedder()
        self.assertEqual(cached_items(self.embedder), 0)
        self.embedder._embedder._cache.persist_cache()

        # the 'sl' entry is still there after the default cache was persisted
        sl_embedder = DocumentEmbedder(language='sl')
        self.assertEqual(cached_items(sl_embedder), 1)
        sl_embedder.clear_cache()
        self.embedder.clear_cache()
Ejemplo n.º 6
0
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    """
    Compute cosine similarity between document and word embeddings.

    Parameters
    ----------
    corpus
        Corpus whose documents are embedded.
    words
        Words to embed; each word is embedded as a one-token document.
    callback
        Progress callback. Documents use the first ``cb_part`` share of the
        progress range and words use the remainder up to 1.
    embedding_language
        Key in ``LANGS_TO_ISO`` that selects the embedder language.

    Returns
    -------
    The result of ``cosine_similarity`` called with the document embeddings
    and the word embeddings.

    Raises
    ------
    AssertionError
        If the embedder reports skipped documents.
    ZeroDivisionError
        If both ``corpus`` and ``words`` are empty.
    """
    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)

    # share of the total work (documents + words) taken by the documents
    cb_part = len(corpus) / (len(corpus) + len(words))
    document_embeddings, skipped = emb.transform(
        corpus, wrap_callback(callback, 0, cb_part))
    assert skipped is None

    words = [[w] for w in words]
    # wrap_callback takes the start and the end of the progress range, so the
    # word part must run from cb_part to 1; ending at 1 - cb_part would leave
    # the progress short of 1 (or moving backwards when cb_part > 0.5)
    word_embeddings = np.array(
        emb.transform(words, wrap_callback(callback, cb_part, 1)))
    return cosine_similarity(document_embeddings.X, word_embeddings)
Ejemplo n.º 7
0
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    """
    Compute cosine similarity between document and word embeddings.

    ``callback`` receives values from 0 up to 0.8, one step for each item
    (document or word) that the embedder reports as successful.
    ``embedding_language`` is translated to the embedder language through
    ``LANGS_TO_ISO``. The result is whatever ``cosine_similarity`` returns
    for the document embeddings and the word embeddings.
    """
    n_steps = len(corpus) + len(words)
    progress = iter(np.linspace(0, 0.8, n_steps))

    # TODO: the embedder currently reports a success flag; unify it so that
    #  it reports progress as a float
    def on_embedded(success: bool):
        if not success:
            return
        callback(next(progress))

    iso_code = LANGS_TO_ISO[embedding_language]
    # keep only metas so that X holds nothing but embeddings afterwards
    metas_only = Domain([], metas=corpus.domain.metas)
    corpus = Corpus.from_table(metas_only, corpus)
    embedder = DocumentEmbedder(iso_code)

    embedded_corpus, skipped = embedder(corpus, on_embedded)
    assert skipped is None

    # every word is embedded as a document with a single token
    single_word_docs = [[word] for word in words]
    word_vectors = np.array(embedder(single_word_docs, on_embedded))
    return cosine_similarity(embedded_corpus.X, word_vectors)
Ejemplo n.º 8
0
 def setUp(self):
     """Load the 'deerwester' corpus and create a default embedder."""
     self.corpus = Corpus.from_file('deerwester')
     # no arguments: the embedder uses its default parameters
     self.embedder = DocumentEmbedder()
Ejemplo n.º 9
0
class DocumentEmbedderTest(unittest.TestCase):
    """
    Tests for DocumentEmbedder used as a callable that returns a corpus.

    Most tests patch ``PATCH_METHOD`` with ``make_dummy_post(<bytes>)``; both
    names are defined outside this class. NOTE(review): presumably this
    replaces the server request with a fixed reply - confirm against the
    helper's definition.
    """

    def setUp(self):
        """Create an embedder with default parameters and load the corpus."""
        self.embedder = DocumentEmbedder()  # default params
        self.corpus = Corpus.from_file('deerwester')

    def tearDown(self):
        """Clear the cache of the embedder stored on the test instance."""
        self.embedder.clear_cache()

    @patch(PATCH_METHOD)
    def test_with_empty_corpus(self, mock):
        """An empty corpus gives an empty result without using the mock."""
        self.assertEqual(len(self.embedder(self.corpus[:0])), 0)
        mock.request.assert_not_called()
        mock.get_response.assert_not_called()
        # nothing was embedded, so nothing may be cached
        self.assertEqual(self.embedder._embedder._cache._cache_dict, dict())

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_subset(self):
        """A single document gets the embedding from the dummy reply."""
        res = self.embedder(self.corpus[[0]])
        assert_array_equal(res.X, [[0.3, 1]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_shapes(self):
        """The whole corpus gets one row and two new columns in the domain."""
        res = self.embedder(self.corpus)
        self.assertEqual(res.X.shape, (len(self.corpus), 2))
        self.assertEqual(len(res.domain), len(self.corpus.domain) + 2)

    @patch(PATCH_METHOD, make_dummy_post(b''))
    def test_empty_response(self):
        """An empty reply warns, yields an empty X and caches nothing."""
        with self.assertWarns(RuntimeWarning):
            res = self.embedder(self.corpus[[0]])
        self.assertEqual(res.X.shape, (0, 0))
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'str'))
    def test_invalid_response(self):
        """A reply that is not JSON warns, yields an empty X, caches nothing."""
        with self.assertWarns(RuntimeWarning):
            res = self.embedder(self.corpus[[0]])
        self.assertEqual(res.X.shape, (0, 0))
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}'))
    def test_invalid_json_key(self):
        """A reply with key "embeddings" instead of "embedding" is rejected."""
        with self.assertWarns(RuntimeWarning):
            res = self.embedder(self.corpus[[0]])
        self.assertEqual(res.X.shape, (0, 0))
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_persistent_caching(self):
        """Persisted cache entries are visible to newly created embedders."""
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder(self.corpus[[0]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder._embedder._cache.persist_cache()

        # a new instance starts with the persisted entry
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)

        # once the cache is cleared, a new instance starts empty
        self.embedder.clear_cache()
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_languages(self):
        """Entries cached for language 'sl' are not seen by the default one."""
        embedder = DocumentEmbedder(language='sl')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        # the default-language embedder must not see the 'sl' entry
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder._embedder._cache.persist_cache()

        # the 'sl' entry is still there after the default cache was persisted
        embedder = DocumentEmbedder(language='sl')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        # clean up both caches explicitly
        embedder.clear_cache()
        self.embedder.clear_cache()

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_aggregators(self):
        """Embedders with different aggregators add separate cache entries."""
        embedder = DocumentEmbedder(aggregator='max')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        # the 'min' embedder sees the persisted entry and adds its own for
        # the same document
        embedder = DocumentEmbedder(aggregator='min')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_with_statement(self):
        """The embedder can be used as a context manager."""
        with self.embedder as embedder:
            res = embedder(self.corpus[[0]])
            assert_array_equal(res.X, [[0.3, 1]])

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cancel(self):
        """Embedding raises once the private cancelled flag is set."""
        self.assertFalse(self.embedder._embedder._cancelled)
        self.embedder._embedder._cancelled = True
        # only the fact that some exception is raised is checked here
        with self.assertRaises(Exception):
            self.embedder(self.corpus[[0]])

    @patch(PATCH_METHOD, side_effect=OSError)
    def test_connection_error(self, _):
        """An OSError from the patched method surfaces as ConnectionError."""
        embedder = DocumentEmbedder()
        with self.assertRaises(ConnectionError):
            embedder(self.corpus[[0]])

    def test_invalid_parameters(self):
        """Unsupported language or aggregator values raise ValueError."""
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(language='eng')
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(aggregator='average')

    def test_invalid_corpus_type(self):
        """Passing a single row (``corpus[0]``) raises ValueError."""
        with self.assertRaises(ValueError):
            self.embedder(self.corpus[0])
Ejemplo n.º 10
0
 def test_invalid_parameters(self):
     """Unsupported language or aggregator values raise ValueError."""
     invalid_arguments = (
         {'language': 'eng'},
         {'aggregator': 'average'},
     )
     for arguments in invalid_arguments:
         with self.assertRaises(ValueError):
             self.embedder = DocumentEmbedder(**arguments)
Ejemplo n.º 11
0
 def test_connection_error(self, _):
     """Embedding a document is expected to raise ConnectionError."""
     failing_embedder = DocumentEmbedder()
     with self.assertRaises(ConnectionError):
         first_document = self.corpus[[0]]
         failing_embedder(first_document)
Ejemplo n.º 12
0
class DocumentEmbedderTest(unittest.TestCase):
    """
    Tests for ``DocumentEmbedder.transform``, which returns a pair of
    (embedded corpus, skipped corpus).

    Most tests patch ``PATCH_METHOD`` with ``make_dummy_post(<bytes>)``; both
    names are defined outside this class. NOTE(review): presumably this
    replaces the server request with a fixed reply - confirm against the
    helper's definition.
    """

    def setUp(self):
        """Create an embedder with default parameters and load the corpus."""
        self.embedder = DocumentEmbedder()  # default params
        self.corpus = Corpus.from_file('deerwester')

    def tearDown(self):
        """Clear the cache of the embedder stored on the test instance."""
        self.embedder.clear_cache()

    @patch(PATCH_METHOD)
    def test_with_empty_corpus(self, mock):
        """An empty corpus gives (None, None) without using the mock."""
        self.assertIsNone(self.embedder.transform(self.corpus[:0])[0])
        self.assertIsNone(self.embedder.transform(self.corpus[:0])[1])
        mock.request.assert_not_called()
        mock.get_response.assert_not_called()
        # nothing was embedded, so nothing may be cached
        self.assertEqual(self.embedder._embedder._cache._cache_dict, dict())

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_subset(self):
        """A single document gets the embedding from the dummy reply."""
        res, skipped = self.embedder.transform(self.corpus[[0]])
        assert_array_equal(res.X, [[0.3, 1]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.assertIsNone(skipped)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_shapes(self):
        """The whole corpus gets one row and two new variables."""
        res, skipped = self.embedder.transform(self.corpus)
        self.assertEqual(res.X.shape, (len(self.corpus), 2))
        self.assertEqual(len(res.domain.variables),
                         len(self.corpus.domain.variables) + 2)
        self.assertIsNone(skipped)

    @patch(PATCH_METHOD, make_dummy_post(b''))
    def test_empty_response(self):
        """An empty reply warns; the document is skipped and not cached."""
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'str'))
    def test_invalid_response(self):
        """A reply that is not JSON warns; the document is skipped."""
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}'))
    def test_invalid_json_key(self):
        """A reply with key "embeddings" instead of "embedding" is rejected."""
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_persistent_caching(self):
        """Persisted cache entries are visible to newly created embedders."""
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder.transform(self.corpus[[0]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder._embedder._cache.persist_cache()

        # a new instance starts with the persisted entry
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)

        # once the cache is cleared, a new instance starts empty
        self.embedder.clear_cache()
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_languages(self):
        """Entries cached for language 'sl' are not seen by the default one."""
        embedder = DocumentEmbedder(language='sl')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        # the default-language embedder must not see the 'sl' entry
        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder._embedder._cache.persist_cache()

        # the 'sl' entry is still there after the default cache was persisted
        embedder = DocumentEmbedder(language='sl')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        # clean up both caches explicitly
        embedder.clear_cache()
        self.embedder.clear_cache()

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_aggregators(self):
        """Embedders with different aggregators add separate cache entries."""
        embedder = DocumentEmbedder(aggregator='max')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        # the 'min' embedder sees the persisted entry and adds its own for
        # the same document
        embedder = DocumentEmbedder(aggregator='min')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cancel(self):
        """Embedding raises once the private cancelled flag is set."""
        self.assertFalse(self.embedder._embedder._cancelled)
        self.embedder._embedder._cancelled = True
        # only the fact that some exception is raised is checked here
        with self.assertRaises(Exception):
            self.embedder.transform(self.corpus[[0]])

    @patch(PATCH_METHOD, side_effect=OSError)
    def test_connection_error(self, _):
        """An OSError from the patched method surfaces as ConnectionError."""
        embedder = DocumentEmbedder()
        with self.assertRaises(ConnectionError):
            embedder.transform(self.corpus[[0]])

    def test_invalid_parameters(self):
        """Unsupported language or aggregator values raise ValueError."""
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(language='eng')
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(aggregator='average')

    def test_remove_temporary_proxy_solution(self):
        """
        When it starts to fail:
        - remove this test
        - remove temporary implementation of get_proxy() function in text.__init__
        - set minimum version of Orange on 3.33
        """
        import re

        import Orange

        # Compare the release numerically. Comparing the version strings is
        # lexicographic and misorders releases: "3.4.0" sorts after "3.34.0"
        # and "3.100.0" sorts before it.
        match = re.match(r"(\d+)\.(\d+)(?:\.(\d+))?", Orange.__version__)
        self.assertIsNotNone(
            match, f"Unexpected Orange version: {Orange.__version__}"
        )
        # a missing third component (e.g. "3.33") counts as 0
        release = tuple(int(part or 0) for part in match.groups())
        self.assertGreater((3, 34, 0), release)