Example #1
0
    def test_knn_doc2vec_cls_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/soc_classifiers"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)
        corpus_generator = FakeCorpusGenerator()

        # Embedding has no lookup_dict
        d2v = Doc2VecModel(size=10,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=False)

        self.assertRaises(ValueError,
                          lambda: KNNDoc2VecClassifier(embedding_model=d2v))

        d2v = Doc2VecModel(size=10,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        # KNNDoc2VecClassifier only supports doc2vec now
        self.assertRaises(NotImplementedError,
                          lambda: KNNDoc2VecClassifier(Word2VecModel()))

        doc = docs.split(',')[0].split()

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
        self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=10)
        soc_cls = SocClassifier(knn)

        assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

        # Build Annoy index
        knn.build_ann_indexer(num_trees=5)
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Save
        s3 = s3fs.S3FileSystem()
        model_storage.save_model(knn, knn.model_name)
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set([knn.model_name])

        # Load
        new_knn = model_storage.load_model(knn.model_name)
        assert new_knn.model_name == knn.model_name
        assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

        # Have to re-build the index whenever ones load the knn model to the memory
        assert new_knn.indexer == None
Example #2
0
    def test_knn_doc2vec_cls_local(self, mock_getcwd):
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            corpus_generator = FakeCorpusGenerator()
            d2v = Doc2VecModel(size=10,
                               min_count=1,
                               dm=0,
                               alpha=0.025,
                               min_alpha=0.025)
            trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)

            # KNNDoc2VecClassifier only supports doc2vec now
            self.assertRaises(NotImplementedError,
                              lambda: KNNDoc2VecClassifier(Word2VecModel()))

            doc = docs.split(',')[0].split()

            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
            self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=1)
            soc_cls = SocClassifier(knn)

            assert knn.predict_soc([doc
                                    ])[0][0] == soc_cls.predict_soc([doc
                                                                     ])[0][0]

            # Build Annoy index
            knn.build_ann_indexer(num_trees=5)
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Save
            model_storage.save_model(knn, knn.model_name)
            assert set(os.listdir(os.getcwd())) == set([knn.model_name])
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Load
            new_knn = model_storage.load_model(knn.model_name)
            assert new_knn.model_name == knn.model_name
            assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

            # Have to re-build the index whenever ones load the knn model to the memory
            assert new_knn.indexer == None