# Example 1
    def test_knn_doc2vec_cls_s3(self):
        """End-to-end KNNDoc2VecClassifier test against a mocked S3 store:
        construction errors, prediction, Annoy indexing, and save/load."""
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        # Plain string: the original f-string had no placeholders.
        s3_path = "s3://fake-open-skills/model_cache/soc_classifiers"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)
        corpus_generator = FakeCorpusGenerator()

        # Embedding has no lookup_dict
        d2v = Doc2VecModel(size=10,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=False)

        # A model trained without a lookup cannot back the classifier.
        self.assertRaises(ValueError,
                          lambda: KNNDoc2VecClassifier(embedding_model=d2v))

        d2v = Doc2VecModel(size=10,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        # KNNDoc2VecClassifier only supports doc2vec now
        self.assertRaises(NotImplementedError,
                          lambda: KNNDoc2VecClassifier(Word2VecModel()))

        doc = docs.split(',')[0].split()

        # k must be positive for prediction to work.
        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
        self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=10)
        soc_cls = SocClassifier(knn)

        # SocClassifier should delegate to the wrapped KNN model.
        assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

        # Build Annoy index
        knn.build_ann_indexer(num_trees=5)
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Save
        s3 = s3fs.S3FileSystem()
        model_storage.save_model(knn, knn.model_name)
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == {knn.model_name}

        # Load
        new_knn = model_storage.load_model(knn.model_name)
        assert new_knn.model_name == knn.model_name
        assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

        # Have to re-build the index whenever one loads the knn model into memory
        assert new_knn.indexer is None
    def test_embedding_trainer_doc2vec_s3(self):
        """Train a Doc2Vec embedding, persist it to a mocked S3 store, and
        verify naming, lookup dict, re-saving under another name, loading,
        and saving into a different store directory."""
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        # Plain string: the original f-string had no placeholders.
        s3_path = "s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Doc2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        d2v = Doc2VecModel(storage=s3_storage,
                           size=10,
                           min_count=3,
                           iter=4,
                           window=6,
                           workers=3)

        trainer = EmbeddingTrainer(corpus_generator, d2v)
        trainer.train(lookup=True)
        trainer.save_model()

        # s3fs caches filesystem instances, so one handle serves all
        # the listings below.
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert d2v.model_name == trainer.model_name
        assert set(files) == {trainer.model_name}
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Save as different name
        d2v.save('other_name.model')

        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == {trainer.model_name, 'other_name.model'}

        # Load and compare persisted hyperparameters with the trainer's.
        d2v_loaded = Doc2VecModel.load(s3_storage, trainer.model_name)
        assert d2v_loaded.metadata['embedding_model']['hyperparameters'][
            'vector_size'] == trainer.metadata['embedding_model'][
                'hyperparameters']['vector_size']
        # Change the store directory
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        trainer.save_model(S3Store(new_s3_path))
        files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
        assert set(files) == {trainer.model_name}
# Example 3
    def test_doc2vec(self):
        """Train a Doc2Vec model on sample job postings and check that
        inference is reproducible and unseen-word lookup raises."""
        schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        postings = JobPostingCollectionSample(num_records=50)
        corpus = Doc2VecGensimCorpusCreator(
            postings, document_schema_fields=schema_fields)
        d2v = Doc2VecModel(size=16,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        EmbeddingTrainer(d2v).train(corpus)

        # Inference is stochastic, so reseed before each call and expect
        # identical vectors.
        d2v.random.seed(0)
        first = d2v.infer_vector(["media", "news"])
        d2v.random.seed(0)
        second = d2v.infer_vector(["media", "news"])
        assert_array_equal(first, second)

        # Direct lookup of an out-of-vocabulary word raises KeyError.
        self.assertRaises(KeyError, lambda: d2v["sports"])

        # Inferring a sentence made only of unseen words still works.
        first = d2v.infer_vector(["sports"])
        second = d2v.infer_vector(["sports"])
        assert_array_equal(first, second)
    def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
        """Train a Doc2Vec embedding against a local FSStore and verify
        saving, re-saving under another name, loading, and saving to a
        different directory.

        `mock_getcwd` patches os.getcwd() to point inside the temp dir.
        """
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Doc2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            d2v = Doc2VecModel(storage=FSStore(td),
                               size=10,
                               min_count=3,
                               iter=4,
                               window=6,
                               workers=3)

            trainer = EmbeddingTrainer(corpus_generator, d2v)
            trainer.train(lookup=True)
            trainer.save_model()

            assert d2v.model_name == trainer.model_name
            assert set(os.listdir(os.getcwd())) == {trainer.model_name}
            self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

            # Save as different name
            d2v.save('other_name.model')
            assert set(os.listdir(os.getcwd())) == {
                trainer.model_name, 'other_name.model'
            }

            # Load and compare persisted metadata with the trainer's.
            d2v_loaded = Doc2VecModel.load(FSStore(td), trainer.model_name)
            assert d2v_loaded.metadata["embedding_model"][
                "model_type"] == trainer.metadata["embedding_model"][
                    "model_type"]

            # Change the store directory
            new_path = os.path.join(td, 'other_directory')
            trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == {trainer.model_name}
    def test_embedding_trainer_doc2vec_with_other(self):
        """Mixing a Doc2Vec model with other embedding types in a single
        EmbeddingTrainer must fail at train time with TypeError."""
        schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        corpus = Doc2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=30),
            document_schema_fields=schema_fields)

        mixed_trainer = EmbeddingTrainer(Doc2VecModel(), Word2VecModel(),
                                         FastTextModel())
        self.assertRaises(TypeError, lambda: mixed_trainer.train(corpus))
# Example 6
    def test_knn_doc2vec_cls_local(self, mock_getcwd):
        """End-to-end KNNDoc2VecClassifier test against a local FSStore:
        construction errors, prediction, Annoy indexing, and save/load.

        `mock_getcwd` patches os.getcwd() to point inside the temp dir.
        """
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            corpus_generator = FakeCorpusGenerator()
            d2v = Doc2VecModel(size=10,
                               min_count=1,
                               dm=0,
                               alpha=0.025,
                               min_alpha=0.025)
            trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)

            # KNNDoc2VecClassifier only supports doc2vec now
            self.assertRaises(NotImplementedError,
                              lambda: KNNDoc2VecClassifier(Word2VecModel()))

            doc = docs.split(',')[0].split()

            # k must be positive for prediction to work.
            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
            self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=1)
            soc_cls = SocClassifier(knn)

            # SocClassifier should delegate to the wrapped KNN model.
            assert (knn.predict_soc([doc])[0][0] ==
                    soc_cls.predict_soc([doc])[0][0])

            # Build Annoy index
            knn.build_ann_indexer(num_trees=5)
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Save; the in-memory indexer should survive saving.
            model_storage.save_model(knn, knn.model_name)
            assert set(os.listdir(os.getcwd())) == {knn.model_name}
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Load
            new_knn = model_storage.load_model(knn.model_name)
            assert new_knn.model_name == knn.model_name
            assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

            # Have to re-build the index whenever one loads the knn model into memory
            assert new_knn.indexer is None