Beispiel #1
0
    def test_knn_doc2vec_cls_s3(self):
        """Check KNNDoc2VecClassifier against an S3-backed ModelStorage.

        Walks through constructor validation, prediction parity with
        SocClassifier, building the Annoy index, and a save/load round trip.
        """
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = "s3://fake-open-skills/model_cache/soc_classifiers"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)
        corpus_generator = FakeCorpusGenerator()
        # Both embeddings below are built with the same hyperparameters
        d2v_params = dict(size=10,
                          min_count=1,
                          dm=0,
                          alpha=0.025,
                          min_alpha=0.025)

        # Embedding trained with lookup=False has no lookup_dict
        d2v = Doc2VecModel(**d2v_params)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=False)

        with self.assertRaises(ValueError):
            KNNDoc2VecClassifier(embedding_model=d2v)

        d2v = Doc2VecModel(**d2v_params)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        # KNNDoc2VecClassifier only supports doc2vec now
        with self.assertRaises(NotImplementedError):
            KNNDoc2VecClassifier(Word2VecModel())

        doc = docs.split(',')[0].split()

        # k=0 is accepted by the constructor but rejected at prediction time
        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
        with self.assertRaises(ValueError):
            knn.predict_soc([doc])

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=10)
        soc_cls = SocClassifier(knn)

        assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

        # Build Annoy index
        knn.build_ann_indexer(num_trees=5)
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Save
        s3 = s3fs.S3FileSystem()
        model_storage.save_model(knn, knn.model_name)
        files = {f.split('/')[-1] for f in s3.ls(s3_path)}
        assert files == {knn.model_name}

        # Load
        new_knn = model_storage.load_model(knn.model_name)
        assert new_knn.model_name == knn.model_name
        assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

        # The index has to be re-built whenever the knn model is loaded into memory
        assert new_knn.indexer is None
Beispiel #2
0
    def test_with_grid_search(self):
        """Fit, save and reload a GridSearchCV wrapped in ProxyObjectWithStorage.

        Checks that the reloaded proxy keeps the storage path and that the
        proxy's predictions match those of the wrapped estimator.
        """
        import boto3
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import GridSearchCV

        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(s3)

        gs = GridSearchCV(RandomForestClassifier(), {})
        proxy_gs = ProxyObjectWithStorage(model_obj=gs,
                                          storage=s3,
                                          model_name='rf.grid')

        X = np.random.rand(20, 2)
        y = np.random.randint(2, size=20)

        proxy_gs.fit(X, y)
        model_storage.save_model(proxy_gs, 'rf.grid')

        loaded_proxy_gs = model_storage.load_model('rf.grid')

        assert loaded_proxy_gs.storage.path == s3.path
        # array_equal compares the prediction arrays as a whole instead of
        # relying on the truth value of an element-wise `==` result, which is
        # only defined for single-element arrays.
        assert np.array_equal(proxy_gs.predict([[5, 6]]),
                              gs.predict([[5, 6]]))
        # NOTE(review): loaded_proxy_gs is never used for prediction; consider
        # also comparing its output against gs.predict -- confirm intent.
Beispiel #3
0
    def test_with_iterable_pipelin(self):
        """Pickle an IterablePipeline whose step references a stored model.

        After an in-memory pickle round trip the SerializedByStorage wrapper
        is expected to hold no model, keep its storage path, and still let the
        pipeline produce output.
        """
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3 = S3Store('fake-open-skills/models')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')

        model_storage.save_model(fake, fake.model_name)
        vectorize_for_pipeline = partial(
            nlp.vectorize,
            embedding_model=SerializedByStorage(storage=s3,
                                                model_name=fake.model_name,
                                                model=fake))
        pipe = IterablePipeline(vectorize_for_pipeline)

        pipe_unpickled = pickle.loads(pickle.dumps(pipe))
        embedding_ref = pipe_unpickled.functions[-1].keywords[
            'embedding_model']
        # make sure the fake model wasn't pickled but the reference
        assert embedding_ref._model is None
        assert embedding_ref.storage.path == s3.path
        # The model will be loaded when it's needed
        assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
Beispiel #4
0
    def test_embedding_trainer_word2vec_local(self, mock_getcwd):
        """Train, save, retrain and re-save a Word2Vec model on the local filesystem.

        Args:
            mock_getcwd: stand-in whose ``return_value`` is set to the temp
                directory; presumably a patch of ``os.getcwd`` injected by a
                decorator that is not visible here -- TODO confirm.
        """
        schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        def make_corpus(num_records):
            # Build a corpus creator over a fresh sample of job postings
            postings = JobPostingCollectionSample(num_records=num_records)
            return Word2VecGensimCorpusCreator(
                postings, document_schema_fields=schema_fields)

        def files_in_cwd():
            return set(os.listdir(os.getcwd()))

        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            corpus_generator = make_corpus(30)
            w2v = Word2VecModel(size=10,
                                min_count=3,
                                iter=4,
                                window=6,
                                workers=3)

            trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
            trainer.train()
            trainer.save_model()

            vocab_size = len(w2v.wv.vocab.keys())

            assert w2v.model_name == trainer.model_name
            assert files_in_cwd() == {trainer.model_name}

            # Online training: continue from the stored model on a larger sample
            corpus_generator = make_corpus(50)
            w2v_loaded = model_storage.load_model(w2v.model_name)

            new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded,
                                           model_storage)
            new_trainer.train()
            new_trainer.save_model()

            new_vocab_size = len(w2v_loaded.wv.vocab.keys())

            assert files_in_cwd() == {
                trainer.model_name, new_trainer.model_name
            }
            first_name = trainer.metadata['embedding_trainer']['model_name']
            second_name = new_trainer.metadata['embedding_trainer'][
                'model_name']
            assert second_name != first_name
            assert vocab_size <= new_vocab_size

            # Saving under a different name adds a third file
            model_storage.save_model(w2v, 'other_name.model')
            assert files_in_cwd() == {
                trainer.model_name, new_trainer.model_name, 'other_name.model'
            }

            # Saving to another store directory writes only there
            new_path = os.path.join(td, 'other_directory')
            new_trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == {new_trainer.model_name}
Beispiel #5
0
 def save(self):
     """Persist this object through a ModelStorage built on ``self.storage``.

     The object is saved under ``self.model_name``.

     Raises:
         AttributeError: if ``self.storage`` or ``self.model_name`` is falsy.
     """
     # Validate both attributes up front so the error names the real culprit
     # and interpolates its actual value.
     if not self.storage:
         raise AttributeError(
             f"'self.storage' shouldn't be {self.storage!r}")
     if not self.model_name:
         raise AttributeError(
             f"'self.model_name' shouldn't be {self.model_name!r}")

     ModelStorage(self.storage).save_model(self, self.model_name)
Beispiel #6
0
 def test_model_storage(self, mock_getcwd):
     """Round-trip a FakeModel through a filesystem-backed ModelStorage.

     Args:
         mock_getcwd: stand-in whose ``return_value`` is set to the temp
             directory; presumably a patch of ``os.getcwd`` -- TODO confirm.
     """
     with tempfile.TemporaryDirectory() as workdir:
         mock_getcwd.return_value = workdir
         storage = ModelStorage(FSStore(workdir))
         original = FakeModel(1)

         storage.save_model(original, 'test.model')
         saved_files = set(os.listdir(os.getcwd()))
         assert saved_files == {'test.model'}

         restored = storage.load_model('test.model')
         assert restored.val == original.val
    def save_model(self, storage=None):
        """Save every model in ``self._models`` under its own ``model_name``.

        Args:
            storage: optional storage backend. When truthy it is wrapped in a
                new ModelStorage; otherwise ``self.model_storage`` is used.
        """
        target = ModelStorage(storage) if storage else self.model_storage

        for mdl in self._models:
            # Point the model at the destination backend before writing it
            mdl.storage = target.storage
            target.save_model(mdl, mdl.model_name)
            logging.info(
                f"{mdl.model_name} has been stored to {target.storage.path}.")
Beispiel #8
0
 def save_model(self, storage=None):
     """Save ``self._model`` under ``self.model_name``.

     Args:
         storage: optional storage backend. When not None it is wrapped in a
             new ModelStorage; otherwise ``self.model_storage`` is used.

     Raises:
         AttributeError: if both ``storage`` and ``self.model_storage`` are
             None.
     """
     if storage is not None:
         target = ModelStorage(storage)
     elif self.model_storage is not None:
         target = self.model_storage
     else:
         raise AttributeError(
             "'self.model_storage' should not be None if you want to save the model"
         )

     target.save_model(self._model, self.model_name)
     logging.info(
         f"{self.model_name} has been stored to {target.storage.path}.")
    def test_embedding_trainer_doc2vec_s3(self):
        """Train a Doc2Vec model and save, rename, reload and relocate it on S3."""

        def stored_names(path):
            # A new S3FileSystem is created for every listing; presumably this
            # avoids reusing a cached directory listing -- TODO confirm.
            return {f.split('/')[-1] for f in s3fs.S3FileSystem().ls(path)}

        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = "s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Doc2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        d2v = Doc2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)

        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)
        trainer.save_model()

        assert d2v.model_name == trainer._models[0].model_name
        assert stored_names(s3_path) == {trainer._models[0].model_name}
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Save as different name
        model_storage.save_model(d2v, 'other_name.model')
        assert stored_names(s3_path) == {
            trainer._models[0].model_name, 'other_name.model'
        }

        # Load
        d2v_loaded = model_storage.load_model(trainer._models[0].model_name)
        loaded_params = d2v_loaded.metadata['embedding_model'][
            'hyperparameters']
        trained_params = trainer._models[0].metadata['embedding_model'][
            'hyperparameters']
        assert loaded_params['vector_size'] == trained_params['vector_size']

        # Change the store directory
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        trainer.save_model(S3Store(new_s3_path))
        assert stored_names(new_s3_path) == {trainer._models[0].model_name}
    def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
        """Train a Doc2Vec model and save, rename, reload and relocate it locally.

        Args:
            mock_getcwd: stand-in whose ``return_value`` is set to the temp
                directory; presumably a patch of ``os.getcwd`` injected by a
                decorator that is not visible here -- TODO confirm.
        """
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))

            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Doc2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            d2v = Doc2VecModel(size=10,
                               min_count=3,
                               iter=4,
                               window=6,
                               workers=3)

            trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)
            trainer.save_model()

            assert d2v.model_name == trainer._models[0].model_name
            assert set(os.listdir(os.getcwd())) == {
                trainer._models[0].model_name
            }
            self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

            # Save as different name
            model_storage.save_model(d2v, 'other_name.model')
            assert set(os.listdir(os.getcwd())) == {
                trainer._models[0].model_name, 'other_name.model'
            }

            # Load
            d2v_loaded = model_storage.load_model(
                trainer._models[0].model_name)
            # The trainer metadata keeps one entry per model; compare against
            # the first one.
            trained_models = trainer.metadata["embedding_trainer"]['models']
            first_trained = list(trained_models.values())[0]
            assert d2v_loaded.metadata["embedding_model"][
                "model_type"] == first_trained['embedding_model']['model_type']

            # Change the store directory
            new_path = os.path.join(td, 'other_directory')
            trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == {
                trainer._models[0].model_name
            }
Beispiel #11
0
    def test_with_iterable_pipeline(self):
        """Pickle an IterablePipeline to S3 and run the copy read back from it.

        The pipeline step wraps a ProxyObjectWithStorage around a FakeModel in
        SerializedByStorage.
        """
        import boto3
        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket='fake-open-skills',
                                ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(s3)

        proxy_fake = ProxyObjectWithStorage(model_obj=FakeModel('fake'),
                                            storage=s3,
                                            model_name='fake')
        model_storage.save_model(proxy_fake, proxy_fake.model_name)

        serialized_model = SerializedByStorage(
            model=proxy_fake, model_name=proxy_fake.model_name)
        vectorize_for_pipeline = partial(nlp.vectorize,
                                         embedding_model=serialized_model)
        pipe = IterablePipeline(vectorize_for_pipeline)

        # Round-trip the pickled pipeline through the S3 store
        s3.write(pickle.dumps(pipe), 'fake.pipe')
        pipe_unpickled = pickle.loads(s3.load('fake.pipe'))

        output = list(pipe_unpickled([1]))
        assert output == [[1, 2, 3, 4]]
Beispiel #12
0
    def test_knn_doc2vec_cls_local(self, mock_getcwd):
        """Check KNNDoc2VecClassifier against a filesystem-backed ModelStorage.

        Walks through constructor validation, prediction parity with
        SocClassifier, building the Annoy index, and a save/load round trip.

        Args:
            mock_getcwd: stand-in whose ``return_value`` is set to the temp
                directory; presumably a patch of ``os.getcwd`` injected by a
                decorator that is not visible here -- TODO confirm.
        """
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            corpus_generator = FakeCorpusGenerator()
            d2v = Doc2VecModel(size=10,
                               min_count=1,
                               dm=0,
                               alpha=0.025,
                               min_alpha=0.025)
            trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)

            # KNNDoc2VecClassifier only supports doc2vec now
            with self.assertRaises(NotImplementedError):
                KNNDoc2VecClassifier(Word2VecModel())

            doc = docs.split(',')[0].split()

            # k=0 is accepted by the constructor but rejected at prediction time
            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
            with self.assertRaises(ValueError):
                knn.predict_soc([doc])

            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=1)
            soc_cls = SocClassifier(knn)

            knn_prediction = knn.predict_soc([doc])[0][0]
            wrapped_prediction = soc_cls.predict_soc([doc])[0][0]
            assert knn_prediction == wrapped_prediction

            # Build Annoy index
            knn.build_ann_indexer(num_trees=5)
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Save
            model_storage.save_model(knn, knn.model_name)
            assert set(os.listdir(os.getcwd())) == {knn.model_name}
            # Saving must leave the in-memory index in place
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Load
            new_knn = model_storage.load_model(knn.model_name)
            assert new_knn.model_name == knn.model_name
            assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

            # The index has to be re-built whenever the knn model is loaded into memory
            assert new_knn.indexer is None
Beispiel #13
0
    def test_save_load(self):
        """Check that a ProxyObjectWithStorage survives pickling and a storage round trip."""
        import boto3
        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket='fake-open-skills',
                                ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')

        model_storage.save_model(fake, fake.model_name)
        proxy_fake = ProxyObjectWithStorage(model_obj=fake,
                                            storage=s3,
                                            model_name=fake.model_name)

        assert proxy_fake.storage == s3

        # In-memory pickle round trip keeps the wrapped model's value
        pickled_proxy = pickle.dumps(proxy_fake)
        proxy_fake_unpickled = pickle.loads(pickled_proxy)
        assert proxy_fake_unpickled.val == proxy_fake.val

        # Round trip through the model storage under a prefixed name
        proxy_name = 'proxy_' + proxy_fake.model_name
        model_storage.save_model(proxy_fake, proxy_name)
        proxy_fake_loaded = model_storage.load_model(
            'proxy_' + proxy_fake.model_name)

        assert proxy_fake_loaded.val == proxy_fake.val == fake.val
Beispiel #14
0
    def test_pickle_s3(self):
        """Pickle SerializedByStorage wrappers to S3, with and without a storage.

        With a storage the unpickled wrapper is expected to hold no model but
        still expose its attributes; without one the model is expected to be
        pickled along with the wrapper.
        """
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
        s3 = S3Store('fake-open-skills/models')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')
        model_storage.save_model(fake, fake.model_name)

        s_fake = SerializedByStorage(fake, s3, fake.model_name)
        s3.write(pickle.dumps(s_fake), 'fake.pickle')
        fake_unpickled = pickle.loads(s3.load('fake.pickle'))
        # make sure the fake model wasn't pickled but the reference
        assert fake_unpickled._model is None
        assert fake_unpickled.storage.path == s3.path
        assert fake_unpickled.val == fake.val

        # if the object to be pickled doesn't have storage attribute and didn't provide the storage
        # to SerializedByStorage, it will be serialized normally
        s_fake = SerializedByStorage(model=fake, model_name=fake.model_name)
        s3.write(pickle.dumps(s_fake), 'fake.pickle')
        fake_unpickled = pickle.loads(s3.load('fake.pickle'))
        assert fake_unpickled._model is not None
Beispiel #15
0
    def test_embedding_trainer_word2vec_s3(self):
        """Train, save, retrain and re-save a Word2Vec model on S3."""
        schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        def make_corpus(num_records):
            # Build a corpus creator over a fresh sample of job postings
            postings = JobPostingCollectionSample(num_records=num_records)
            return Word2VecGensimCorpusCreator(
                postings, document_schema_fields=schema_fields)

        def stored_names(path):
            # A new S3FileSystem is created for every listing; presumably this
            # avoids reusing a cached directory listing -- TODO confirm.
            return {f.split('/')[-1] for f in s3fs.S3FileSystem().ls(path)}

        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = "s3://fake-open-skills/model_cache/embedding"
        model_storage = ModelStorage(S3Store(path=s3_path))

        corpus_generator = make_corpus(30)
        w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)

        trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
        trainer.train()
        trainer.save_model()

        vocab_size = len(w2v.wv.vocab.keys())

        assert w2v.model_name == trainer.model_name
        assert stored_names(s3_path) == {trainer.model_name}

        # Online training: continue from the stored model on a larger sample
        corpus_generator = make_corpus(50)
        w2v_loaded = model_storage.load_model(w2v.model_name)

        new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded,
                                       model_storage)
        new_trainer.train()
        new_trainer.save_model()

        new_vocab_size = len(w2v_loaded.wv.vocab.keys())

        assert stored_names(s3_path) == {
            new_trainer.model_name, trainer.model_name
        }
        first_name = trainer.metadata['embedding_trainer']['model_name']
        second_name = new_trainer.metadata['embedding_trainer']['model_name']
        assert second_name != first_name
        assert vocab_size <= new_vocab_size

        # Saving under a different name adds a third object
        model_storage.save_model(w2v, 'other_name.model')
        assert stored_names(s3_path) == {
            trainer.model_name, new_trainer.model_name, 'other_name.model'
        }

        # Saving to another store directory writes only there
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        new_trainer.save_model(S3Store(new_s3_path))
        assert stored_names(new_s3_path) == {new_trainer.model_name}