def test_with_grid_search(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(s3)

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    gs = GridSearchCV(RandomForestClassifier(), {})
    proxy_gs = ProxyObjectWithStorage(model_obj=gs, storage=s3, model_name='rf.grid')

    X = np.random.rand(20, 2)
    y = np.random.randint(2, size=20)
    proxy_gs.fit(X, y)

    model_storage.save_model(proxy_gs, 'rf.grid')
    loaded_proxy_gs = model_storage.load_model('rf.grid')

    assert loaded_proxy_gs.storage.path == s3.path
    assert proxy_gs.predict([[5, 6]]) == gs.predict([[5, 6]])
def test_knn_doc2vec_cls_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/soc_classifiers"
    s3_storage = S3Store(path=s3_path)
    model_storage = ModelStorage(s3_storage)
    corpus_generator = FakeCorpusGenerator()

    # Embedding has no lookup_dict
    d2v = Doc2VecModel(size=10, min_count=1, dm=0, alpha=0.025, min_alpha=0.025)
    trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
    trainer.train(corpus_generator, lookup=False)
    self.assertRaises(ValueError, lambda: KNNDoc2VecClassifier(embedding_model=d2v))

    d2v = Doc2VecModel(size=10, min_count=1, dm=0, alpha=0.025, min_alpha=0.025)
    trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
    trainer.train(corpus_generator, lookup=True)

    # KNNDoc2VecClassifier only supports doc2vec now
    self.assertRaises(NotImplementedError, lambda: KNNDoc2VecClassifier(Word2VecModel()))

    doc = docs.split(',')[0].split()

    knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
    self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

    knn = KNNDoc2VecClassifier(embedding_model=d2v, k=10)
    soc_cls = SocClassifier(knn)
    assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

    # Build Annoy index
    knn.build_ann_indexer(num_trees=5)
    assert isinstance(knn.indexer, AnnoyIndexer)

    # Save
    s3 = s3fs.S3FileSystem()
    model_storage.save_model(knn, knn.model_name)
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([knn.model_name])

    # Load
    new_knn = model_storage.load_model(knn.model_name)
    assert new_knn.model_name == knn.model_name
    assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

    # The Annoy index has to be re-built whenever the KNN model is loaded into memory
    assert new_knn.indexer is None
def test_with_iterable_pipeline(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills/models')
    model_storage = ModelStorage(storage=s3)
    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)

    vectorize_for_pipeline = partial(
        nlp.vectorize,
        embedding_model=SerializedByStorage(storage=s3, model_name=fake.model_name, model=fake))
    pipe = IterablePipeline(vectorize_for_pipeline)
    pipe_unpickled = pickle.loads(pickle.dumps(pipe))

    # Make sure only the storage reference was pickled, not the fake model itself
    assert pipe_unpickled.functions[-1].keywords['embedding_model']._model is None
    assert pipe_unpickled.functions[-1].keywords['embedding_model'].storage.path == s3.path
    # The model will be loaded when it's needed
    assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
def test_embedding_trainer_word2vec_local(self, mock_getcwd):
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)

        trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
        trainer.train()
        trainer.save_model()

        vocab_size = len(w2v.wv.vocab.keys())

        assert w2v.model_name == trainer.model_name
        assert set(os.listdir(os.getcwd())) == set([trainer.model_name])

        # Test Online Training
        job_postings_generator = JobPostingCollectionSample(num_records=50)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        w2v_loaded = model_storage.load_model(w2v.model_name)
        new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded, model_storage)
        new_trainer.train()
        new_trainer.save_model()
        new_vocab_size = len(w2v_loaded.wv.vocab.keys())

        assert set(os.listdir(os.getcwd())) == set(
            [trainer.model_name, new_trainer.model_name])
        assert (new_trainer.metadata['embedding_trainer']['model_name']
                != trainer.metadata['embedding_trainer']['model_name'])
        assert vocab_size <= new_vocab_size

        # Save as different name
        model_storage.save_model(w2v, 'other_name.model')
        assert set(os.listdir(os.getcwd())) == set([
            trainer.model_name, new_trainer.model_name, 'other_name.model'
        ])

        # Change the store directory
        new_path = os.path.join(td, 'other_directory')
        new_trainer.save_model(FSStore(new_path))
        assert set(os.listdir(new_path)) == set([new_trainer.model_name])
def save(self):
    if self.storage:
        model_storage = ModelStorage(self.storage)
        if self.model_name:
            model_storage.save_model(self, self.model_name)
        else:
            raise AttributeError(f"'self.model_name' shouldn't be {self.model_name}")
    else:
        raise AttributeError(f"'self.storage' shouldn't be {self.storage}")
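# A minimal usage sketch for save(), assuming the embedding object exposes
# settable `storage` and `model_name` attributes (the store path and model
# name below are illustrative, not taken from the original):
#
#     w2v = Word2VecModel(size=10)
#     w2v.storage = FSStore('/tmp/models')
#     w2v.model_name = 'w2v_example.model'
#     w2v.save()  # delegates to ModelStorage(w2v.storage).save_model(w2v, w2v.model_name)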
def test_model_storage(self, mock_getcwd):
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        ms = ModelStorage(FSStore(td))
        fake = FakeModel(1)
        ms.save_model(fake, 'test.model')
        assert set(os.listdir(os.getcwd())) == set(['test.model'])

        new_model = ms.load_model('test.model')
        assert new_model.val == fake.val
def save_model(self, storage=None):
    if storage:
        ms = ModelStorage(storage)
    else:
        ms = self.model_storage
    for model in self._models:
        model.storage = ms.storage
        ms.save_model(model, model.model_name)
        logging.info(f"{model.model_name} has been stored to {ms.storage.path}.")
def save_model(self, storage=None):
    if storage is None:
        if self.model_storage is None:
            raise AttributeError(
                "'self.model_storage' should not be None if you want to save the model")
        ms = self.model_storage
        ms.save_model(self._model, self.model_name)
    else:
        ms = ModelStorage(storage)
        ms.save_model(self._model, self.model_name)
    logging.info(f"{self.model_name} has been stored to {ms.storage.path}.")
def test_embedding_trainer_doc2vec_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)
    model_storage = ModelStorage(s3_storage)

    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    job_postings_generator = JobPostingCollectionSample(num_records=30)
    corpus_generator = Doc2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)
    d2v = Doc2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)

    trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
    trainer.train(corpus_generator, lookup=True)
    trainer.save_model()

    vocab_size = len(d2v.wv.vocab.keys())

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert d2v.model_name == trainer._models[0].model_name
    assert set(files) == set([trainer._models[0].model_name])
    print(trainer.lookup_dict)
    print(d2v.lookup_dict)
    self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

    # Save as different name
    model_storage.save_model(d2v, 'other_name.model')
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([trainer._models[0].model_name, 'other_name.model'])

    # Load
    d2v_loaded = model_storage.load_model(trainer._models[0].model_name)
    assert (d2v_loaded.metadata['embedding_model']['hyperparameters']['vector_size']
            == trainer._models[0].metadata['embedding_model']['hyperparameters']['vector_size'])

    # Change the store directory
    new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
    trainer.save_model(S3Store(new_s3_path))
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
    assert set(files) == set([trainer._models[0].model_name])
def test_embedding_trainer_fasttext_local(self, mock_getcwd):
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        fasttext = FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3)

        trainer = EmbeddingTrainer(fasttext, model_storage=model_storage)
        trainer.train(corpus_generator)
        trainer.save_model()

        vocab_size = len(fasttext.wv.vocab.keys())

        assert fasttext.model_name == trainer._models[0].model_name
        assert set(os.listdir(os.getcwd())) == set([trainer._models[0].model_name])

        # Test Online Training
        job_postings_generator = JobPostingCollectionSample(num_records=50)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        fasttext_loaded = model_storage.load_model(fasttext.model_name)
        new_trainer = EmbeddingTrainer(fasttext_loaded, model_storage=model_storage)
        new_trainer.train(corpus_generator)
        new_trainer.save_model()
        new_vocab_size = len(fasttext_loaded.wv.vocab.keys())

        assert set(os.listdir(os.getcwd())) == set([
            trainer._models[0].model_name, new_trainer._models[0].model_name
        ])
        assert (new_trainer.metadata['embedding_trainer']['models']
                != trainer.metadata['embedding_trainer']['models'])
        assert vocab_size <= new_vocab_size
def test_combined_cls_local(self, mock_getcwd):
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        jobpostings = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(jobpostings, raw=True)
        w2v = Word2VecModel(size=10, min_count=0, alpha=0.025, min_alpha=0.025)
        trainer = EmbeddingTrainer(w2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        matrix = DesignMatrix(jobpostings, self.major_group, self.pipe_x, self.pipe_y)
        matrix.build()

        X = matrix.X
        rf = ProxyObjectWithStorage(RandomForestClassifier(), None, None, matrix.target_variable)
        rf.fit(X, matrix.y)

        proxy_rf = ProxyObjectWithStorage(rf, None, None, matrix.target_variable)

        # Remove the last step in the pipe_x;
        # the input of predict_soc should be tokenized words
        new_pipe_x = self.pipe_x
        new_pipe_x.generators.pop()

        new_matrix = DesignMatrix(JobPostingCollectionSample(), self.major_group, new_pipe_x)
        new_matrix.build()

        ccls = CombinedClassifier(w2v, rf)
        assert len(ccls.predict_soc([new_matrix.X[0]])[0]) == 2
def test_base_embedding(self):
    model_storage = ModelStorage()
    fake = FakeModel()
    self.assertRaises(AttributeError, lambda: fake.save())

    fake = FakeModel(model_storage=model_storage)
    self.assertRaises(AttributeError, lambda: fake.save())
def test_embedding_trainer_multicore_local(self, mock_getcwd):
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        job_postings_generator = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        trainer = EmbeddingTrainer(
            FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3),
            FastTextModel(size=10, min_count=3, iter=4, window=10, workers=3),
            Word2VecModel(size=10, workers=3, window=6),
            Word2VecModel(size=10, min_count=10, window=10, workers=3),
            model_storage=model_storage)
        trainer.train(corpus_generator, n_processes=4)
        trainer.save_model()

        assert set(os.listdir(os.getcwd())) == set(
            [model.model_name for model in trainer._models])
def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Doc2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        d2v = Doc2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)

        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)
        trainer.save_model()

        vocab_size = len(d2v.wv.vocab.keys())

        assert d2v.model_name == trainer._models[0].model_name
        assert set(os.listdir(os.getcwd())) == set([trainer._models[0].model_name])
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Save as different name
        model_storage.save_model(d2v, 'other_name.model')
        assert set(os.listdir(os.getcwd())) == set(
            [trainer._models[0].model_name, 'other_name.model'])

        # Load
        d2v_loaded = model_storage.load_model(trainer._models[0].model_name)
        assert (d2v_loaded.metadata['embedding_model']['model_type']
                == list(trainer.metadata['embedding_trainer']['models'].values())[0][
                    'embedding_model']['model_type'])

        # Change the store directory
        new_path = os.path.join(td, 'other_directory')
        trainer.save_model(FSStore(new_path))
        assert set(os.listdir(new_path)) == set([trainer._models[0].model_name])
def test_with_iterable_pipeline(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(s3)

    proxy_fake = ProxyObjectWithStorage(model_obj=FakeModel('fake'), storage=s3, model_name='fake')
    model_storage.save_model(proxy_fake, proxy_fake.model_name)

    vectorize_for_pipeline = partial(
        nlp.vectorize,
        embedding_model=SerializedByStorage(model=proxy_fake, model_name=proxy_fake.model_name))
    pipe = IterablePipeline(vectorize_for_pipeline)

    s3.write(pickle.dumps(pipe), 'fake.pipe')
    pipe_unpickled = pickle.loads(s3.load('fake.pipe'))

    assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
def test_knn_doc2vec_cls_local(self, mock_getcwd):
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        corpus_generator = FakeCorpusGenerator()

        d2v = Doc2VecModel(size=10, min_count=1, dm=0, alpha=0.025, min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        # KNNDoc2VecClassifier only supports doc2vec now
        self.assertRaises(NotImplementedError, lambda: KNNDoc2VecClassifier(Word2VecModel()))

        doc = docs.split(',')[0].split()

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
        self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=1)
        soc_cls = SocClassifier(knn)
        assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

        # Build Annoy index
        knn.build_ann_indexer(num_trees=5)
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Save
        model_storage.save_model(knn, knn.model_name)
        assert set(os.listdir(os.getcwd())) == set([knn.model_name])
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Load
        new_knn = model_storage.load_model(knn.model_name)
        assert new_knn.model_name == knn.model_name
        assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

        # The Annoy index has to be re-built whenever the KNN model is loaded into memory
        assert new_knn.indexer is None
def test_pickle_s3(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills/models')
    model_storage = ModelStorage(storage=s3)
    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)

    s_fake = SerializedByStorage(fake, s3, fake.model_name)
    s3.write(pickle.dumps(s_fake), 'fake.pickle')
    fake_unpickled = pickle.loads(s3.load('fake.pickle'))
    # Make sure only the storage reference was pickled, not the fake model itself
    assert fake_unpickled._model is None
    assert fake_unpickled.storage.path == s3.path
    assert fake_unpickled.val == fake.val

    # If the object to be pickled has no storage attribute and no storage was
    # provided to SerializedByStorage, it will be serialized normally
    s_fake = SerializedByStorage(model=fake, model_name=fake.model_name)
    s3.write(pickle.dumps(s_fake), 'fake.pickle')
    fake_unpickled = pickle.loads(s3.load('fake.pickle'))
    assert fake_unpickled._model is not None
def test_save_load(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(storage=s3)
    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)

    proxy_fake = ProxyObjectWithStorage(model_obj=fake, storage=s3, model_name=fake.model_name)
    assert proxy_fake.storage == s3

    proxy_fake_unpickled = pickle.loads(pickle.dumps(proxy_fake))
    assert proxy_fake_unpickled.val == proxy_fake.val

    model_storage.save_model(proxy_fake, 'proxy_' + proxy_fake.model_name)
    proxy_fake_loaded = model_storage.load_model('proxy_' + proxy_fake.model_name)
    assert proxy_fake_loaded.val == proxy_fake.val == fake.val
def test_embedding_trainer_multicore_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)
    model_storage = ModelStorage(s3_storage)

    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    job_postings_generator = JobPostingCollectionSample()
    corpus_generator = Word2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)

    trainer = EmbeddingTrainer(
        FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3),
        FastTextModel(size=10, min_count=3, iter=4, window=10, workers=3),
        Word2VecModel(size=10, workers=3, window=6),
        Word2VecModel(size=10, min_count=10, window=10, workers=3),
        model_storage=model_storage)
    trainer.train(corpus_generator)
    trainer.save_model()

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([model.model_name for model in trainer._models])
def test_embedding_trainer_word2vec_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)
    model_storage = ModelStorage(s3_storage)

    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    job_postings_generator = JobPostingCollectionSample(num_records=30)
    corpus_generator = Word2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)
    w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)

    trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
    trainer.train()
    trainer.save_model()

    vocab_size = len(w2v.wv.vocab.keys())

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert w2v.model_name == trainer.model_name
    assert set(files) == set([trainer.model_name])

    # Test online training
    job_postings_generator = JobPostingCollectionSample(num_records=50)
    corpus_generator = Word2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)

    w2v_loaded = model_storage.load_model(w2v.model_name)
    new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded, model_storage)
    new_trainer.train()
    new_trainer.save_model()
    new_vocab_size = len(w2v_loaded.wv.vocab.keys())

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([new_trainer.model_name, trainer.model_name])
    assert (new_trainer.metadata['embedding_trainer']['model_name']
            != trainer.metadata['embedding_trainer']['model_name'])
    assert vocab_size <= new_vocab_size

    # Save as different name
    model_storage.save_model(w2v, 'other_name.model')
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set(
        [trainer.model_name, new_trainer.model_name, 'other_name.model'])

    # Change the store directory
    new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
    new_trainer.save_model(S3Store(new_s3_path))
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
    assert set(files) == set([new_trainer.model_name])
job_samples = JobPostingCollectionSample()
job_postings = list(job_samples)
random.shuffle(job_postings)
train_data = job_postings[:30]
test_data = job_postings[30:]

train_bytes = json.dumps(train_data).encode()
test_bytes = json.dumps(test_data).encode()

logging.info("Loading Embedding Model")
model_storage = ModelStorage(FSStore('/your/model/path'))
w2v = model_storage.load_model(model_name='your_model_name')

full_soc = FullSOC()


def basic_filter(doc):
    """Return the document unless its SOC code is unknown, empty, or not in the
    SOC code pool of the current O*NET version.
    """
    if full_soc.filter_func(doc) and doc['onet_soc_code'] in full_soc.choices:
        return doc
    else:
        return None


class JobGenerator(object):
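# The JobGenerator body is cut off above. A minimal sketch of one way such a
# generator could be written, assuming it only needs to wrap the shuffled
# postings and yield those that pass basic_filter (the attribute and parameter
# names here are illustrative, not taken from the original):
#
#     class JobGenerator(object):
#         def __init__(self, data):
#             self.data = data
#
#         def __iter__(self):
#             for job in self.data:
#                 if basic_filter(job) is not None:
#                     yield job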