def test_embedding_trainer_word2vec_local(self, mock_getcwd): document_schema_fields = [ 'description', 'experienceRequirements', 'qualifications', 'skills' ] with tempfile.TemporaryDirectory() as td: mock_getcwd.return_value = td job_postings_generator = JobPostingCollectionSample(num_records=30) corpus_generator = Word2VecGensimCorpusCreator( job_postings_generator, document_schema_fields=document_schema_fields) w2v = Word2VecModel(storage=FSStore(td), size=10, min_count=3, iter=4, window=6, workers=3) trainer = EmbeddingTrainer(corpus_generator, w2v) trainer.train() trainer.save_model() vocab_size = len(w2v.wv.vocab.keys()) assert w2v.model_name == trainer.model_name assert set(os.listdir(os.getcwd())) == set([trainer.model_name]) # Test Online Training job_postings_generator = JobPostingCollectionSample(num_records=50) corpus_generator = Word2VecGensimCorpusCreator( job_postings_generator, document_schema_fields=document_schema_fields) w2v_loaded = Word2VecModel.load(FSStore(td), w2v.model_name) new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded) new_trainer.train() new_trainer.save_model() new_vocab_size = len(w2v_loaded.wv.vocab.keys()) assert set(os.listdir(os.getcwd())) == set( [trainer.model_name, new_trainer.model_name]) assert new_trainer.metadata['embedding_trainer'][ 'model_name'] != trainer.metadata['embedding_trainer'][ 'model_name'] assert vocab_size <= new_vocab_size # Save as different name w2v.save('other_name.model') assert set(os.listdir(os.getcwd())) == set([ trainer.model_name, new_trainer.model_name, 'other_name.model' ]) # Change the store directory new_path = os.path.join(td, 'other_directory') new_trainer.save_model(FSStore(new_path)) assert set(os.listdir(new_path)) == set([new_trainer.model_name])
def test_embedding_trainer_word2vec_s3(self): client = boto3.client('s3') client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write') s3_path = f"s3://fake-open-skills/model_cache/embedding" s3_storage = S3Store(path=s3_path) document_schema_fields = [ 'description', 'experienceRequirements', 'qualifications', 'skills' ] job_postings_generator = JobPostingCollectionSample(num_records=30) corpus_generator = Word2VecGensimCorpusCreator( job_postings_generator, document_schema_fields=document_schema_fields) w2v = Word2VecModel(storage=s3_storage, size=10, min_count=3, iter=4, window=6, workers=3) trainer = EmbeddingTrainer(corpus_generator, w2v) trainer.train() trainer.save_model() vocab_size = len(w2v.wv.vocab.keys()) s3 = s3fs.S3FileSystem() files = [f.split('/')[-1] for f in s3.ls(s3_path)] assert w2v.model_name == trainer.model_name assert set(files) == set([trainer.model_name]) # Test online training job_postings_generator = JobPostingCollectionSample(num_records=50) corpus_generator = Word2VecGensimCorpusCreator( job_postings_generator, document_schema_fields=document_schema_fields) w2v_loaded = Word2VecModel.load(s3_storage, w2v.model_name) new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded) new_trainer.train() new_trainer.save_model() new_vocab_size = len(w2v_loaded.wv.vocab.keys()) s3 = s3fs.S3FileSystem() files = [f.split('/')[-1] for f in s3.ls(s3_path)] assert set(files) == set([new_trainer.model_name, trainer.model_name]) assert new_trainer.metadata['embedding_trainer'][ 'model_name'] != trainer.metadata['embedding_trainer']['model_name'] assert vocab_size <= new_vocab_size # Save as different name w2v.save('other_name.model') s3 = s3fs.S3FileSystem() files = [f.split('/')[-1] for f in s3.ls(s3_path)] assert set(files) == set( [trainer.model_name, new_trainer.model_name, 'other_name.model']) # Change the store directory new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory" new_trainer.save_model(S3Store(new_s3_path)) s3 = s3fs.S3FileSystem() files = [f.split('/')[-1] for f in s3.ls(new_s3_path)] assert set(files) == set([new_trainer.model_name])
import multiprocessing num_of_worker = multiprocessing.cpu_count() job_samples = JobPostingCollectionSample() job_postings = list(job_samples) random.shuffle(job_postings) train_data = job_postings[:30] test_data = job_postings[30:] train_bytes = json.dumps(train_data).encode() test_bytes = json.dumps(test_data).encode() logging.info("Loading Embedding Model") w2v = Word2VecModel.load(storage=FSStore('tmp'), model_name='your-embedding-model') full_soc = FullSOC() def basic_filter(doc): """ Return the document except for the document which soc is unknown or empty or not in the soc code pool of current O*Net version """ if full_soc.filter_func( doc) and doc['onet_soc_code'] in full_soc.onet.all_soc: return doc else: return None