def test_embedding_trainer_word2vec_local(self, mock_getcwd):
    """Train a Word2Vec embedding locally, then online-train a reloaded copy.

    Verifies: model naming, files written to the store directory, distinct
    metadata between training runs, vocabulary growth after online training,
    saving under an alternate name, and saving into a different store.
    `mock_getcwd` redirects os.getcwd() into the temp directory so listdir
    asserts inspect the store's contents.
    """
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
        trainer.train()
        trainer.save_model()

        vocab_size = len(w2v.wv.vocab.keys())

        # Trainer and model should agree on the model's name, and the save
        # should have produced exactly that one file.
        assert w2v.model_name == trainer.model_name
        assert set(os.listdir(os.getcwd())) == set([trainer.model_name])

        # Test Online Training: reload the saved model and continue training
        # on a larger sample.
        job_postings_generator = JobPostingCollectionSample(num_records=50)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        w2v_loaded = model_storage.load_model(w2v.model_name)

        new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded, model_storage)
        new_trainer.train()
        new_trainer.save_model()

        new_vocab_size = len(w2v_loaded.wv.vocab.keys())

        # Both the original and the online-trained model files should exist,
        # under different names.
        assert set(os.listdir(os.getcwd())) == set(
            [trainer.model_name, new_trainer.model_name])
        assert new_trainer.metadata['embedding_trainer'][
            'model_name'] != trainer.metadata['embedding_trainer'][
                'model_name']
        # Online training can only grow (or keep) the vocabulary.
        assert vocab_size <= new_vocab_size

        # Save as different name
        model_storage.save_model(w2v, 'other_name.model')
        assert set(os.listdir(os.getcwd())) == set([
            trainer.model_name, new_trainer.model_name, 'other_name.model'
        ])

        # Change the store directory
        new_path = os.path.join(td, 'other_directory')
        new_trainer.save_model(FSStore(new_path))
        assert set(os.listdir(new_path)) == set([new_trainer.model_name])
def test_model_storage(self, mock_getcwd):
    """Saving and reloading a ModelStorage preserves its storage path."""
    with tempfile.TemporaryDirectory() as tmpdir:
        mock_getcwd.return_value = tmpdir
        model_storage = ModelStorage(FSStore(tmpdir))
        model_storage.save('test.model')
        # The save should leave exactly one artifact in the store directory.
        assert set(os.listdir(os.getcwd())) == {'test.model'}
        reloaded = model_storage.load(FSStore(tmpdir), 'test.model')
        self.assertEqual(model_storage.storage.path, reloaded.storage.path)
def test_combined_cls_local(self, mock_getcwd):
    """CombinedClassifier built from a locally trained embedding plus a
    random forest returns 2-tuples from predict_soc.

    Trains a Word2Vec embedding with lookup, builds a design matrix from the
    instance's pipelines, fits a proxied RandomForestClassifier, and checks
    that predicting on a tokenized-words matrix yields a length-2 result.
    """
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        jobpostings = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(jobpostings, raw=True)
        w2v = Word2VecModel(size=10, min_count=0, alpha=0.025, min_alpha=0.025)
        trainer = EmbeddingTrainer(w2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        matrix = DesignMatrix(jobpostings, self.major_group, self.pipe_x,
                              self.pipe_y)
        matrix.build()
        X = matrix.X

        rf = ProxyObjectWithStorage(RandomForestClassifier(), None, None,
                                    matrix.target_variable)
        rf.fit(X, matrix.y)
        # Fix: removed a second ProxyObjectWithStorage wrapper (`proxy_rf`)
        # that was constructed here but never used.

        # Remove the last step in the pipe_x:
        # the input of predict_soc should be tokenized words.
        # NOTE(review): this pops from self.pipe_x in place, mutating shared
        # fixture state — confirm no other test relies on the full pipeline.
        new_pipe_x = self.pipe_x
        new_pipe_x.generators.pop()
        new_matrix = DesignMatrix(JobPostingCollectionSample(),
                                  self.major_group, new_pipe_x)
        new_matrix.build()

        ccls = CombinedClassifier(w2v, rf)
        assert len(ccls.predict_soc([new_matrix.X[0]])[0]) == 2
def test_embedding_trainer_multicore_local(self, mock_getcwd):
    """Training several embedding models in parallel saves one file per model."""
    schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as tmpdir:
        mock_getcwd.return_value = tmpdir
        model_storage = ModelStorage(FSStore(tmpdir))
        postings = JobPostingCollectionSample()
        corpus = Word2VecGensimCorpusCreator(
            postings, document_schema_fields=schema_fields)
        # A mixed batch of FastText and Word2Vec configurations, trained
        # together by one EmbeddingTrainer.
        embeddings = [
            FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3),
            FastTextModel(size=10, min_count=3, iter=4, window=10, workers=3),
            Word2VecModel(size=10, workers=3, window=6),
            Word2VecModel(size=10, min_count=10, window=10, workers=3),
        ]
        trainer = EmbeddingTrainer(*embeddings, model_storage=model_storage)
        trainer.train(corpus, n_processes=4)
        trainer.save_model()
        expected_files = {model.model_name for model in trainer._models}
        assert set(os.listdir(os.getcwd())) == expected_files
def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
    """Train a Doc2Vec embedding locally and verify naming, lookup, save/load.

    Checks that the trainer and model agree on the model name, that the
    lookup dict built during training matches the model's, that saving under
    an alternate name and into a different store directory both work, and
    that a reloaded model's metadata matches the trainer's record.
    """
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Doc2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        d2v = Doc2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)
        trainer.save_model()
        # Fix: removed unused local `vocab_size` — it was computed here but
        # never asserted against anything.

        assert d2v.model_name == trainer._models[0].model_name
        assert set(os.listdir(os.getcwd())) == set(
            [trainer._models[0].model_name])
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Save as different name
        model_storage.save_model(d2v, 'other_name.model')
        assert set(os.listdir(os.getcwd())) == set(
            [trainer._models[0].model_name, 'other_name.model'])

        # Load: metadata of the reloaded model should match what the trainer
        # recorded for its first (only) model.
        d2v_loaded = model_storage.load_model(trainer._models[0].model_name)
        assert d2v_loaded.metadata["embedding_model"]["model_type"] == list(
            trainer.metadata["embedding_trainer"]['models'].values()
        )[0]['embedding_model']['model_type']

        # Change the store directory
        new_path = os.path.join(td, 'other_directory')
        trainer.save_model(FSStore(new_path))
        assert set(os.listdir(new_path)) == set(
            [trainer._models[0].model_name])
def test_embedding_trainer_fasttext_local(self, mock_getcwd):
    """Train a FastText embedding locally, then online-train a reloaded copy.

    Verifies model naming, the files written to the store, distinct metadata
    between the two training runs, and vocabulary growth after online
    training on a larger sample.
    """
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        # Unlike the word2vec test, the storage is attached directly to the
        # model rather than going through a ModelStorage wrapper.
        fasttext = FastTextModel(storage=FSStore(td),
                                 size=10,
                                 min_count=3,
                                 iter=4,
                                 window=6,
                                 workers=3)
        trainer = EmbeddingTrainer(corpus_generator, fasttext)
        trainer.train()
        trainer.save_model()

        vocab_size = len(fasttext.wv.vocab.keys())

        assert fasttext.model_name == trainer.model_name
        assert set(os.listdir(os.getcwd())) == set([trainer.model_name])

        # Test Online Training: reload the saved model and keep training on
        # a larger sample.
        job_postings_generator = JobPostingCollectionSample(num_records=50)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        fasttext_loaded = FastTextModel.load(FSStore(td), fasttext.model_name)
        new_trainer = EmbeddingTrainer(corpus_generator, fasttext_loaded)
        new_trainer.train()
        new_trainer.save_model()

        new_vocab_size = len(fasttext_loaded.wv.vocab.keys())

        # Both model files should exist under distinct names.
        assert set(os.listdir(os.getcwd())) == set(
            [trainer.model_name, new_trainer.model_name])
        assert new_trainer.metadata['embedding_trainer'][
            'model_name'] != trainer.metadata['embedding_trainer'][
                'model_name']
        # Online training can only grow (or keep) the vocabulary.
        assert vocab_size <= new_vocab_size
def test_knn_doc2vec_cls_local(self, mock_getcwd):
    """End-to-end KNNDoc2VecClassifier check: train, predict, index, save, load.

    Covers the doc2vec-only restriction, the k>0 requirement, agreement with
    the SocClassifier wrapper, Annoy index construction, persistence, and the
    fact that the index is not persisted with the model.
    """
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        corpus_generator = FakeCorpusGenerator()
        d2v = Doc2VecModel(storage=FSStore(td), size=10, min_count=1, dm=0,
                           alpha=0.025, min_alpha=0.025)
        trainer = EmbeddingTrainer(corpus_generator, d2v)
        trainer.train(True)

        # KNNDoc2VecClassifier only supports doc2vec now
        self.assertRaises(NotImplementedError,
                          lambda: KNNDoc2VecClassifier(Word2VecModel()))

        doc = docs.split(',')[0].split()

        # k must be positive for a nearest-neighbor lookup
        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
        self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=1)
        soc_cls = SocClassifier(knn)
        # The wrapper must delegate to the same prediction
        assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

        # Build Annoy index
        knn.build_ann_indexer(num_trees=5)
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Save: the model file appears and the in-memory index is untouched
        knn.save()
        assert set(os.listdir(os.getcwd())) == set([knn.model_name])
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Load
        new_knn = KNNDoc2VecClassifier.load(FSStore(td), knn.model_name)
        assert new_knn.model_name == knn.model_name
        assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

        # Have to re-build the index whenever one loads the knn model into
        # memory. Fix: use an identity check (`is None`) per PEP 8 instead of
        # the original `== None` equality comparison.
        assert new_knn.indexer is None
def test_model_storage(self, mock_getcwd):
    """save_model/load_model round-trip a FakeModel through ModelStorage."""
    with tempfile.TemporaryDirectory() as tmpdir:
        mock_getcwd.return_value = tmpdir
        model_storage = ModelStorage(FSStore(tmpdir))
        original = FakeModel(1)
        model_storage.save_model(original, 'test.model')
        # Exactly one artifact should have been written to the store.
        assert set(os.listdir(os.getcwd())) == {'test.model'}
        restored = model_storage.load_model('test.model')
        assert restored.val == original.val
def test_fsstore(self):
    """FSStore write/load/delete round-trip against the local filesystem."""
    with tempfile.TemporaryDirectory() as tmpdir:
        storage = FSStore(tmpdir)
        payload = pickle.dumps(FakeModel('val'))
        storage.write(payload, 'for_testing.model')

        # Fix: the original chained `os.path.isfile(...) == storage.exists(...)
        # == True` (and the matching == False) into separate plain asserts —
        # comparison to True/False is unidiomatic (PEP 8 / E712) and a chained
        # failure doesn't reveal which side (filesystem vs. store API) broke.
        assert os.path.isfile(os.path.join(tmpdir, 'for_testing.model'))
        assert storage.exists('for_testing.model')

        model_loaded = pickle.loads(storage.load('for_testing.model'))
        assert model_loaded.val == 'val'

        storage.delete('for_testing.model')
        assert not os.path.isfile(os.path.join(tmpdir, 'for_testing.model'))
        assert not storage.exists('for_testing.model')
def __init__(self,
             matrix,
             k_folds,
             grid_config=None,
             storage=None,
             random_state_for_split=None,
             scoring=None,
             n_jobs=3):
    """Set up a cross-validated classifier training run.

    Args:
        matrix: design matrix providing features and targets.
        k_folds (int): number of cross-validation folds.
        grid_config (dict, optional): hyperparameter grid; falls back to
            ``self.default_grid_config`` when None.
        storage (optional): persistence backend; defaults to a local FSStore.
        random_state_for_split (optional): seed for the train/test split.
        scoring (list, optional): sklearn scoring names; defaults to
            ``['accuracy']``.
        n_jobs (int): parallelism passed to the grid search.
    """
    self.matrix = matrix
    self.storage = FSStore() if storage is None else storage
    self.k_folds = k_folds
    self.n_jobs = n_jobs
    self.grid_config = self.default_grid_config if grid_config is None else grid_config
    self.cls_cv_result = {}
    # Fix: `scoring=['accuracy']` was a mutable default argument — one list
    # shared across every instance, so an in-place modification by any
    # instance would leak into all later ones. Build a fresh list per call.
    self.scoring = ['accuracy'] if scoring is None else scoring
    self.best_classifiers = {}
    self.random_state_for_split = random_state_for_split
    # Timestamp doubles as the directory name for saved training artifacts.
    self.train_time = datetime.today().isoformat()
def test_training_save(self, mock_getcwd):
    """train(save=True) writes artifacts into a directory named by train_time."""
    with tempfile.TemporaryDirectory() as tmpdir:
        mock_getcwd.return_value = tmpdir
        design_matrix = self.matrix
        assert design_matrix.target_variable.name == "major_group"
        trainer = OccupationClassifierTrainer(
            design_matrix,
            k_folds=2,
            storage=FSStore(tmpdir),
            grid_config=grid,
            scoring=['accuracy'],
        )
        trainer.train(save=True)
        assert set(os.listdir(os.getcwd())) == {trainer.train_time}
def test_fsstore(self):
    """PersistedJSONDict backed by FSStore: create, re-read, update, autosave.

    Exercises the full lifecycle against a real temp directory, checking the
    on-disk JSON after each save.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        storage = FSStore(tmpdir)
        # 1. Ensure that a new file is correctly created and saved to
        storage_one = PersistedJSONDict(storage, 'test.json')
        storage_one['key1'] = 'value1'
        storage_one['key2'] = {'nestedkey2': 'value2'}
        storage_one.save()
        assert json.load(open(os.path.join(tmpdir, 'test.json')))\
            == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}}
        # 2. Ensure that an existing file is correctly read, updated, and saved to
        storage_two = PersistedJSONDict(storage, 'test.json')
        assert 'key1' in storage_two
        assert storage_two['key1'] == 'value1'
        storage_two['key3'] = 'value3'
        storage_two.save()
        assert json.load(open(os.path.join(tmpdir, 'test.json')))\
            == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3'}
        # 3. Ensure that, in the same thread, updating and saving an old one
        # gets new changes too
        storage_one['key4'] = 'value4'
        storage_one.save()
        assert json.load(open(os.path.join(tmpdir, 'test.json')))\
            == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3', 'key4': 'value4'}
        # 4. test autosave - this will be the fourth update of this object,
        # so the write below should hit disk without an explicit save()
        storage_one.SAVE_EVERY_N_UPDATES = 4
        storage_one['key5'] = 'value5'
        assert json.load(open(os.path.join(tmpdir, 'test.json')))\
            == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3', 'key4': 'value4', 'key5': 'value5'}
        # 5. test length checking
        assert len(storage_one) == 5
        # 6. test iteration
        assert sorted(
            [(key, value) for key, value in storage_one.items()],
            key=lambda x: x[0]
        ) == [
            ('key1', 'value1'),
            ('key2', {'nestedkey2': 'value2'}),
            ('key3', 'value3'),
            ('key4', 'value4'),
            ('key5', 'value5')
        ]
def test_combined_cls_local(self, mock_getcwd):
    """CombinedClassifier from a locally trained embedding returns 2-tuples."""
    with tempfile.TemporaryDirectory() as tmpdir:
        mock_getcwd.return_value = tmpdir
        postings = list(JobPostingCollectionSample())
        corpus = Word2VecGensimCorpusCreator(postings, raw=True)
        embedding = Word2VecModel(storage=FSStore(tmpdir),
                                  size=10,
                                  min_count=0,
                                  alpha=0.025,
                                  min_alpha=0.025)
        trainer = EmbeddingTrainer(corpus, embedding)
        trainer.train(True)
        matrix = create_training_set(postings, SOCMajorGroup())
        # Embed the raw features before fitting the forest.
        features = EmbeddingTransformer(embedding).transform(matrix.X)
        forest = RandomForestClassifier()
        forest.fit(features, matrix.y)
        combined = CombinedClassifier(embedding, forest, matrix.target_variable)
        assert len(combined.predict_soc([matrix.X[0]])[0]) == 2
def test_onet_skill_extractor():
    """OnetSkillImportanceExtractor concatenates K/S/A/T downloads into one TSV.

    Feeds a mocked O*NET downloader fixed in-memory tables for Skills,
    Abilities, Knowledge, and Tools & Technology, runs the extractor against
    a temp FSStore, and checks the row count, the hashed `nlp_a` column, and
    the presence of the columns downstream code relies on.
    """
    # Fixture tables mirror the real O*NET file layouts: a header row
    # followed by data rows.
    skills_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'IM', '4.12',
            '8', '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'LV', '4.75',
            '8', '0.16', '4.43', '5.07', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'IM', '4.12', '8',
            '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'LV', '-4.88', '8',
            '0.23', '4.43', '5.32', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]
    abilities_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'IM', '4.50',
            '8', '0.19', '4.13', '4.87', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'LV', '4.88',
            '8', '0.13', '4.63', '5.12', 'N', 'Y', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]
    knowledge_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'IM',
            '4.75', '27', '0.09', '4.56', '4.94', 'N', 'n/a', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'LV',
            '6.23', '27', '0.17', '5.88', '6.57', 'N', 'N', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'IM', '2.66', '27', '0.22',
            '2.21', '3.11', 'N', 'n/a', '07/2014', 'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'LV', '3.50', '27', '0.41',
            '2.66', '4.34', 'N', 'N', '07/2014', 'Incumbent'
        ],
    ]
    tools_content = [
        [
            'O*NET-SOC Code', 'T2 Type', 'T2 Example', 'Commodity Code',
            'Commodity Title'
        ],
        [
            '11-1011.00', 'Tools', '10-key calculators', '44101809',
            'Desktop calculator'
        ],
        [
            '11-1011.00', 'Tools', 'Desktop computers', '43211507',
            'Desktop computers'
        ],
        [
            '11-1011.00', 'Tools', 'Laptop computers', '43211503',
            'Notebook computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal computers', '43211508',
            'Personal computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal digital assistants PDA',
            '43211504', 'Personal digital assistant PDAs or organizers'
        ],
        ['11-1011.00', 'Tools', 'Smartphones', '43191501', 'Mobile phones'],
        [
            '11-1011.00', 'Tools', 'Universal serial bus USB flash drives',
            '43201813', 'High capacity removable media drives'
        ],
        [
            '11-1011.00', 'Technology',
            'Adobe Systems Adobe Acrobat software', '43232202',
            'Document management software'
        ],
        [
            '11-1011.00', 'Technology', 'AdSense Tracker', '43232306',
            'Data base user interface and query software'
        ],
        [
            '11-1011.00', 'Technology', 'Blackbaud The Raiser\'s Edge',
            '43232303', 'Customer relationship management CRM software'
        ],
    ]

    class MockOnetDownloader(object):
        # Stand-in for the real downloader: serves the fixture table for the
        # requested source file as tab-separated text.
        def download(self, source_file):
            fake_data_lookup = {
                'Skills': skills_content,
                'Abilities': abilities_content,
                'Knowledge': knowledge_content,
                'Tools and Technology': tools_content,
            }
            with utils.makeNamedTemporaryCSV(fake_data_lookup[source_file],
                                             '\t') as tempname:
                with open(tempname) as fh:
                    return fh.read()

    with patch(
        'skills_ml.datasets.skill_importances.onet.OnetToMemoryDownloader',
        MockOnetDownloader
    ):
        with tempfile.TemporaryDirectory() as output_dir:
            storage = FSStore(output_dir)
            extractor = OnetSkillImportanceExtractor(
                output_dataset_name='skills',
                storage=storage,
                hash_function=md5)
            extractor.run()
            pdin = io.StringIO(storage.load('skills.tsv').decode('utf-8'))
            output = pd.read_csv(pdin, sep='\t').T.to_dict().values()
            # +24 base rows in input across the K,S,A,T files
            # (4 skills + 6 abilities + 4 knowledge + 10 tools data rows)
            assert len(output) == 24

            # make sure uuid is hashed version of the KSA
            for row in output:
                assert row['nlp_a'] == md5(row['ONET KSA'])

                # otherwise, this is a simple concat so not much to assert
                # we do use these rows though so make sure they're there
                assert 'O*NET-SOC Code' in row
                assert 'ONET KSA' in row
import multiprocessing

# Worker count for downstream parallel processing.
num_of_worker = multiprocessing.cpu_count()

# Split the sample job postings into a shuffled train/test partition and
# serialize each half to JSON bytes.
job_samples = JobPostingCollectionSample()
job_postings = list(job_samples)
random.shuffle(job_postings)
train_data = job_postings[:30]
test_data = job_postings[30:]
train_bytes = json.dumps(train_data).encode()
test_bytes = json.dumps(test_data).encode()

logging.info("Loading Embedding Model")
# NOTE(review): 'tmp' and 'your-embedding-model' are placeholders the user is
# expected to replace with a real store path and model name.
w2v = Word2VecModel.load(storage=FSStore('tmp'),
                         model_name='your-embedding-model')
full_soc = FullSOC()


def basic_filter(doc):
    """
    Return the document except for the document which soc is unknown or empty
    or not in the soc code pool of current O*Net version
    """
    # Keep only postings that pass the project's SOC filter AND whose SOC
    # code exists in the current O*NET code pool; everything else is dropped.
    if full_soc.filter_func(
            doc) and doc['onet_soc_code'] in full_soc.onet.all_soc:
        return doc
    else:
        return None
aggregate_properties  # NOTE(review): chunk starts mid-statement — this token
# belongs to an import continued from lines before this view.
from skills_ml.storage import FSStore
from functools import partial
import unicodecsv as csv
import numpy
from skills_ml.job_postings.aggregate.pandas import listy_n_most_common
import os
import tempfile

logging.basicConfig(level=logging.INFO)

job_postings = list(JobPostingCollectionSample())

with tempfile.TemporaryDirectory() as tmpdir:
    computed_properties_path = os.path.join(tmpdir, 'computed_properties')
    storage = FSStore(computed_properties_path)

    # Create properties. In this example, we are going to both compute and aggregate,
    # but this is not necessary! Computation and aggregation are entirely decoupled.
    # So it's entirely valid to just compute a bunch of properties and then later
    # figure out how you want to aggregate them.
    # We are only introducing the 'grouping' and 'aggregate' semantics this early in the
    # script so as to avoid defining these properties twice in the same script.

    # create properties to be grouped on. In this case, we want to group on cleaned job title
    grouping_properties = [
        TitleCleanPhaseOne(storage=storage),
        Geography(geo_querier=JobStateQuerier(), storage=storage)
    ]
    # create properties to aggregate for each group
    posting_present_prop = PostingIdPresent(storage=storage)
def test_fsstore(self):
    """FSStore round-trips via raw write/load, storage.open + joblib, delete."""
    with tempfile.TemporaryDirectory() as tmpdir:
        storage = FSStore(tmpdir)
        model = FakeModel('val')
        model_pickled = pickle.dumps(model)
        storage.write(model_pickled, 'for_testing.model')

        # Fix: the original chained `os.path.isfile(...) == storage.exists(...)
        # == True` (and the matching == False) into separate plain asserts —
        # comparison to True/False is unidiomatic (PEP 8 / E712) and a chained
        # failure doesn't reveal which side (filesystem vs. store API) broke.
        assert os.path.isfile(os.path.join(tmpdir, 'for_testing.model'))
        assert storage.exists('for_testing.model')

        # storage.open should behave like a writable binary file object.
        with storage.open("for_testing_compressed.model", "wb") as f:
            joblib.dump(model, f, compress=True)
        assert storage.exists("for_testing_compressed.model")

        with open_sesame(
                os.path.join(tmpdir, "for_testing_compressed.model"),
                "rb") as f:
            model_loaded = joblib.load(f)
        assert model.val == model_loaded.val

        model_loaded = pickle.loads(storage.load('for_testing.model'))
        assert model_loaded.val == 'val'

        storage.delete('for_testing.model')
        assert not os.path.isfile(os.path.join(tmpdir, 'for_testing.model'))
        assert not storage.exists('for_testing.model')
def test_save(self):
    """An ontology saved to storage can be rebuilt from its JSON-LD and
    compares equal to a freshly constructed one."""
    with tempfile.TemporaryDirectory() as tmpdir:
        storage = FSStore(tmpdir)
        original = self.ontology()
        original.save(storage)
        jsonld = storage.load('Test Ontology.json')
        restored = CompetencyOntology(jsonld_string=jsonld)
        assert restored == self.ontology()
def __init__(self, storage=None):
    """Attach a storage backend, falling back to a local FSStore when
    none is supplied."""
    if storage is None:
        self._storage = FSStore()
    else:
        self._storage = storage
# Split the sample job postings into a shuffled train/test partition and
# serialize each half to JSON bytes.
job_samples = JobPostingCollectionSample()
job_postings = list(job_samples)
random.shuffle(job_postings)
train_data = job_postings[:30]
test_data = job_postings[30:]
train_bytes = json.dumps(train_data).encode()
test_bytes = json.dumps(test_data).encode()

logging.info("Loading Embedding Model")
# NOTE(review): '/your/model/path' and 'your_model_name' are placeholders the
# user is expected to replace with a real store path and model name.
model_storage = ModelStorage(FSStore('/your/model/path'))
w2v = model_storage.load_model(model_name='your_model_name')
full_soc = FullSOC()


def basic_filter(doc):
    """
    Return the document except for the document which soc is unknown or empty
    or not in the soc code pool of current O*Net version
    """
    # Keep only postings that pass the project's SOC filter AND whose SOC
    # code is among the known choices; everything else is dropped.
    if full_soc.filter_func(doc) and doc['onet_soc_code'] in full_soc.choices:
        return doc
    else:
        return None


class JobGenerator(object):
    # NOTE(review): class body continues past this chunk boundary.