Example #1
0
    def test_embedding_trainer_word2vec_local(self, mock_getcwd):
        """Train a Word2Vec embedding on a small job-posting sample, persist
        it through ModelStorage, then reload it and continue training
        (online training) on a larger sample.

        Verifies that each trained model is written under its model_name,
        that the two training runs record distinct names in the trainer
        metadata, that the vocabulary does not shrink after online training,
        and that a model can be saved under an arbitrary name or into a
        different store directory.
        """
        # Job-posting fields fed to the corpus creator.
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        with tempfile.TemporaryDirectory() as td:
            # os.getcwd() is patched, so the listdir assertions below
            # actually inspect the temporary store directory.
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            w2v = Word2VecModel(size=10,
                                min_count=3,
                                iter=4,
                                window=6,
                                workers=3)

            trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
            trainer.train()
            trainer.save_model()

            vocab_size = len(w2v.wv.vocab.keys())

            assert w2v.model_name == trainer.model_name
            # The saved model should be the only artifact in the store.
            assert set(os.listdir(os.getcwd())) == set([trainer.model_name])

            # Test Online Training
            job_postings_generator = JobPostingCollectionSample(num_records=50)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)

            w2v_loaded = model_storage.load_model(w2v.model_name)

            new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded,
                                           model_storage)
            new_trainer.train()
            new_trainer.save_model()

            new_vocab_size = len(w2v_loaded.wv.vocab.keys())

            # Both training runs should now be stored, under distinct names.
            assert set(os.listdir(os.getcwd())) == set(
                [trainer.model_name, new_trainer.model_name])
            assert new_trainer.metadata['embedding_trainer'][
                'model_name'] != trainer.metadata['embedding_trainer'][
                    'model_name']
            # Training on more postings must not shrink the vocabulary.
            assert vocab_size <= new_vocab_size

            # Save as different name
            model_storage.save_model(w2v, 'other_name.model')
            assert set(os.listdir(os.getcwd())) == set([
                trainer.model_name, new_trainer.model_name, 'other_name.model'
            ])

            # Change the store directory
            new_path = os.path.join(td, 'other_directory')
            new_trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == set([new_trainer.model_name])
    def test_model_storage(self, mock_getcwd):
        """Round-trip a ModelStorage: save it, reload it, and confirm the
        reloaded instance points at the same storage path."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            mock_getcwd.return_value = tmp_dir
            storage = ModelStorage(FSStore(tmp_dir))
            storage.save('test.model')
            # The persisted object should be the only file on disk.
            assert set(os.listdir(os.getcwd())) == {'test.model'}

            reloaded = storage.load(FSStore(tmp_dir), 'test.model')
            self.assertEqual(storage.storage.path, reloaded.storage.path)
Example #3
0
    def test_combined_cls_local(self, mock_getcwd):
        """Build a design matrix from sample postings, fit a random forest
        on it, and check that the combined embedding+classifier pipeline
        yields a prediction from tokenized input.

        Relies on fixtures from the enclosing test class: self.major_group,
        self.pipe_x and self.pipe_y.
        """
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            jobpostings = JobPostingCollectionSample()
            corpus_generator = Word2VecGensimCorpusCreator(jobpostings,
                                                           raw=True)
            w2v = Word2VecModel(size=10,
                                min_count=0,
                                alpha=0.025,
                                min_alpha=0.025)
            trainer = EmbeddingTrainer(w2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)

            matrix = DesignMatrix(jobpostings, self.major_group, self.pipe_x,
                                  self.pipe_y)
            matrix.build()

            X = matrix.X
            rf = ProxyObjectWithStorage(RandomForestClassifier(), None, None,
                                        matrix.target_variable)
            rf.fit(X, matrix.y)

            # NOTE(review): proxy_rf is created but never used below —
            # confirm whether it was meant to replace rf in CombinedClassifier.
            proxy_rf = ProxyObjectWithStorage(rf, None, None,
                                              matrix.target_variable)
            # Remove the last step in the pipe_x
            # the input of predict_soc should be tokenized words
            # NOTE(review): this aliases (does not copy) self.pipe_x, so the
            # pop() mutates the shared fixture.
            new_pipe_x = self.pipe_x
            new_pipe_x.generators.pop()

            new_matrix = DesignMatrix(JobPostingCollectionSample(),
                                      self.major_group, new_pipe_x)
            new_matrix.build()
            ccls = CombinedClassifier(w2v, rf)
            # The first prediction unpacks into exactly two elements.
            assert len(ccls.predict_soc([new_matrix.X[0]])[0]) == 2
    def test_embedding_trainer_multicore_local(self, mock_getcwd):
        """Train several embedding models in one multiprocess run and make
        sure every trained model is persisted under its own name."""
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            job_postings_generator = JobPostingCollectionSample()
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)

            # A mix of FastText and Word2Vec configurations to train together.
            models = [
                FastTextModel(size=10, min_count=3, iter=4, window=6,
                              workers=3),
                FastTextModel(size=10, min_count=3, iter=4, window=10,
                              workers=3),
                Word2VecModel(size=10, workers=3, window=6),
                Word2VecModel(size=10, min_count=10, window=10, workers=3),
            ]
            trainer = EmbeddingTrainer(*models, model_storage=model_storage)
            trainer.train(corpus_generator, n_processes=4)
            trainer.save_model()

            # One artifact per trained model, named after the model.
            assert set(os.listdir(os.getcwd())) == {
                model.model_name for model in trainer._models
            }
    def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
        """Train a Doc2Vec embedding with lookup enabled, persist it, and
        verify the stored artifacts, the lookup dictionary, the saved
        metadata, and saving into a different store directory."""
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))

            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Doc2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            d2v = Doc2VecModel(size=10,
                               min_count=3,
                               iter=4,
                               window=6,
                               workers=3)

            trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)
            trainer.save_model()

            # NOTE(review): vocab_size is computed but never asserted on.
            vocab_size = len(d2v.wv.vocab.keys())
            assert d2v.model_name == trainer._models[0].model_name
            assert set(os.listdir(os.getcwd())) == set(
                [trainer._models[0].model_name])
            # The trainer-level lookup should mirror the model's own lookup.
            self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

            # Save as different name
            model_storage.save_model(d2v, 'other_name.model')
            assert set(os.listdir(os.getcwd())) == set(
                [trainer._models[0].model_name, 'other_name.model'])

            # Load
            d2v_loaded = model_storage.load_model(
                trainer._models[0].model_name)
            # The reloaded model's metadata should agree with the trainer's
            # record of the model type.
            assert d2v_loaded.metadata["embedding_model"][
                "model_type"] == list(
                    trainer.metadata["embedding_trainer"]
                    ['models'].values())[0]['embedding_model']['model_type']

            # Change the store directory
            new_path = os.path.join(td, 'other_directory')
            trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == set(
                [trainer._models[0].model_name])
    def test_embedding_trainer_fasttext_local(self, mock_getcwd):
        """Train a FastText embedding, persist it via its attached FSStore,
        then reload it and continue training on a larger sample (online
        training), checking stored artifacts, metadata names, and that the
        vocabulary does not shrink."""
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        with tempfile.TemporaryDirectory() as td:
            # os.getcwd() is patched to td so listdir checks hit the store.
            mock_getcwd.return_value = td
            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            fasttext = FastTextModel(storage=FSStore(td),
                                     size=10,
                                     min_count=3,
                                     iter=4,
                                     window=6,
                                     workers=3)

            trainer = EmbeddingTrainer(corpus_generator, fasttext)
            trainer.train()
            trainer.save_model()

            vocab_size = len(fasttext.wv.vocab.keys())

            assert fasttext.model_name == trainer.model_name
            # The saved model should be the only artifact so far.
            assert set(os.listdir(os.getcwd())) == set([trainer.model_name])

            # Test Online Training
            job_postings_generator = JobPostingCollectionSample(num_records=50)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)

            fasttext_loaded = FastTextModel.load(FSStore(td),
                                                 fasttext.model_name)
            new_trainer = EmbeddingTrainer(corpus_generator, fasttext_loaded)
            new_trainer.train()
            new_trainer.save_model()

            new_vocab_size = len(fasttext_loaded.wv.vocab.keys())

            # Both runs should be stored under distinct model names.
            assert set(os.listdir(os.getcwd())) == set(
                [trainer.model_name, new_trainer.model_name])
            assert new_trainer.metadata['embedding_trainer'][
                'model_name'] != trainer.metadata['embedding_trainer'][
                    'model_name']
            # More postings must not shrink the vocabulary.
            assert vocab_size <= new_vocab_size
    def test_knn_doc2vec_cls_local(self, mock_getcwd):
        """Exercise KNNDoc2VecClassifier end to end against a local store.

        Covers: rejection of non-doc2vec embeddings, rejection of k=0,
        agreement between the raw classifier and its SocClassifier wrapper,
        Annoy index construction, save/load round trip, and the fact that
        the Annoy index is not persisted with the model.
        """
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            corpus_generator = FakeCorpusGenerator()
            d2v = Doc2VecModel(storage=FSStore(td),
                               size=10,
                               min_count=1,
                               dm=0,
                               alpha=0.025,
                               min_alpha=0.025)
            trainer = EmbeddingTrainer(corpus_generator, d2v)
            trainer.train(True)

            # KNNDoc2VecClassifier only supports doc2vec now
            self.assertRaises(NotImplementedError,
                              lambda: KNNDoc2VecClassifier(Word2VecModel()))

            # First tokenized document from the module-level fixture string.
            doc = docs.split(',')[0].split()

            # k=0 neighbors is meaningless and must be rejected at predict time.
            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
            self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

            knn = KNNDoc2VecClassifier(embedding_model=d2v, k=1)
            soc_cls = SocClassifier(knn)

            # The wrapper must delegate to the same underlying prediction.
            assert knn.predict_soc([doc
                                    ])[0][0] == soc_cls.predict_soc([doc
                                                                     ])[0][0]

            # Build Annoy index
            knn.build_ann_indexer(num_trees=5)
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Save
            knn.save()
            assert set(os.listdir(os.getcwd())) == set([knn.model_name])
            # Saving must not destroy the in-memory index.
            assert isinstance(knn.indexer, AnnoyIndexer)

            # Load
            new_knn = KNNDoc2VecClassifier.load(FSStore(td), knn.model_name)
            assert new_knn.model_name == knn.model_name
            assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

            # Have to re-build the index whenever ones load the knn model to the memory
            # (fixed: identity check with `is None` instead of `== None`)
            assert new_knn.indexer is None
Example #8
0
 def test_model_storage(self, mock_getcwd):
     """Save a model through ModelStorage and read it back unchanged."""
     with tempfile.TemporaryDirectory() as tmp_dir:
         mock_getcwd.return_value = tmp_dir
         storage = ModelStorage(FSStore(tmp_dir))
         original = FakeModel(1)
         storage.save_model(original, 'test.model')
         # Exactly one artifact should have been written.
         assert set(os.listdir(os.getcwd())) == {'test.model'}
         restored = storage.load_model('test.model')
         assert restored.val == original.val
Example #9
0
    def test_fsstore(self):
        """FSStore round trip: write a pickled model, read it back, delete it.

        Verifies that write/exists/load/delete stay in sync with the files
        actually present on disk.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = FSStore(tmpdir)
            model = pickle.dumps(FakeModel('val'))
            storage.write(model, 'for_testing.model')
            target = os.path.join(tmpdir, 'for_testing.model')
            # Both the filesystem and the store should see the new file.
            # (fixed: replaced the `a == b == True` comparison chain with
            # plain boolean asserts)
            assert os.path.isfile(target)
            assert storage.exists('for_testing.model')

            model_loaded = storage.load('for_testing.model')
            model_loaded = pickle.loads(model_loaded)
            assert model_loaded.val == 'val'

            storage.delete('for_testing.model')
            # Deletion must be visible both on disk and through the store.
            assert not os.path.isfile(target)
            assert not storage.exists('for_testing.model')
Example #10
0
 def __init__(self, matrix, k_folds, grid_config=None, storage=None,
              random_state_for_split=None, scoring=None, n_jobs=3):
     """Set up a cross-validated classifier trainer.

     Args:
         matrix: design matrix providing features and labels.
         k_folds: number of cross-validation folds.
         grid_config: hyperparameter grid; falls back to
             self.default_grid_config when None.
         storage: storage backend; a local FSStore when None.
         random_state_for_split: seed for the train/test split.
         scoring: list of scoring metric names; defaults to ['accuracy'].
         n_jobs: number of parallel jobs.
     """
     self.matrix = matrix
     self.storage = FSStore() if storage is None else storage
     self.k_folds = k_folds
     self.n_jobs = n_jobs
     self.grid_config = self.default_grid_config if grid_config is None else grid_config
     self.cls_cv_result = {}
     # A fresh list per call avoids the shared mutable-default pitfall of
     # the original `scoring=['accuracy']` default.
     self.scoring = ['accuracy'] if scoring is None else scoring
     self.best_classifiers = {}
     self.random_state_for_split = random_state_for_split
     # Timestamp used to name this training run's artifacts.
     self.train_time = datetime.today().isoformat()
 def test_training_save(self, mock_getcwd):
     """Train an OccupationClassifierTrainer with save=True and confirm the
     results land in a directory named after the training timestamp."""
     with tempfile.TemporaryDirectory() as tmp_dir:
         mock_getcwd.return_value = tmp_dir
         design_matrix = self.matrix
         assert design_matrix.target_variable.name == "major_group"
         occ_trainer = OccupationClassifierTrainer(design_matrix,
                                                   k_folds=2,
                                                   storage=FSStore(tmp_dir),
                                                   grid_config=grid,
                                                   scoring=['accuracy'])
         occ_trainer.train(save=True)
         # The saved output directory is keyed by the trainer's timestamp.
         assert set(os.listdir(os.getcwd())) == {occ_trainer.train_time}
Example #12
0
    def test_fsstore(self):
        """Exercise PersistedJSONDict backed by an FSStore: creation,
        reloading, merging updates from two handles, autosave, length,
        and iteration."""
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = FSStore(tmpdir)
            json_path = os.path.join(tmpdir, 'test.json')

            def stored():
                # Read back what is currently on disk, closing the handle
                # promptly (the original json.load(open(...)) pattern
                # leaked file handles).
                with open(json_path) as fh:
                    return json.load(fh)

            # 1. Ensure that a new file is correctly created and saved to
            storage_one = PersistedJSONDict(storage, 'test.json')
            storage_one['key1'] = 'value1'
            storage_one['key2'] = {'nestedkey2': 'value2'}
            storage_one.save()
            assert stored() \
                == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}}

            # 2. Ensure that an existing file is correctly read, updated, and saved to
            storage_two = PersistedJSONDict(storage, 'test.json')
            assert 'key1' in storage_two
            assert storage_two['key1'] == 'value1'
            storage_two['key3'] = 'value3'
            storage_two.save()
            assert stored() \
                == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3'}

            # 3. Ensure that, in the same thread, updating and saving an old one gets new changes too
            storage_one['key4'] = 'value4'
            storage_one.save()
            assert stored() \
                == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3', 'key4': 'value4'}

            # 4. test autosave - this will be the fourth update of this object
            storage_one.SAVE_EVERY_N_UPDATES = 4
            storage_one['key5'] = 'value5'
            assert stored() \
                == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3', 'key4': 'value4', 'key5': 'value5'}

            # 5. test length checking
            assert len(storage_one) == 5

            # 6. test iteration
            assert sorted(
                storage_one.items(),
                key=lambda pair: pair[0]
            ) == [
                ('key1', 'value1'),
                ('key2', {'nestedkey2': 'value2'}),
                ('key3', 'value3'),
                ('key4', 'value4'),
                ('key5', 'value5'),
            ]
Example #13
0
    def test_combined_cls_local(self, mock_getcwd):
        """Fit a RandomForest on embedded job postings and check that the
        combined embedding+classifier yields a prediction for one posting."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            mock_getcwd.return_value = tmp_dir
            jobpostings = list(JobPostingCollectionSample())
            corpus_generator = Word2VecGensimCorpusCreator(jobpostings,
                                                           raw=True)
            w2v = Word2VecModel(storage=FSStore(tmp_dir),
                                size=10,
                                min_count=0,
                                alpha=0.025,
                                min_alpha=0.025)
            trainer = EmbeddingTrainer(corpus_generator, w2v)
            trainer.train(True)

            matrix = create_training_set(jobpostings, SOCMajorGroup())
            features = EmbeddingTransformer(w2v).transform(matrix.X)

            classifier = RandomForestClassifier()
            classifier.fit(features, matrix.y)
            combined = CombinedClassifier(w2v, classifier,
                                          matrix.target_variable)
            # The first prediction unpacks into exactly two elements.
            assert len(combined.predict_soc([matrix.X[0]])[0]) == 2
def test_onet_skill_extractor():
    """Run OnetSkillImportanceExtractor against faked O*NET source files.

    Four in-memory tables stand in for the downloaded O*NET files
    (Skills, Abilities, Knowledge, Tools and Technology). The extractor's
    TSV output is then checked row-by-row: one output row per input data
    row, with the nlp_a column holding the md5 hash of the KSA name.
    """
    # Faked 'Skills' file: header row plus four data rows.
    skills_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'IM', '4.12',
            '8', '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'LV', '4.75',
            '8', '0.16', '4.43', '5.07', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'IM', '4.12', '8',
            '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'LV', '-4.88', '8',
            '0.23', '4.43', '5.32', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]

    # Faked 'Abilities' file: header row plus six data rows.
    abilities_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'IM', '4.50', '8',
            '0.19', '4.13', '4.87', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'LV', '4.88', '8',
            '0.13', '4.63', '5.12', 'N', 'Y', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]

    # Faked 'Knowledge' file: header row plus four data rows.
    knowledge_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'IM',
            '4.75', '27', '0.09', '4.56', '4.94', 'N', 'n/a', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'LV',
            '6.23', '27', '0.17', '5.88', '6.57', 'N', 'N', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'IM', '2.66', '27', '0.22',
            '2.21', '3.11', 'N', 'n/a', '07/2014', 'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'LV', '3.50', '27', '0.41',
            '2.66', '4.34', 'N', 'N', '07/2014', 'Incumbent'
        ],
    ]

    # Faked 'Tools and Technology' file: different schema, ten data rows.
    tools_content = [
        [
            'O*NET-SOC Code', 'T2 Type', 'T2 Example', 'Commodity Code',
            'Commodity Title'
        ],
        [
            '11-1011.00', 'Tools', '10-key calculators', '44101809',
            'Desktop calculator'
        ],
        [
            '11-1011.00', 'Tools', 'Desktop computers', '43211507',
            'Desktop computers'
        ],
        [
            '11-1011.00', 'Tools', 'Laptop computers', '43211503',
            'Notebook computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal computers', '43211508',
            'Personal computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal digital assistants PDA',
            '43211504', 'Personal digital assistant PDAs or organizers'
        ],
        ['11-1011.00', 'Tools', 'Smartphones', '43191501', 'Mobile phones'],
        [
            '11-1011.00', 'Tools', 'Universal serial bus USB flash drives',
            '43201813', 'High capacity removable media drives'
        ],
        [
            '11-1011.00', 'Technology', 'Adobe Systems Adobe Acrobat software',
            '43232202', 'Document management software'
        ],
        [
            '11-1011.00', 'Technology', 'AdSense Tracker', '43232306',
            'Data base user interface and query software'
        ],
        [
            '11-1011.00', 'Technology', 'Blackbaud The Raiser\'s Edge',
            '43232303', 'Customer relationship management CRM software'
        ],
    ]

    # Stand-in for OnetToMemoryDownloader: serves the faked tables above
    # instead of downloading real O*NET files.
    class MockOnetDownloader(object):
        def download(self, source_file):
            """Return the faked tab-separated contents of *source_file*."""
            fake_data_lookup = {
                'Skills': skills_content,
                'Abilities': abilities_content,
                'Knowledge': knowledge_content,
                'Tools and Technology': tools_content,
            }

            # Round-trip through a temporary TSV so the extractor receives
            # exactly what a downloaded file would look like.
            with utils.makeNamedTemporaryCSV(fake_data_lookup[source_file],
                                             '\t') as tempname:
                with open(tempname) as fh:
                    return fh.read()

    with patch(
            'skills_ml.datasets.skill_importances.onet.OnetToMemoryDownloader',
            MockOnetDownloader):
        with tempfile.TemporaryDirectory() as output_dir:
            storage = FSStore(output_dir)
            extractor = OnetSkillImportanceExtractor(
                output_dataset_name='skills',
                storage=storage,
                hash_function=md5)
            extractor.run()
            # Parse the extractor's TSV output back into a list of row dicts.
            pdin = io.StringIO(storage.load('skills.tsv').decode('utf-8'))
            output = pd.read_csv(pdin, sep='\t').T.to_dict().values()

            # +24 base rows in input across the K,S,A,T files
            assert len(output) == 24

            # make sure uuid is hashed version of the KSA
            for row in output:
                assert row['nlp_a'] == md5(row['ONET KSA'])
                # otherwise, this is a simple concat so not much to assert
                # we do use these rows though so make sure they're there
                assert 'O*NET-SOC Code' in row
                assert 'ONET KSA' in row
import multiprocessing

# Number of CPU cores available for worker processes.
num_of_worker = multiprocessing.cpu_count()

job_samples = JobPostingCollectionSample()
job_postings = list(job_samples)

# NOTE(review): no random seed is set, so this train/test split is
# non-deterministic across runs — confirm that is intended.
random.shuffle(job_postings)

# First 30 postings for training, the remainder for testing.
train_data = job_postings[:30]
test_data = job_postings[30:]

train_bytes = json.dumps(train_data).encode()
test_bytes = json.dumps(test_data).encode()

logging.info("Loading Embedding Model")
# Load a previously trained Word2Vec model from the local 'tmp' store;
# 'your-embedding-model' is a placeholder name to be replaced.
w2v = Word2VecModel.load(storage=FSStore('tmp'),
                         model_name='your-embedding-model')

full_soc = FullSOC()


def basic_filter(doc):
    """
    Return the document, or None when its SOC code is unknown/empty or not
    in the SOC code pool of the current O*Net version.
    """
    # Guard clauses preserve the original short-circuit: the SOC-pool
    # membership check only runs when filter_func accepts the document.
    if not full_soc.filter_func(doc):
        return None
    if doc['onet_soc_code'] not in full_soc.onet.all_soc:
        return None
    return doc
    aggregate_properties
from skills_ml.storage import FSStore
from functools import partial
import unicodecsv as csv
import numpy
from skills_ml.job_postings.aggregate.pandas import listy_n_most_common
import os
import tempfile

logging.basicConfig(level=logging.INFO)

job_postings = list(JobPostingCollectionSample())

with tempfile.TemporaryDirectory() as tmpdir:
    # All computed property data is written under this temporary path.
    computed_properties_path = os.path.join(tmpdir, 'computed_properties')
    storage = FSStore(computed_properties_path)

    # Create properties. In this example, we are going to both compute and aggregate,
    # but this is not necessary! Computation and aggregation are entirely decoupled.
    # So it's entirely valid to just compute a bunch of properties and then later
    # figure out how you want to aggregate them.
    # We are only introducing the 'grouping' and 'aggregate' semantics this early in the
    # script so as to avoid defining these properties twice in the same script.

    # create properties to be grouped on. In this case, we want to group on cleaned job title
    grouping_properties = [
        TitleCleanPhaseOne(storage=storage),
        Geography(geo_querier=JobStateQuerier(), storage=storage)
    ]
    # create properties to aggregate for each group
    posting_present_prop = PostingIdPresent(storage=storage)
Example #17
0
    def test_fsstore(self):
        """FSStore round trip using both pickle and joblib serialization.

        Writes a pickled model, streams a joblib-compressed copy through the
        store's file interface, reads both back, and checks that deletion is
        visible on disk and through the store.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = FSStore(tmpdir)
            model = FakeModel('val')
            model_pickled = pickle.dumps(model)
            storage.write(model_pickled, 'for_testing.model')
            target = os.path.join(tmpdir, 'for_testing.model')
            # Both the filesystem and the store should see the new file.
            # (fixed: replaced the `a == b == True` comparison chain with
            # plain boolean asserts)
            assert os.path.isfile(target)
            assert storage.exists('for_testing.model')

            # Stream a compressed copy through the store's file interface.
            with storage.open("for_testing_compressed.model", "wb") as f:
                joblib.dump(model, f, compress=True)

            assert storage.exists("for_testing_compressed.model")

            with open_sesame(os.path.join(tmpdir, "for_testing_compressed.model"), "rb") as f:
                model_loaded = joblib.load(f)
            assert model.val == model_loaded.val

            model_loaded = storage.load('for_testing.model')
            model_loaded = pickle.loads(model_loaded)
            assert model_loaded.val == 'val'

            storage.delete('for_testing.model')
            # Deletion must be visible both on disk and through the store.
            assert not os.path.isfile(target)
            assert not storage.exists('for_testing.model')
Example #18
0
 def test_save(self):
     """Serialize the ontology to storage and confirm that loading the
     saved JSON-LD reconstructs an equal CompetencyOntology."""
     with tempfile.TemporaryDirectory() as temp_dir:
         store = FSStore(temp_dir)
         self.ontology().save(store)
         reloaded = CompetencyOntology(
             jsonld_string=store.load('Test Ontology.json'))
         assert reloaded == self.ontology()
Example #19
0
 def __init__(self, storage=None):
     """Keep the given storage backend, defaulting to a local FSStore."""
     if storage is None:
         storage = FSStore()
     self._storage = storage

job_samples = JobPostingCollectionSample()
job_postings = list(job_samples)

# NOTE(review): no random seed is set, so this train/test split is
# non-deterministic across runs — confirm that is intended.
random.shuffle(job_postings)

# First 30 postings for training, the remainder for testing.
train_data = job_postings[:30]
test_data = job_postings[30:]

train_bytes = json.dumps(train_data).encode()
test_bytes = json.dumps(test_data).encode()


logging.info("Loading Embedding Model")
# Load a previously persisted embedding model; the path and model name
# are placeholders to be replaced by the user.
model_storage = ModelStorage(FSStore('/your/model/path'))
w2v = model_storage.load_model(model_name='your_model_name')

full_soc = FullSOC()

def basic_filter(doc):
    """
    Return the document, or None when its SOC code is unknown/empty or not
    among the SOC code choices of the current O*Net version.
    """
    # Guard clauses preserve the original short-circuit: the choices
    # membership check only runs when filter_func accepts the document.
    if not full_soc.filter_func(doc):
        return None
    if doc['onet_soc_code'] not in full_soc.choices:
        return None
    return doc

class JobGenerator(object):