def test_training_not_save(self):
    """Train with ``save=False`` and check the accuracy results.

    Uses the fixture matrix on ``self.matrix`` and the ``grid``
    configuration defined outside this method. After training, the
    accuracy results must be keyed by ``ExtraTreesClassifier`` only.
    """
    design_matrix = self.matrix
    assert design_matrix.target_variable.name == "major_group"

    trainer = OccupationClassifierTrainer(
        design_matrix,
        k_folds=2,
        grid_config=grid,
        scoring=['accuracy'],
    )
    trainer.train(save=False)

    accuracy_results = trainer.cls_cv_result['accuracy']
    assert list(accuracy_results.keys()) == ['ExtraTreesClassifier']
 def test_training_save(self, mock_getcwd):
     """Train with ``save=True`` and check what lands in the working directory.

     ``mock_getcwd`` is a mock whose return value is pointed at a fresh
     temporary directory (presumably injected by a patch of ``os.getcwd``
     applied outside this block -- confirm against the decorator). The
     same directory backs the trainer's ``FSStore``. After training, the
     directory listing must consist of exactly one entry named after
     ``train_time``.
     """
     with tempfile.TemporaryDirectory() as tmp_dir:
         mock_getcwd.return_value = tmp_dir

         design_matrix = self.matrix
         assert design_matrix.target_variable.name == "major_group"

         trainer = OccupationClassifierTrainer(
             design_matrix,
             k_folds=2,
             storage=FSStore(tmp_dir),
             grid_config=grid,
             scoring=['accuracy'],
         )
         trainer.train(save=True)

         saved_entries = os.listdir(os.getcwd())
         assert set(saved_entries) == {trainer.train_time}
Example #3
0
    def test_training(self):
        """Build a major-group training set from filtered postings and train on it.

        Checks that the resulting matrix targets ``major_group``, that the
        accuracy results are keyed by ``ExtraTreesClassifier`` only, and
        that the trainer's matrix carries the same embedding model name as
        the fixture embedding model.
        """
        filtered_postings = JobPostingFilterer(
            self.jobpostings, [self.has_soc_filter])
        design_matrix = create_training_set(
            filtered_postings, SOCMajorGroup(), self.embedding_model)
        assert design_matrix.target_variable.name == "major_group"

        trainer = OccupationClassifierTrainer(
            design_matrix,
            k_folds=2,
            grid_config=grid,
            scoring=['accuracy'],
        )
        trainer.train()

        accuracy_results = trainer.cls_cv_result['accuracy']
        assert list(accuracy_results.keys()) == ['ExtraTreesClassifier']

        trained_model_name = trainer.matrix.embedding_model.model_name
        assert trained_model_name == self.embedding_model.model_name
    def test_tester(self):
        """End-to-end check of ``OccupationClassifierTester``.

        Trains a small word2vec embedding, trains an occupation classifier
        on a slice of the sample job postings, wraps both in a
        ``CombinedClassifier`` and runs the tester over the remaining
        postings that carry a non-empty ``onet_soc_code``. The tester is
        expected to yield 18 results.
        """
        document_schema_fields = [
            'description',
            'experienceRequirements',
            'qualifications',
            'skills',
        ]
        corpus_generator = Word2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=30),
            document_schema_fields=document_schema_fields)
        w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(w2v)
        trainer.train(corpus_generator)

        # Split the sample: the first 30 items train the classifier, items
        # from index 30 onward are held out for the tester.
        # NOTE(review): assumes each islice iterates the collection from
        # its start (i.e. `jp` is re-iterable) -- confirm.
        jp = JobPostingCollectionSample()
        train_gen = islice(jp, 30)
        test_gen = islice(jp, 30, None)
        train_matrix = DesignMatrix(train_gen, self.fullsoc, self.pipe_x, self.pipe_y)
        train_matrix.build()
        occ_trainer = OccupationClassifierTrainer(train_matrix, 2, grid_config=self.grid_config)
        occ_trainer.train(save=False)
        cc = CombinedClassifier(w2v, occ_trainer.best_estimators[0])

        # Every preprocessing step of pipe_x except the last one.
        # NOTE(review): presumably the final step is the vectorization that
        # the combined classifier performs itself -- confirm.
        steps = self.pipe_x.generators[:-1]

        # Keep only postings with a non-empty SOC code. Compare by value:
        # the previous `is not ''` tested object identity against a string
        # literal, which is implementation-dependent and triggers a
        # SyntaxWarning on Python 3.8+.
        test_gen = (t for t in test_gen if t['onet_soc_code'] != '')

        tester = OccupationClassifierTester(
            test_data_generator=test_gen, preprocessing=steps, classifier=cc)
        result = list(tester)

        assert len(tester) == len(result) == 18
                     'max_depth': [20, 50],
                     'max_features': ['log2'],
                     'min_samples_split': [10, 20]
                     },
                 'sklearn.neural_network.MLPClassifier': {
                    'hidden_layer_sizes': [100, 200, 300, 500, 1000],
                     'activation': ['identity', 'logistic', 'tanh', 'relu'],
                     'solver': ['lbfgs', 'sgd', 'adam']
                     },
                 'sklearn.svm.SVC': {
                     'C': [0.1, 1, 10, 100, 1000],
                     'kernel': ['linear', 'poly', 'sigmoid', 'rbf', 'precomputed'],
                     'shrinking': [True, False],
                     'decision_function_shape': ['ovo', 'ovr']
                     }
                 }

# Train over the grid configuration with 3 folds, using `num_of_worker`
# as the job count and an FSStore rooted at 'tmp/soc_classifiers'.
trainer = OccupationClassifierTrainer(
    matrix=matrix,
    k_folds=3,
    grid_config=grid_config,
    storage=FSStore('tmp/soc_classifiers'),
    n_jobs=num_of_worker,
)
trainer.train()

# Write the serialized train/test data under a directory named after this
# training run.
# NOTE(review): this path is rooted at 'soc_classifiers', while the
# trainer's storage above is rooted at 'tmp/soc_classifiers' -- confirm the
# difference is intended.
# NOTE(review): the two file names use different separators ("train.data"
# vs "test_data") -- confirm against whatever reads them back.
run_dir = os.path.join('soc_classifiers', trainer.train_time)
fs = FSStore(run_dir)
fs.write(train_bytes, "train.data")
fs.write(test_bytes, "test_data")