def test_training_not_save(self):
    """Train with ``save=False`` and check the cross-validation results.

    Uses the fixture matrix from ``self.matrix`` and the module-level
    ``grid`` configuration, then verifies that the only classifier
    recorded under the ``'accuracy'`` score is ``ExtraTreesClassifier``.
    """
    design_matrix = self.matrix
    assert design_matrix.target_variable.name == "major_group"

    trainer = OccupationClassifierTrainer(
        design_matrix,
        k_folds=2,
        grid_config=grid,
        scoring=['accuracy'],
    )
    trainer.train(save=False)

    scored_classifiers = list(trainer.cls_cv_result['accuracy'].keys())
    assert scored_classifiers == ['ExtraTreesClassifier']
def test_training_save(self, mock_getcwd):
    """Train with ``save=True`` and check what lands in the working directory.

    Args:
        mock_getcwd: Mock whose ``return_value`` is pointed at a temporary
            directory (presumably injected by a patch of ``os.getcwd``
            applied outside this block -- confirm against the decorator).

    The trainer is given an ``FSStore`` rooted at the same temporary
    directory; after training, that directory must contain exactly one
    entry, named after the trainer's ``train_time``.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        mock_getcwd.return_value = tmp_dir

        design_matrix = self.matrix
        assert design_matrix.target_variable.name == "major_group"

        trainer = OccupationClassifierTrainer(
            design_matrix,
            k_folds=2,
            storage=FSStore(tmp_dir),
            grid_config=grid,
            scoring=['accuracy'],
        )
        trainer.train(save=True)

        stored_entries = set(os.listdir(os.getcwd()))
        assert stored_entries == {trainer.train_time}
def test_training(self):
    """Build a training set from filtered job postings and train on it.

    The job postings in ``self.jobpostings`` are filtered with
    ``self.has_soc_filter``, turned into a matrix targeting
    ``SOCMajorGroup`` with ``self.embedding_model``, and trained with the
    module-level ``grid`` configuration. The test then checks the
    classifiers recorded under ``'accuracy'`` and that the trainer's
    matrix still carries the same embedding model name.
    """
    filtered_postings = JobPostingFilterer(
        self.jobpostings,
        [self.has_soc_filter],
    )
    design_matrix = create_training_set(
        filtered_postings,
        SOCMajorGroup(),
        self.embedding_model,
    )
    assert design_matrix.target_variable.name == "major_group"

    trainer = OccupationClassifierTrainer(
        design_matrix,
        k_folds=2,
        grid_config=grid,
        scoring=['accuracy'],
    )
    trainer.train()

    scored_classifiers = list(trainer.cls_cv_result['accuracy'].keys())
    assert scored_classifiers == ['ExtraTreesClassifier']

    trained_model_name = trainer.matrix.embedding_model.model_name
    assert trained_model_name == self.embedding_model.model_name
def test_tester(self):
    """Train an embedding and a classifier, then run the tester end to end.

    Steps:
      1. Train a ``Word2VecModel`` on a corpus built from a 30-record
         ``JobPostingCollectionSample``.
      2. Build a ``DesignMatrix`` from the first 30 postings of a fresh
         sample and train an ``OccupationClassifierTrainer`` on it
         without saving.
      3. Combine the embedding with the first best estimator and feed the
         remaining postings (those with a non-empty ``onet_soc_code``)
         through ``OccupationClassifierTester``.

    The tester is expected to yield 18 results and report the same length.
    """
    document_schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills',
    ]
    corpus_generator = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=document_schema_fields,
    )
    w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
    trainer = EmbeddingTrainer(w2v)
    trainer.train(corpus_generator)

    # Split one sample collection: first 30 postings train, the rest test.
    jp = JobPostingCollectionSample()
    train_gen = islice(jp, 30)
    test_gen = islice(jp, 30, None)

    train_matrix = DesignMatrix(train_gen, self.fullsoc, self.pipe_x, self.pipe_y)
    train_matrix.build()

    occ_trainer = OccupationClassifierTrainer(
        train_matrix,
        2,
        grid_config=self.grid_config,
    )
    occ_trainer.train(save=False)
    cc = CombinedClassifier(w2v, occ_trainer.best_estimators[0])

    # Every generator of the X pipeline except the last is handed to the
    # tester as its preprocessing steps.
    steps = self.pipe_x.generators[:-1]

    # Compare by value, not identity: ``is not ''`` relies on string
    # interning and raises a SyntaxWarning on Python 3.8+.
    test_gen = (t for t in test_gen if t['onet_soc_code'] != '')

    tester = OccupationClassifierTester(
        test_data_generator=test_gen,
        preprocessing=steps,
        classifier=cc,
    )
    result = list(tester)

    assert len(tester) == len(result) == 18
'max_depth': [20, 50], 'max_features': ['log2'], 'min_samples_split': [10, 20] }, 'sklearn.neural_network.MLPClassifier': { 'hidden_layer_sizes': [100, 200, 300, 500, 1000], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'] }, 'sklearn.svm.SVC': { 'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'poly', 'sigmoid', 'rbf', 'precomputed'], 'shrinking': [True, False], 'decision_function_shape': ['ovo', 'ovr'] } } trainer = OccupationClassifierTrainer( matrix=matrix, k_folds=3, grid_config=grid_config, storage=FSStore('tmp/soc_classifiers'), n_jobs = num_of_worker ) trainer.train() fs = FSStore(os.path.join('soc_classifiers', trainer.train_time)) fs.write(train_bytes, "train.data") fs.write(test_bytes, "test_data")