Beispiel #1
0
    def test_filtering(self):
        """Check that filters given to SOCMajorGroup drop the excluded codes.

        Builds three training sets from ``self.jobpostings`` -- one without a
        filter, one with a single filter and one with a list of two filters --
        and inspects the decoded target labels of each.
        """
        def excluding(prefix):
            # Predicate rejecting postings whose 'onet_soc_code' starts with
            # the given two-character prefix.
            return lambda job: job['onet_soc_code'][:2] != prefix

        def decoded_labels(design_matrix):
            # Turn the encoded target values back into their original labels.
            encoder = design_matrix.target_variable.encoder
            return encoder.inverse_transform(design_matrix.y)

        drop_27 = excluding('27')
        drop_49 = excluding('49')

        # Baseline without any filter: '27' shows up among the labels.
        unfiltered_target = SOCMajorGroup()
        unfiltered = create_training_set(
            self.jobpostings, unfiltered_target, self.embedding_model)
        assert '27' in decoded_labels(unfiltered)

        # A single filter passed directly.
        single_target = SOCMajorGroup(drop_27)
        single = create_training_set(
            self.jobpostings, single_target, self.embedding_model)
        assert '27' not in decoded_labels(single)

        # Several filters passed as a list.
        double_target = SOCMajorGroup([drop_27, drop_49])
        double = create_training_set(
            self.jobpostings, double_target, self.embedding_model)
        assert '27' not in decoded_labels(double)
        assert '49' not in decoded_labels(double)
    def test_two_filters(self):
        """Build a DesignMatrix by hand with two filters on the target.

        Both the feature pipeline and the label pipeline start with a step
        derived from ``soc_target.filter_func``; the decoded labels of the
        built matrix must then contain neither '27' nor '49'.
        """
        def excluding(prefix):
            # Predicate rejecting postings whose 'onet_soc_code' starts with
            # the given two-character prefix.
            return lambda job: job['onet_soc_code'][:2] != prefix

        soc_target = SOCMajorGroup([excluding('27'), excluding('49')])

        # Pipeline step: hand the document on when the target's filter
        # accepts it, otherwise yield None.
        def new_filter(doc):
            return doc if soc_target.filter_func(doc) else None

        text_fields = [
            'description',
            'experienceRequirements',
            'qualifications',
            'skills',
        ]
        join_fields = partial(
            nlp.fields_join, document_schema_fields=text_fields)
        to_vector = partial(
            nlp.vectorize, embedding_model=self.embedding_model)

        feature_pipeline = IterablePipeline(
            new_filter,
            join_fields,
            nlp.clean_str,
            nlp.word_tokenize,
            to_vector,
        )
        label_pipeline = IterablePipeline(new_filter, soc_target.transformer)

        matrix = DesignMatrix(
            self.jobpostings, soc_target, feature_pipeline, label_pipeline)
        matrix.build()

        encoder = matrix.target_variable.encoder
        assert '27' not in encoder.inverse_transform(matrix.y)
        # Decode again for the second check, mirroring one decode per assert.
        encoder = matrix.target_variable.encoder
        assert '49' not in encoder.inverse_transform(matrix.y)
Beispiel #3
0
 def test_create_training_set(self):
     """Check the basic attributes of a matrix from create_training_set.

     The job postings are first narrowed with ``self.has_soc_filter`` and
     materialised into a list so that their count can be compared with the
     number of rows in ``X`` and ``y``.
     """
     filtered = JobPostingFilterer(self.jobpostings, [self.has_soc_filter])
     postings = list(filtered)
     target = SOCMajorGroup()

     matrix = create_training_set(postings, target, self.embedding_model)

     assert matrix.target_variable.name == "major_group"
     # One feature row and one label per posting.
     assert matrix.X.shape[0] == len(postings)
     assert matrix.y.shape[0] == len(postings)
     assert matrix.embedding_model == self.embedding_model
     # Encoded class 0 decodes to the label '11'.
     first_label = matrix.target_variable.encoder.inverse_transform([0])
     assert first_label == '11'
Beispiel #4
0
    def test_training(self):
        """Train an OccupationClassifierTrainer on a major-group matrix.

        Uses two folds, the ``grid`` configuration from the enclosing scope
        and accuracy as the only scoring metric, then checks which
        classifiers appear in the accuracy results and that the trainer's
        matrix still carries the embedding model it was built with.
        """
        postings = JobPostingFilterer(self.jobpostings, [self.has_soc_filter])
        target = SOCMajorGroup()
        matrix = create_training_set(postings, target, self.embedding_model)
        assert matrix.target_variable.name == "major_group"

        occ_trainer = OccupationClassifierTrainer(
            matrix,
            k_folds=2,
            grid_config=grid,
            scoring=['accuracy'],
        )
        occ_trainer.train()

        # Exactly one classifier is expected under the 'accuracy' results.
        trained_names = list(occ_trainer.cls_cv_result['accuracy'].keys())
        assert trained_names == ['ExtraTreesClassifier']

        trained_model_name = occ_trainer.matrix.embedding_model.model_name
        assert trained_model_name == self.embedding_model.model_name
Beispiel #5
0
    def test_combined_cls_local(self, mock_getcwd):
        """Train an embedding and a classifier locally, then combine them.

        ``mock_getcwd`` is presumably a patched ``os.getcwd`` injected by a
        decorator outside this view (TODO confirm); it is made to return a
        temporary directory, which also backs the model's FSStore.
        """
        with tempfile.TemporaryDirectory() as workdir:
            mock_getcwd.return_value = workdir
            postings = list(JobPostingCollectionSample())

            # Train a small word2vec embedding on the sample postings.
            corpus = Word2VecGensimCorpusCreator(postings, raw=True)
            embedding = Word2VecModel(
                storage=FSStore(workdir),
                size=10,
                min_count=0,
                alpha=0.025,
                min_alpha=0.025,
            )
            EmbeddingTrainer(corpus, embedding).train(True)

            # Build the training set, vectorise its features with the
            # embedding and fit a random forest on them.
            matrix = create_training_set(postings, SOCMajorGroup())
            features = EmbeddingTransformer(embedding).transform(matrix.X)
            forest = RandomForestClassifier()
            forest.fit(features, matrix.y)

            combined = CombinedClassifier(
                embedding, forest, matrix.target_variable)
            predictions = combined.predict_soc([matrix.X[0]])
            # The prediction for the single input has two components.
            assert len(predictions[0]) == 2
Beispiel #6
0
 def major_group(self):
     """Return a new SOCMajorGroup built with no arguments."""
     target = SOCMajorGroup()
     return target