Code example #1
    def test_two_filters(self):
        major_group_27_filter = lambda job: job['onet_soc_code'][:2] != '27'
        major_group_49_filter = lambda job: job['onet_soc_code'][:2] != '49'
        soc_target = SOCMajorGroup(
            [major_group_27_filter, major_group_49_filter])

        def new_filter(doc):
            # pass the document through when it survives both SOC filters;
            # anything else maps to None
            if soc_target.filter_func(doc):
                return doc
            else:
                return None

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        pipe_x = IterablePipeline(
            new_filter,
            partial(nlp.fields_join,
                    document_schema_fields=document_schema_fields),
            nlp.clean_str, nlp.word_tokenize,
            partial(nlp.vectorize, embedding_model=self.embedding_model))

        pipe_y = IterablePipeline(new_filter, soc_target.transformer)

        matrix = DesignMatrix(self.jobpostings, soc_target, pipe_x, pipe_y)
        matrix.build()
        assert '27' not in matrix.target_variable.encoder.inverse_transform(
            matrix.y)
        assert '49' not in matrix.target_variable.encoder.inverse_transform(
            matrix.y)
Code example #2
    def test_with_iterable_pipeline(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3 = S3Store('fake-open-skills/models')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')

        model_storage.save_model(fake, fake.model_name)
        vectorize_for_pipeline = partial(nlp.vectorize,
                                         embedding_model=SerializedByStorage(
                                             storage=s3,
                                             model_name=fake.model_name,
                                             model=fake))
        pipe = IterablePipeline(vectorize_for_pipeline)

        pipe_unpickled = pickle.loads(pickle.dumps(pipe))
        # make sure the fake model itself wasn't pickled, only the storage reference
        assert pipe_unpickled.functions[-1].keywords[
            'embedding_model']._model is None
        assert pipe_unpickled.functions[-1].keywords[
            'embedding_model'].storage.path == s3.path
        # The model will be loaded when it's needed
        assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
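The two assertions above depend on SerializedByStorage dropping its in-memory model at pickling time and keeping only the storage reference. A minimal illustrative sketch of that behavior, inferred from this test rather than taken from the skills-ml source (LazyModel is a hypothetical stand-in):

import pickle

class LazyModel:
    """Illustrative stand-in for SerializedByStorage, inferred from the test above."""
    def __init__(self, storage=None, model_name=None, model=None):
        self.storage = storage
        self.model_name = model_name
        self._model = model

    def __getstate__(self):
        # drop the in-memory model before pickling; keep only the
        # storage reference and model name
        state = self.__dict__.copy()
        state['_model'] = None
        return state

restored = pickle.loads(pickle.dumps(LazyModel(model='big-model')))
assert restored._model is None  # matches the first assertion above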
Code example #3
    def pipe_x(self):
        document_schema_fields = ['description', 'experienceRequirements', 'qualifications', 'skills']
        pipe_x = IterablePipeline(
            self.basic_filter,
            partial(nlp.fields_join, document_schema_fields=document_schema_fields),
            nlp.clean_str,
            nlp.word_tokenize,
            partial(nlp.vectorize, embedding_model=self.embedding_model)
        )
        return pipe_x
Code example #4
    def test_iterable_pipeline(self):
        def sentence_counter(doc):
            """count sentence for a document"""
            return len(doc)

        updated_fields_join = partial(fields_join,
                                      document_schema_fields=['description'])
        # copy fields_join's metadata (notably __doc__) onto the partial so
        # the description assertion below still sees a docstring
        update_wrapper(updated_fields_join, fields_join)

        pipe1 = IterablePipeline(updated_fields_join, clean_html,
                                 sentence_tokenize, clean_str, word_tokenize,
                                 sentence_counter)
        pipe2 = IterablePipeline(updated_fields_join, clean_html,
                                 sentence_tokenize, sentence_counter)

        pipe1_generator = pipe1(self.jp)
        pipe2_generator = pipe2(self.jp)

        assert list(pipe1_generator) == list(pipe2_generator)
        assert pipe1.description == [f.__doc__ for f in pipe1.functions]
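The last assertion implies that a pipeline's description is simply the docstring of each step, which is why update_wrapper is applied to the partial above. A toy sketch of that assumed behavior (MiniPipeline is hypothetical, not the library class):

class MiniPipeline:
    """Toy stand-in for IterablePipeline's description behavior (assumed)."""
    def __init__(self, *functions):
        self.functions = functions

    @property
    def description(self):
        # one docstring per step, mirroring the assertion above
        return [f.__doc__ for f in self.functions]

def tokenize(text):
    """tokenize text"""
    return text.split()

assert MiniPipeline(tokenize).description == ['tokenize text']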
Code example #5
    def __init__(self,
                 data_source_generator: Generator,
                 target_variable: TargetVariable,
                 pipe_X: IterablePipeline = None,
                 pipe_y: IterablePipeline = None):

        if pipe_X is None:
            pipe_X = IterablePipeline()
        if pipe_y is None:
            pipe_y = IterablePipeline()
        if not self._check_pipeline(pipe_X) or not self._check_pipeline(
                pipe_y):
            raise TypeError(
                "pipeline objects should be IterablePipeline instances")

        self._X = []
        self._y = []
        self.data_source_generator = data_source_generator
        self.pipe_X = pipe_X
        self.pipe_y = pipe_y
        self.target_variable = target_variable
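`_check_pipeline` itself is not shown on this page; a one-line implementation consistent with the TypeError above might look like this (an assumption, including the import path):

from skills_ml.algorithms.preprocessing import IterablePipeline  # assumed import path

def _check_pipeline(pipe) -> bool:
    """Assumed check: accept only IterablePipeline instances."""
    return isinstance(pipe, IterablePipeline)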
Code example #6
    def test_combined_processing_iterable(self):
        w2v = FakeEmbeddingModel(size=10)
        vectorization = ProcessingPipeline(
            normalize, clean_html, clean_str, word_tokenize,
            partial(vectorize, embedding_model=w2v))

        pipe_combined = IterablePipeline(self.updated_fields_join,
                                         vectorization)

        pipe_iterable = IterablePipeline(
            self.updated_fields_join, normalize, clean_html, clean_str,
            word_tokenize, partial(vectorize, embedding_model=w2v))

        pipe_combined_generator = pipe_combined(self.jp)
        pipe_iterable_generator = pipe_iterable(self.jp)

        combined = list(pipe_combined_generator)
        iterable = list(pipe_iterable_generator)

        assert len(combined) == len(iterable)

        for c, i in zip(combined, iterable):
            np.testing.assert_array_equal(c, i)
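This test asserts that a ProcessingPipeline used as a single step is equivalent to listing its functions individually, which suggests it simply composes them into one callable. A toy sketch of that assumed composition (MiniProcessingPipeline is hypothetical):

from functools import reduce

class MiniProcessingPipeline:
    """Toy stand-in: compose steps into a single callable (assumed behavior)."""
    def __init__(self, *functions):
        self.functions = functions

    def __call__(self, value):
        # feed each step's output into the next, left to right
        return reduce(lambda acc, f: f(acc), self.functions, value)

double_then_inc = MiniProcessingPipeline(lambda x: x * 2, lambda x: x + 1)
assert double_then_inc(3) == 7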
Code example #7
File: test_storage.py Project: resgen/skills-ml
    def test_with_iterable_pipeline(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(s3)

        proxy_fake = ProxyObjectWithStorage(model_obj=FakeModel('fake'), storage=s3, model_name='fake')
        model_storage.save_model(proxy_fake, proxy_fake.model_name)

        vectorize_for_pipeline = partial(nlp.vectorize, embedding_model=SerializedByStorage(model=proxy_fake, model_name=proxy_fake.model_name))
        pipe = IterablePipeline(vectorize_for_pipeline)

        s3.write(pickle.dumps(pipe), 'fake.pipe')
        pipe_unpickled = pickle.loads(s3.load('fake.pipe'))

        assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
Code example #8
    def pipe_y(self):
        pipe_y = IterablePipeline(self.basic_filter,
                                  self.major_group.transformer)
        return pipe_y
Code example #9
class JobGenerator(object):
    def __init__(self, data):
        self.data = data

    @property
    def metadata(self):
        return job_samples.metadata

    def __iter__(self):
        yield from self.data

document_schema_fields = ['description', 'experienceRequirements', 'qualifications', 'skills']
pipe_x = IterablePipeline(
    basic_filter,
    partial(nlp.fields_join, document_schema_fields=document_schema_fields),
    nlp.clean_str,
    nlp.word_tokenize,
    partial(nlp.vectorize, embedding_model=w2v)
)
pipe_y = IterablePipeline(
    basic_filter,
    full_soc.transformer
)

matrix = DesignMatrix(
        data_source_generator=JobGenerator(train_data),
        target_variable=full_soc,
        pipe_X=pipe_x,
        pipe_y=pipe_y)

matrix.build()
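A hypothetical follow-up to the build, reading the results the way example #1 reads `matrix.y`; note that the `matrix.X` attribute name is an assumption here:

# Hypothetical inspection of the built matrix. `matrix.y` and the encoder
# round-trip mirror example #1; `matrix.X` is an assumed attribute name.
print(len(matrix.X), len(matrix.y))
print(matrix.target_variable.encoder.inverse_transform(matrix.y)[:5])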
Code example #10
    def pipeline_y(self):
        return IterablePipeline(*self._preprocessing_y)
Code example #11
    def pipeline_X(self):
        steps = self._preprocessing_X.copy()
        steps.append(lambda x: self.classifier.predict([x]))
        return IterablePipeline(*steps)
Code example #12
    def pipe_y(self):
        pipe_y = IterablePipeline(
            self.fullsoc.filter,
            self.fullsoc.transformer
        )
        return pipe_y