def test_two_filters(self):
    major_group_27_filter = lambda job: job['onet_soc_code'][:2] != '27'
    major_group_49_filter = lambda job: job['onet_soc_code'][:2] != '49'
    soc_target = SOCMajorGroup([major_group_27_filter, major_group_49_filter])

    def new_filter(doc):
        if soc_target.filter_func(doc):
            return doc
        else:
            return None

    document_schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills'
    ]
    pipe_x = IterablePipeline(
        new_filter,
        partial(nlp.fields_join, document_schema_fields=document_schema_fields),
        nlp.clean_str,
        nlp.word_tokenize,
        partial(nlp.vectorize, embedding_model=self.embedding_model)
    )
    pipe_y = IterablePipeline(new_filter, soc_target.transformer)

    matrix = DesignMatrix(self.jobpostings, soc_target, pipe_x, pipe_y)
    matrix.build()

    assert '27' not in matrix.target_variable.encoder.inverse_transform(matrix.y)
    assert '49' not in matrix.target_variable.encoder.inverse_transform(matrix.y)
def test_with_iterable_pipeline(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills/models')
    model_storage = ModelStorage(storage=s3)
    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)

    vectorize_for_pipeline = partial(
        nlp.vectorize,
        embedding_model=SerializedByStorage(
            storage=s3, model_name=fake.model_name, model=fake))
    pipe = IterablePipeline(vectorize_for_pipeline)
    pipe_unpickled = pickle.loads(pickle.dumps(pipe))

    # make sure the storage reference was pickled, not the fake model itself
    assert pipe_unpickled.functions[-1].keywords['embedding_model']._model is None
    assert pipe_unpickled.functions[-1].keywords['embedding_model'].storage.path == s3.path
    # the model is loaded from storage when it's needed
    assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
def pipe_x(self):
    document_schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills'
    ]
    pipe_x = IterablePipeline(
        self.basic_filter,
        partial(nlp.fields_join, document_schema_fields=document_schema_fields),
        nlp.clean_str,
        nlp.word_tokenize,
        partial(nlp.vectorize, embedding_model=self.embedding_model)
    )
    return pipe_x
def test_iterable_pipeline(self):
    def sentence_counter(doc):
        """count sentences for a document"""
        return len(doc)

    updated_fields_join = partial(fields_join, document_schema_fields=['description'])
    update_wrapper(updated_fields_join, fields_join)

    pipe1 = IterablePipeline(
        updated_fields_join,
        clean_html,
        sentence_tokenize,
        clean_str,
        word_tokenize,
        sentence_counter
    )
    pipe2 = IterablePipeline(
        updated_fields_join,
        clean_html,
        sentence_tokenize,
        sentence_counter
    )
    pipe1_generator = pipe1(self.jp)
    pipe2_generator = pipe2(self.jp)
    assert list(pipe1_generator) == list(pipe2_generator)
    assert pipe1.description == [f.__doc__ for f in pipe1.functions]
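# Why update_wrapper matters in the test above: a bare functools.partial carries
# the partial class's own __doc__, so pipe1.description would not reflect the
# wrapped function. A standalone illustration (this fields_join body is a
# stand-in for the library's implementation, not the real one):
from functools import partial, update_wrapper

def fields_join(doc, document_schema_fields=None):
    """join selected document fields into one string"""
    return ' '.join(doc[field] for field in document_schema_fields)

updated = partial(fields_join, document_schema_fields=['description'])
update_wrapper(updated, fields_join)  # copies __doc__ (and __name__, etc.) onto the partial
assert updated.__doc__ == fields_join.__doc__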
def __init__(self,
             data_source_generator: Generator,
             target_variable: TargetVariable,
             pipe_X: IterablePipeline = None,
             pipe_y: IterablePipeline = None):
    if pipe_X is None:
        pipe_X = IterablePipeline()
    if pipe_y is None:
        pipe_y = IterablePipeline()
    if not self._check_pipeline(pipe_X) or not self._check_pipeline(pipe_y):
        raise TypeError("pipeline object should be an IterablePipeline object")
    self._X = []
    self._y = []
    self.data_source_generator = data_source_generator
    self.pipe_X = pipe_X
    self.pipe_y = pipe_y
    self.target_variable = target_variable
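# _check_pipeline is referenced above but not shown. A minimal sketch, assuming
# it only needs to validate the pipeline's type (the real helper may do more):
def _check_pipeline(self, pipeline):
    return isinstance(pipeline, IterablePipeline)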
def test_combined_processing_iterable(self):
    w2v = FakeEmbeddingModel(size=10)
    vectorization = ProcessingPipeline(
        normalize,
        clean_html,
        clean_str,
        word_tokenize,
        partial(vectorize, embedding_model=w2v)
    )
    pipe_combined = IterablePipeline(self.updated_fields_join, vectorization)
    pipe_iterable = IterablePipeline(
        self.updated_fields_join,
        normalize,
        clean_html,
        clean_str,
        word_tokenize,
        partial(vectorize, embedding_model=w2v)
    )
    pipe_combined_generator = pipe_combined(self.jp)
    pipe_iterable_generator = pipe_iterable(self.jp)
    combined = list(pipe_combined_generator)
    iterable = list(pipe_iterable_generator)
    assert len(combined) == len(iterable)
    for c, i in zip(combined, iterable):
        np.testing.assert_array_equal(c, i)
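# The test above relies on a ProcessingPipeline being callable on a single
# document, so it can slot into an IterablePipeline as one step. A minimal
# sketch of that contract, inferred from these tests only; the real classes
# likely differ in details:
class ProcessingPipeline(object):
    """Eagerly composes functions over one document."""
    def __init__(self, *functions):
        self.functions = functions

    def __call__(self, doc):
        for f in self.functions:
            doc = f(doc)
        return doc


class IterablePipeline(object):
    """Lazily applies each function to every document in an iterable."""
    def __init__(self, *functions):
        self.functions = functions

    @property
    def description(self):
        # matches the assertion in test_iterable_pipeline above
        return [f.__doc__ for f in self.functions]

    def __call__(self, docs):
        for doc in docs:
            for f in self.functions:
                doc = f(doc)
                if doc is None:  # inferred: filter steps map dropped docs to None
                    break
            if doc is not None:
                yield doc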
def test_with_iterable_pipeline(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(s3)

    proxy_fake = ProxyObjectWithStorage(
        model_obj=FakeModel('fake'), storage=s3, model_name='fake')
    model_storage.save_model(proxy_fake, proxy_fake.model_name)

    vectorize_for_pipeline = partial(
        nlp.vectorize,
        embedding_model=SerializedByStorage(
            model=proxy_fake, model_name=proxy_fake.model_name))
    pipe = IterablePipeline(vectorize_for_pipeline)

    s3.write(pickle.dumps(pipe), 'fake.pipe')
    pipe_unpickled = pickle.loads(s3.load('fake.pipe'))
    assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
def pipe_y(self):
    pipe_y = IterablePipeline(self.basic_filter, self.major_group.transformer)
    return pipe_y
class JobGenerator(object):
    def __init__(self, data):
        self.data = data

    @property
    def metadata(self):
        return job_samples.metadata

    def __iter__(self):
        yield from self.data


document_schema_fields = [
    'description',
    'experienceRequirements',
    'qualifications',
    'skills'
]
pipe_x = IterablePipeline(
    basic_filter,
    partial(nlp.fields_join, document_schema_fields=document_schema_fields),
    nlp.clean_str,
    nlp.word_tokenize,
    partial(nlp.vectorize, embedding_model=w2v)
)
pipe_y = IterablePipeline(
    basic_filter,
    full_soc.transformer
)

matrix = DesignMatrix(
    data_source_generator=JobGenerator(train_data),
    target_variable=full_soc,
    pipe_X=pipe_x,
    pipe_y=pipe_y)
matrix.build()
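# Once built, the matrix can feed an estimator directly. A usage sketch,
# assuming matrix.X exposes the built feature rows the way matrix.y exposes
# the encoded targets (matrix.y appears in the tests above; matrix.X is an
# assumption here):
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(matrix.X, matrix.y)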
def pipeline_y(self):
    return IterablePipeline(*self._preprocessing_y)
def pipeline_X(self):
    steps = self._preprocessing_X.copy()
    steps.append(lambda x: self.classifier.predict([x]))
    return IterablePipeline(*steps)
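# Because pipeline_X appends the classifier's predict call as its final step,
# calling the returned pipeline on raw documents yields one prediction per
# document. A usage sketch (trained_classifier and job_postings are
# hypothetical names; pipeline_X is assumed to be exposed as a property):
predictor = trained_classifier.pipeline_X
for prediction in predictor(job_postings):
    # each item is the array self.classifier.predict([x]) returns for one doc
    print(prediction)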
def pipe_y(self):
    pipe_y = IterablePipeline(
        self.fullsoc.filter,
        self.fullsoc.transformer
    )
    return pipe_y