def test_embedding_feature(self):
    """Check the output shapes when only "EmbeddingFeature" is selected.

    A small Word2Vec model is trained on a 30-record sample corpus, then
    used by a SequenceFeatureCreator over a separate 50-record raw corpus.
    """
    schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills'
    ]
    training_corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields,
        raw=True)
    embedding_model = Word2VecModel(
        size=10, min_count=0, iter=4, window=6, workers=3)
    EmbeddingTrainer(embedding_model).train(training_corpus)

    # Duplicate the raw stream: one copy feeds the feature creator, the
    # other is tokenized directly to serve as the reference.
    feature_stream, reference_stream = tee(
        RawCorpusCreator(JobPostingCollectionSample(num_records=50)))
    feature_iter = iter(SequenceFeatureCreator(
        feature_stream,
        sentence_tokenizer=sentence_tokenize,
        word_tokenizer=word_tokenize,
        embedding_model=embedding_model,
        features=["EmbeddingFeature"]))

    # The first yielded item must have the same leading dimension as the
    # first item of the reference tokenization.
    produced_rows = next(feature_iter).shape[0]
    reference_tokens = next(iter(
        word_tokenizer_gen(sentence_tokenizer_gen(reference_stream))))
    self.assertEqual(produced_rows, np.array(reference_tokens).shape[0])

    # First row of the second yielded item; 10 equals the `size` given to
    # Word2VecModel above (presumably the embedding dimension).
    self.assertEqual(next(feature_iter)[0].shape[0], 10)
def test_contextual_feature(self):
    """Check the output shapes when only "ContextualFeature" is selected."""
    # Duplicate the raw stream: one copy feeds the feature creator, the
    # other is tokenized directly to serve as the reference.
    feature_stream, reference_stream = tee(
        RawCorpusCreator(JobPostingCollectionSample()))
    feature_iter = iter(SequenceFeatureCreator(
        feature_stream, features=["ContextualFeature"]))

    # The first yielded item must have the same leading dimension as the
    # first item of the reference tokenization.
    produced_rows = next(feature_iter).shape[0]
    reference_tokens = next(iter(
        word_tokenizer_gen(sentence_tokenizer_gen(reference_stream))))
    self.assertEqual(produced_rows, np.array(reference_tokens).shape[0])

    # First row of the second yielded item is expected to have length 17.
    self.assertEqual(next(feature_iter)[0].shape[0], 17)
def test_skill_feature(self):
    """Check the default feature set and the unsupported-feature error.

    A small Word2Vec model is trained on a 30-record sample corpus and
    passed to a SequenceFeatureCreator built without a `features` argument.
    """
    schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills'
    ]
    training_corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields,
        raw=True)
    embedding_model = Word2VecModel(
        size=10, min_count=0, iter=4, window=6, workers=3)
    EmbeddingTrainer(embedding_model).train(training_corpus)

    feature_stream, reference_stream = tee(
        RawCorpusCreator(JobPostingCollectionSample()))

    # Default: with no `features` argument, every feature is selected.
    expected_features = [
        "StructuralFeature", "ContextualFeature", "EmbeddingFeature"]
    default_creator = SequenceFeatureCreator(
        feature_stream, embedding_model=embedding_model)
    self.assertEqual(default_creator.selected_features, expected_features)
    self.assertEqual(default_creator.all_features, expected_features)

    feature_iter = iter(default_creator)
    # The first yielded item must have the same leading dimension as the
    # first item of the reference tokenization.
    produced_rows = next(feature_iter).shape[0]
    reference_tokens = next(iter(
        word_tokenizer_gen(sentence_tokenizer_gen(reference_stream))))
    self.assertEqual(produced_rows, np.array(reference_tokens).shape[0])

    # First row of the second yielded item is expected to have length 29.
    self.assertEqual(next(feature_iter)[0].shape[0], 29)

    # Not supported: an unknown feature name must surface as a TypeError
    # when the first item is requested. The same (already partly consumed)
    # feature stream is reused here.
    unsupported_iter = iter(SequenceFeatureCreator(
        feature_stream, features=["FeatureNotSupported"]))
    with self.assertRaises(TypeError):
        next(unsupported_iter)