def test_field_exceptions(self):
    """Verify that duplicate FieldConfig ids for the same field name raise ValueError.

    Three entry points are exercised: a field_dict containing duplicates passed
    to the config constructor, a duplicate list set via add_multiple_config, and
    a second config appended via add_single_config when its id already exists.
    """
    duplicate_a = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
    duplicate_b = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
    duplicates = [duplicate_a, duplicate_b]

    # Case 1: duplicates already present in the field_dict handed to the constructor.
    with self.assertRaises(ValueError):
        conflicting_dict = {"test": duplicates}
        ca_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                       "movielens_test", conflicting_dict)
        ContentAnalyzer(ca_config).fit()

    # Case 2: the duplicate list is set at once for a single field name.
    with self.assertRaises(ValueError):
        ca_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                       "movielens_test")
        ca_config.add_multiple_config("test", duplicates)
        ContentAnalyzer(ca_config).fit()

    # Case 3: the second config is appended when its id is already in the list.
    with self.assertRaises(ValueError):
        ca_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                       "movielens_test")
        ca_config.add_single_config("test", duplicate_a)
        ca_config.add_single_config("test", duplicate_b)
        ContentAnalyzer(ca_config).fit()
def test_create_content_embedding(self):
    """Fit a ContentAnalyzer using a Gensim word-embedding technique on 'Title'
    and check the serialized content holds an EmbeddingField whose value is an
    ndarray.

    Fix: the original loop silently passed (asserting nothing) when the
    expected output directory was never created; an explicit found-flag now
    fails the test in that case.
    """
    movies_ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory="movielens_test_embedding",
    )
    movies_ca_config.add_multiple_config(
        field_name='Title',
        config_list=[FieldConfig(
            WordEmbeddingTechnique(Gensim('glove-twitter-25')),
            NLTK(lemmatization=True, stopwords_removal=True))])

    content_analyzer = ContentAnalyzer(movies_ca_config)
    content_analyzer.fit()

    output_found = False
    for name in os.listdir(THIS_DIR):
        if os.path.isdir(os.path.join(THIS_DIR, name)) \
                and 'movielens_test_embedding' in str(name):
            with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                self.assertIsInstance(content.get_field("Title")[0], EmbeddingField)
                self.assertIsInstance(content.get_field("Title")[0].value, np.ndarray)
            output_found = True
            break
    # Guard against a false positive: without this the test passes even when
    # fit() produced no output directory at all.
    self.assertTrue(output_found,
                    "expected output directory 'movielens_test_embedding*' was not created")
def test_create_contents_in_index(self):
    """Fit a ContentAnalyzer that writes 'Title' representations into search and
    keyword indexes, then verify the serialized content exposes IndexField
    entries with string values.

    Fix: the original loop silently passed (asserting nothing) when the
    expected output directory was never created; an explicit found-flag now
    fails the test in that case.
    """
    output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")

    movies_ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory=output_dir,
    )
    movies_ca_config.add_multiple_config(
        field_name='Title',
        config_list=[
            FieldConfig(OriginalData(), NLTK(lemmatization=True, stopwords_removal=True),
                        SearchIndex(os.path.join(output_dir, "index")), "test_search"),
            FieldConfig(SkLearnTfIdf(), NLTK(),
                        KeywordIndex(os.path.join(output_dir, "index1")), "test_keyword"),
            FieldConfig(OriginalData(), NLTK(),
                        SearchIndex(os.path.join(output_dir, "index")))
        ])

    content_analyzer = ContentAnalyzer(movies_ca_config)
    content_analyzer.fit()

    output_found = False
    for name in os.listdir(THIS_DIR):
        if os.path.isdir(os.path.join(THIS_DIR, name)) \
                and 'movielens_test_original_index' in str(name):
            with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                self.assertIsInstance(content.get_field("Title")[0], IndexField)
                self.assertIsInstance(content.get_field("Title")[0].value, str)
                self.assertIsInstance(content.get_field("Title")[1], IndexField)
                self.assertIsInstance(content.get_field("Title")[1].value, str)
            output_found = True
            break
    # Guard against a false positive: without this the test passes even when
    # fit() produced no output directory at all.
    self.assertTrue(output_found,
                    "expected output directory 'movielens_test_original_index*' was not created")
def test_produce_content_str(self):
    """Smoke-test each embedding technique built from a plain string identifier.

    Every technique must expose the expected embedding-source class and produce
    one EmbeddingField per record of the 20-record source file.

    Skipped: requires downloading embedding models from the internet and is too
    complex to be mocked.
    """
    self.skipTest("Test requires internet but is too complex to be mocked")

    # (technique, expected embedding-source class, preprocessor list) cases,
    # mirroring the original five sequential checks.
    cases = [
        (WordEmbeddingTechnique('glove-twitter-25'),
         Gensim, [NLTK()]),
        (SentenceEmbeddingTechnique('paraphrase-distilroberta-base-v1'),
         Sbert, [NLTK()]),
        (FromWordsDocumentEmbeddingTechnique('glove-twitter-25', Centroid()),
         Gensim, []),
        (FromSentencesDocumentEmbeddingTechnique('paraphrase-distilroberta-base-v1', Centroid()),
         Sbert, [NLTK()]),
        (FromWordsSentenceEmbeddingTechnique('glove-twitter-25', Centroid()),
         Gensim, []),
    ]
    for technique, source_cls, preprocessors in cases:
        self.assertIsInstance(technique.embedding_source, source_cls)
        embedding_list = technique.produce_content("Plot", preprocessors, JSONFile(file_path))
        self.assertEqual(len(embedding_list), 20)
        self.assertIsInstance(embedding_list[0], EmbeddingField)
def test_create_content_embedding(self):
    """Build an item config with a doc-granularity embedding pipeline on the
    'Title' field and run the ContentAnalyzer over it (legacy
    ContentAnalyzerConfig API).
    """
    # The single representation pipeline: NLTK preprocessing feeding a
    # centroid-combined, document-granularity Gensim embedding.
    embedding_pipeline = FieldRepresentationPipeline(
        preprocessor_list=[
            NLTK(lemmatization=True, stopwords_removal=True)
        ],
        content_technique=EmbeddingTechnique(
            combining_technique=Centroid(),
            embedding_source=GensimDownloader(name='glove-twitter-25'),
            granularity='doc'))

    movies_ca_config = ContentAnalyzerConfig(
        content_type='Item',
        source=JSONFile(file_path),
        id_field_name_list=['imdbID'],
        output_directory="movielens_test",
    )
    movies_ca_config.append_field_config(
        field_name='Title',
        field_config=FieldConfig(pipelines_list=[embedding_pipeline]))

    analyzer = ContentAnalyzer(movies_ca_config)
    analyzer.fit()
def test_produce_cotent(self):
    """Check that WordEmbeddingTechnique over a GensimFastText source yields
    one EmbeddingField per record of the 20-record source file.

    NOTE(review): the method name has a typo ("cotent"); kept as-is to
    preserve the existing public test identifier.
    """
    fasttext_technique = WordEmbeddingTechnique(GensimFastText())
    produced = fasttext_technique.produce_content("Plot", [NLTK()], JSONFile(file_path))
    self.assertEqual(20, len(produced))
    self.assertIsInstance(produced[0], EmbeddingField)
field_name='Title', field_config=FieldConfig(pipelines_list=[ FieldRepresentationPipeline(content_technique=LuceneTfIdf()) ])) movies_ca_config.append_field_config( field_name='Year', field_config=FieldConfig(pipelines_list=[ FieldRepresentationPipeline(content_technique=LuceneTfIdf()) ])) movies_ca_config.append_field_config( field_name='Genre', field_config=FieldConfig(pipelines_list=[ FieldRepresentationPipeline(preprocessor_list=[ NLTK(lemmatization=True, stopwords_removal=True) ], content_technique=LuceneTfIdf()) ])) movies_ca_config.append_field_config( field_name='Plot', field_config=FieldConfig(pipelines_list=[ FieldRepresentationPipeline(preprocessor_list=[ NLTK(lemmatization=True, stopwords_removal=True) ], content_technique=LuceneTfIdf()), FieldRepresentationPipeline(preprocessor_list=[ NLTK(lemmatization=True, stopwords_removal=True) ], content_technique=EmbeddingTechnique(
movies_ca_config.append_field_config( field_name='Year', field_config=FieldConfig( pipelines_list=[FieldRepresentationPipeline( content_technique=LuceneTfIdf())] ) ) movies_ca_config.append_field_config( field_name='Genre', field_config=FieldConfig( pipelines_list=[FieldRepresentationPipeline( preprocessor_list=[NLTK(lemmatization=True, stopwords_removal=True)], content_technique=LuceneTfIdf())] ) ) movies_ca_config.append_field_config( 'Plot', [ FieldConfig(WhooshTfIdf(),NLTK(lemmatization=True, stopwords_removal=True)), FieldConfig(EmbeddingTechnique(Centroid(), GensimDownloader(name='glove-twitter-25'), granularity='word'), NLTK(lemmatization=True, stopwords_removal=True)) ] ) pipelines_list=[FieldRepresentationPipeline( preprocessor_list=[NLTK(lemmatization=True, stopwords_removal=True)],