Example #1
0
    def test_field_exceptions(self):
        """Duplicate FieldConfig ids for one field name must raise ValueError.

        Covers the three ways duplicates can be introduced: a field_dict with
        duplicate ids passed to the config constructor, a duplicate-bearing
        list set via add_multiple_config, and a second add_single_config that
        reuses an id already present for the field.
        """
        duplicate_a = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
        duplicate_b = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
        duplicates = [duplicate_a, duplicate_b]

        # Case 1: duplicates passed directly in the field_dict argument
        with self.assertRaises(ValueError):
            conf = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                      "movielens_test", {"test": duplicates})
            ContentAnalyzer(conf).fit()

        # Case 2: duplicate list set through add_multiple_config
        with self.assertRaises(ValueError):
            conf = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                      "movielens_test")
            conf.add_multiple_config("test", duplicates)
            ContentAnalyzer(conf).fit()

        # Case 3: second add_single_config reuses an existing id
        with self.assertRaises(ValueError):
            conf = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                      "movielens_test")
            conf.add_single_config("test", duplicate_a)
            conf.add_single_config("test", duplicate_b)
            ContentAnalyzer(conf).fit()
Example #2
0
    def test_create_content_embedding(self):
        """Fit an analyzer that builds a Gensim word embedding for 'Title'
        and check the serialized content holds an ndarray EmbeddingField."""
        config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory="movielens_test_embedding",
        )
        title_config = FieldConfig(
            WordEmbeddingTechnique(Gensim('glove-twitter-25')),
            NLTK(lemmatization=True, stopwords_removal=True))
        config.add_multiple_config(field_name='Title',
                                   config_list=[title_config])

        ContentAnalyzer(config).fit()

        # Locate the directory fit() produced and inspect one serialized item.
        for entry in os.listdir(THIS_DIR):
            full_path = os.path.join(THIS_DIR, entry)
            if not os.path.isdir(full_path):
                continue
            if 'movielens_test_embedding' not in str(entry):
                continue
            with lzma.open(os.path.join(full_path, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                title_repr = content.get_field("Title")[0]
                self.assertIsInstance(title_repr, EmbeddingField)
                self.assertIsInstance(title_repr.value, np.ndarray)
            break
    def test_create_contents_in_index(self):
        """Fit an analyzer that writes 'Title' representations into search and
        keyword indexes and verify the serialized content stores IndexFields."""
        output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")
        config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=output_dir,
        )

        # Three representations: a named search-index entry, a named
        # keyword-index entry, and an unnamed search-index entry.
        search_cfg = FieldConfig(OriginalData(),
                                 NLTK(lemmatization=True, stopwords_removal=True),
                                 SearchIndex(os.path.join(output_dir, "index")),
                                 "test_search")
        keyword_cfg = FieldConfig(SkLearnTfIdf(), NLTK(),
                                  KeywordIndex(os.path.join(output_dir, "index1")),
                                  "test_keyword")
        unnamed_cfg = FieldConfig(OriginalData(), NLTK(),
                                  SearchIndex(os.path.join(output_dir, "index")))
        config.add_multiple_config(
            field_name='Title',
            config_list=[search_cfg, keyword_cfg, unnamed_cfg])

        ContentAnalyzer(config).fit()

        # Find the produced output directory and check one serialized item.
        for entry in os.listdir(THIS_DIR):
            full_path = os.path.join(THIS_DIR, entry)
            if not (os.path.isdir(full_path)
                    and 'movielens_test_original_index' in str(entry)):
                continue
            with lzma.open(os.path.join(full_path, 'tt0113497.xz'),
                           'r') as file:
                content = pickle.load(file)
                for position in (0, 1):
                    representation = content.get_field("Title")[position]
                    self.assertIsInstance(representation, IndexField)
                    self.assertIsInstance(representation.value, str)
            break
    def test_produce_content_str(self):
        """Each embedding technique built from a string source name must
        resolve the expected embedding source and yield EmbeddingFields."""
        # Skipped unconditionally: every case downloads a remote model.
        self.skipTest("Test requires internet but is too complex to be mocked")

        # (technique, expected embedding-source class, preprocessor list)
        cases = [
            (WordEmbeddingTechnique('glove-twitter-25'),
             Gensim, [NLTK()]),
            (SentenceEmbeddingTechnique('paraphrase-distilroberta-base-v1'),
             Sbert, [NLTK()]),
            (FromWordsDocumentEmbeddingTechnique('glove-twitter-25',
                                                 Centroid()),
             Gensim, []),
            (FromSentencesDocumentEmbeddingTechnique(
                'paraphrase-distilroberta-base-v1', Centroid()),
             Sbert, [NLTK()]),
            (FromWordsSentenceEmbeddingTechnique('glove-twitter-25',
                                                 Centroid()),
             Gensim, []),
        ]
        for technique, source_cls, preprocessors in cases:
            self.assertIsInstance(technique.embedding_source, source_cls)
            embedding_list = technique.produce_content("Plot", preprocessors,
                                                       JSONFile(file_path))
            # movies_info_reduced holds 20 items -> one field per item.
            self.assertEqual(len(embedding_list), 20)
            self.assertIsInstance(embedding_list[0], EmbeddingField)
Example #5
0
    def test_create_content_embedding(self):
        """Fit a ContentAnalyzer that builds a doc-granularity centroid
        embedding of 'Title' from the glove-twitter-25 Gensim source."""
        config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path),
            id_field_name_list=['imdbID'],
            output_directory="movielens_test",
        )

        embedding_pipeline = FieldRepresentationPipeline(
            preprocessor_list=[
                NLTK(lemmatization=True, stopwords_removal=True)
            ],
            content_technique=EmbeddingTechnique(
                combining_technique=Centroid(),
                embedding_source=GensimDownloader(name='glove-twitter-25'),
                granularity='doc'))
        config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[embedding_pipeline]))

        ContentAnalyzer(config).fit()
 def test_produce_cotent(self):
     technique = WordEmbeddingTechnique(GensimFastText())
     embedding_list = technique.produce_content("Plot", [NLTK()],
                                                JSONFile(file_path))
     self.assertEqual(len(embedding_list), 20)
     self.assertIsInstance(embedding_list[0], EmbeddingField)
Example #7
0
    # NOTE(review): truncated fragment — the opening
    # `movies_ca_config.append_field_config(` line this argument list belongs
    # to was lost in extraction; kept byte-for-byte pending recovery.
    field_name='Title',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))

# Index the raw 'Year' field with a Lucene tf-idf representation.
year_tfidf_pipeline = FieldRepresentationPipeline(
    content_technique=LuceneTfIdf())
movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[year_tfidf_pipeline]))

# 'Genre' is preprocessed with NLTK (lemmatization + stopword removal)
# before the Lucene tf-idf technique is applied.
genre_preprocessors = [NLTK(lemmatization=True, stopwords_removal=True)]
movies_ca_config.append_field_config(
    field_name='Genre',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(preprocessor_list=genre_preprocessors,
                                    content_technique=LuceneTfIdf())
    ]))

# NOTE(review): this call is truncated by extraction — the
# `EmbeddingTechnique(` argument on the last line is never closed, so the
# fragment is not valid Python on its own; kept byte-for-byte.
movies_ca_config.append_field_config(
    field_name='Plot',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(preprocessor_list=[
            NLTK(lemmatization=True, stopwords_removal=True)
        ],
                                    content_technique=LuceneTfIdf()),
        FieldRepresentationPipeline(preprocessor_list=[
            NLTK(lemmatization=True, stopwords_removal=True)
        ],
                                    content_technique=EmbeddingTechnique(

# Plain Lucene tf-idf representation for the 'Year' field.
movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf()),
    ]),
)


# 'Genre' gets NLTK preprocessing before the Lucene tf-idf technique.
genre_pipeline = FieldRepresentationPipeline(
    preprocessor_list=[NLTK(lemmatization=True, stopwords_removal=True)],
    content_technique=LuceneTfIdf())
movies_ca_config.append_field_config(
    field_name='Genre',
    field_config=FieldConfig(pipelines_list=[genre_pipeline]),
)


# Two representations for 'Plot': a Whoosh tf-idf and a word-granularity
# centroid embedding from glove-twitter-25, each with its own NLTK
# (lemmatization + stopword removal) preprocessor instance.
plot_configs = [
    FieldConfig(WhooshTfIdf(),
                NLTK(lemmatization=True, stopwords_removal=True)),
    FieldConfig(EmbeddingTechnique(Centroid(),
                                   GensimDownloader(name='glove-twitter-25'),
                                   granularity='word'),
                NLTK(lemmatization=True, stopwords_removal=True)),
]
movies_ca_config.append_field_config('Plot', plot_configs)
        # NOTE(review): dangling fragment from a truncated example — the
        # enclosing FieldConfig/append_field_config call is missing and the
        # expression is never closed; kept byte-for-byte.
        pipelines_list=[FieldRepresentationPipeline(
                            preprocessor_list=[NLTK(lemmatization=True, stopwords_removal=True)],