Ejemplo n.º 1
0
    def test_field_exceptions(self):
        """Duplicate FieldConfig ids for one field name must raise ValueError.

        Covers the three ways duplicates can be introduced: a field_dict with
        duplicate ids passed to the config constructor, a duplicated list set
        via add_multiple_config, and a second config appended with an
        already-used id via add_single_config.
        """
        duplicated = [FieldConfig(SkLearnTfIdf(), NLTK(), id="test"),
                      FieldConfig(SkLearnTfIdf(), NLTK(), id="test")]
        field_dict = {"test": duplicated}

        # Duplicates passed directly in the constructor's field_dict.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test", field_dict)
            ContentAnalyzer(analyzer_config).fit()

        # Duplicates set through add_multiple_config.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            analyzer_config.add_multiple_config("test", duplicated)
            ContentAnalyzer(analyzer_config).fit()

        # Duplicate appended one config at a time through add_single_config.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            analyzer_config.add_single_config("test", duplicated[0])
            analyzer_config.add_single_config("test", duplicated[1])
            ContentAnalyzer(analyzer_config).fit()
Ejemplo n.º 2
0
    def test_create_content_embedding(self):
        """Fit the analyzer with a Gensim word-embedding technique and check
        that the serialized content holds an EmbeddingField whose value is a
        numpy ndarray.

        Fix: the original loop passed vacuously when no output directory
        matched — no assertion ever ran. A found-flag now forces an explicit
        failure in that case.
        """
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory="movielens_test_embedding",
        )

        movies_ca_config.add_multiple_config(
            field_name='Title',
            config_list=[FieldConfig(
                    WordEmbeddingTechnique(Gensim('glove-twitter-25')),
                    NLTK(lemmatization=True, stopwords_removal=True))])

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()

        found = False
        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test_embedding' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(content.get_field("Title")[0], EmbeddingField)
                    self.assertIsInstance(content.get_field("Title")[0].value, np.ndarray)
                    found = True
                    break

        self.assertTrue(found, "no 'movielens_test_embedding' output directory was produced")
    def test_create_content_tfidf(self):
        """Fit the analyzer with SkLearnTfIdf and check that the serialized
        content holds a FeaturesBagField whose value is a dict.

        Fix: the original loop passed vacuously when no output directory
        matched — no assertion ever ran. A found-flag now forces an explicit
        failure in that case.
        """
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id='imdbID',
            output_directory="movielens_test_tfidf",
        )

        movies_ca_config.add_multiple_config(
            field_name='Title', config_list=[FieldConfig(SkLearnTfIdf())])

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()

        found = False
        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test_tfidf' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'),
                               'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(
                        content.get_field("Title")[0], FeaturesBagField)
                    self.assertIsInstance(
                        content.get_field("Title")[0].value, dict)
                    found = True
                    break

        self.assertTrue(found, "no 'movielens_test_tfidf' output directory was produced")
    def test_create_contents_in_index(self):
        """Fit the analyzer with index-backed field configs (SearchIndex and
        KeywordIndex) and check that both indexed representations of 'Title'
        are IndexField instances with str values.

        Fix: the original loop passed vacuously when no output directory
        matched — no assertion ever ran. A found-flag now forces an explicit
        failure in that case.
        """
        output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=output_dir,
        )

        movies_ca_config.add_multiple_config(
            field_name='Title',
            config_list=[
                FieldConfig(OriginalData(),
                            NLTK(lemmatization=True, stopwords_removal=True),
                            SearchIndex(os.path.join(output_dir, "index")),
                            "test_search"),
                FieldConfig(SkLearnTfIdf(), NLTK(),
                            KeywordIndex(os.path.join(output_dir, "index1")),
                            "test_keyword"),
                FieldConfig(OriginalData(), NLTK(),
                            SearchIndex(os.path.join(output_dir, "index")))
            ])

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()

        found = False
        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test_original_index' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'),
                               'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(
                        content.get_field("Title")[0], IndexField)
                    self.assertIsInstance(
                        content.get_field("Title")[0].value, str)
                    self.assertIsInstance(
                        content.get_field("Title")[1], IndexField)
                    self.assertIsInstance(
                        content.get_field("Title")[1].value, str)
                    found = True
                    break

        self.assertTrue(found, "no 'movielens_test_original_index' output directory was produced")
Ejemplo n.º 5
0
    def test_fit_export_json(self):
        """Fit with export_json=True and inspect the exported contents.json:
        it must exist, contain 20 processed items, and each item must expose
        the representations produced for 'Plot' and 'imdbRating'.
        """
        analyzer_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=self.out_dir,
            export_json=True
        )

        # Two representations for 'Plot', one default for 'imdbRating'.
        for field_name, field_config in (('Plot', FieldConfig(OriginalData())),
                                         ('Plot', FieldConfig(SkLearnTfIdf())),
                                         ('imdbRating', FieldConfig())):
            analyzer_config.add_single_config(field_name, field_config)

        ContentAnalyzer(analyzer_config).fit()

        json_path = os.path.join(self.out_dir, 'contents.json')
        self.assertTrue(os.path.isfile(json_path))

        exported_contents = list(JSONFile(json_path))
        self.assertEqual(len(exported_contents), 20)
        for exported in exported_contents:
            for expected_key in ('Plot#0', 'Plot#1', 'imdbRating#0'):
                self.assertIn(expected_key, exported)

    # def doCleanups(self) -> None:
    #     if os.path.isdir(self.out_dir):
    #         shutil.rmtree(self.out_dir)
    def test_exogenous_exceptions(self):
        """Duplicate ExogenousConfig ids must raise ValueError.

        Two scenarios: duplicates passed directly in the constructor's
        exogenous_representation_list argument, and a duplicate appended
        later through add_single_exogenous.
        """
        duplicated = [
            ExogenousConfig(DBPediaMappingTechnique('dbo:Film', 'Title'), "test"),
            ExogenousConfig(DBPediaMappingTechnique('dbo:Film', 'Title'), "test"),
        ]

        # Duplicates supplied at construction time.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(
                JSONFile(movies_info_reduced), ["imdbID"],
                "movielens_test",
                exogenous_representation_list=duplicated)
            ContentAnalyzer(analyzer_config).fit()

        # Duplicate appended one config at a time.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced),
                                                 ["imdbID"], "movielens_test")
            analyzer_config.add_single_exogenous(duplicated[0])
            analyzer_config.add_single_exogenous(duplicated[1])
            ContentAnalyzer(analyzer_config).fit()
Ejemplo n.º 7
0
    def test_decode_field_data_embedding(self):
        """Fit the analyzer on a source with pre-serialized embedding data and
        check that the decoded field is an EmbeddingField holding an ndarray.

        Fix: the original loop passed vacuously when no output directory
        matched — no assertion ever ran. A found-flag now forces an explicit
        failure in that case.
        """
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(decode_embedding),
            id=['imdbID'],
            output_directory=decode_path + 'movies_embedding_'
        )

        movies_ca_config.add_multiple_config(
            field_name='Title',
            config_list=[FieldConfig()]
        )
        ContentAnalyzer(config=movies_ca_config).fit()

        found = False
        for name in os.listdir(decode_path):
            if os.path.isdir(os.path.join(decode_path, name)) \
                    and 'movies_embedding_' in str(name):
                with lzma.open(os.path.join(decode_path, name, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(content.get_field("Title")[0], EmbeddingField)
                    self.assertIsInstance(content.get_field("Title")[0].value, np.ndarray)
                    found = True
                    break

        self.assertTrue(found, "no 'movies_embedding_' output directory was produced")
Ejemplo n.º 8
0
    def test_create_content(self):
        """Fit the analyzer with an entity-linking field config plus a DBPedia
        exogenous config and check that the serialized 'Title' field is a
        FeaturesBagField whose value is a dict.

        Fix: the original loop passed vacuously when no output directory
        matched — no assertion ever ran. A found-flag now forces an explicit
        failure in that case.
        """
        plot_config = FieldConfig(BabelPyEntityLinking())
        exogenous_config = ExogenousConfig(DBPediaMappingTechnique('Film', 'EN', 'Title'))
        content_analyzer_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
        content_analyzer_config.add_single_config("Title", plot_config)
        content_analyzer_config.add_single_exogenous(exogenous_config)
        content_analyzer = ContentAnalyzer(content_analyzer_config)
        content_analyzer.fit()

        found = False
        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(content.get_field("Title")[0], FeaturesBagField)
                    self.assertIsInstance(content.get_field("Title")[0].value, dict)
                    found = True
                    break

        self.assertTrue(found, "no 'movielens_test' output directory was produced")
Ejemplo n.º 9
0
            timestamp_field_name='timestamp',
        )
        ratings_frame = ratings_importer.import_ratings()
                
                
                
    RatingsImporter(
        source: RawInformationSource,
        from_id_column: Union[str, int] = 0,
        to_id_column: Union[str, int] = 1,
        score_column: Union[str, int] = 2,
        timestamp_column: Union[str, int] = None,
        score_processor: RatingProcessor = None
    )
    
    ri = RatingsImporter(CSVFile(ratings_filename))
    ri.import_ratings()
    ri.add_score_column(
        score_column = 'title_review',
        column_name = 'title_sentiment',
        score_processor = TextBlobSentimentAnalysis()
    )
                
    from orange_cb_recsys.content_analyzer import ItemAnalyzerConfig, JSONFile
                
    config = ItemAnalyzerConfig(
        source=JSONFile(raw_source),
        id="imdbId",
        output_directory='movies_codified/'
    )