コード例 #1
0
    def test_exogenous_exceptions(self):
        """Duplicate ids in the exogenous_representation_list raise ValueError.

        Two scenarios are covered: the duplicated list is passed directly to
        the ItemAnalyzerConfig constructor, and the second duplicate is
        appended afterwards through add_single_exogenous.
        """
        first_config = ExogenousConfig(
            DBPediaMappingTechnique('dbo:Film', 'Title'), "test")
        second_config = ExogenousConfig(
            DBPediaMappingTechnique('dbo:Film', 'Title'), "test")
        duplicated_list = [first_config, second_config]

        # Case 1: duplicates supplied directly through the constructor.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(
                JSONFile(movies_info_reduced), ["imdbID"],
                "movielens_test",
                exogenous_representation_list=duplicated_list)
            ContentAnalyzer(analyzer_config).fit()

        # Case 2: duplicates appended one at a time.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(
                JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            analyzer_config.add_single_exogenous(first_config)
            analyzer_config.add_single_exogenous(second_config)
            ContentAnalyzer(analyzer_config).fit()
コード例 #2
0
    def test_field_exceptions(self):
        """Duplicate FieldConfig ids for the same field name raise ValueError.

        Three scenarios are covered: duplicates passed via the field_dict
        constructor argument, duplicates set in one call with
        add_multiple_config, and a duplicate appended later with
        add_single_config.
        """
        first_config = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
        second_config = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
        duplicated_list = [first_config, second_config]
        field_dict = {"test": duplicated_list}

        # Case 1: duplicates supplied directly through the constructor.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(
                JSONFile(movies_info_reduced), ["imdbID"], "movielens_test",
                field_dict)
            ContentAnalyzer(analyzer_config).fit()

        # Case 2: duplicates set in a single call for one field name.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(
                JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            analyzer_config.add_multiple_config("test", duplicated_list)
            ContentAnalyzer(analyzer_config).fit()

        # Case 3: the duplicate is appended one config at a time.
        with self.assertRaises(ValueError):
            analyzer_config = ItemAnalyzerConfig(
                JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            analyzer_config.add_single_config("test", first_config)
            analyzer_config.add_single_config("test", second_config)
            ContentAnalyzer(analyzer_config).fit()
コード例 #3
0
    def test_fit_export_json(self):
        """fit() with export_json=True writes a readable contents.json.

        Each of the 20 exported contents must expose one representation id
        per configured FieldConfig: 'Plot#0', 'Plot#1' and 'imdbRating#0'.
        """
        analyzer_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=self.out_dir,
            export_json=True
        )
        analyzer_config.add_single_config('Plot', FieldConfig(OriginalData()))
        analyzer_config.add_single_config('Plot', FieldConfig(SkLearnTfIdf()))
        analyzer_config.add_single_config('imdbRating', FieldConfig())

        ContentAnalyzer(analyzer_config).fit()

        json_path = os.path.join(self.out_dir, 'contents.json')
        self.assertTrue(os.path.isfile(json_path))

        exported_contents = list(JSONFile(json_path))
        self.assertEqual(len(exported_contents), 20)
        for exported in exported_contents:
            self.assertIn('Plot#0', exported)
            self.assertIn('Plot#1', exported)
            self.assertIn('imdbRating#0', exported)

    # def doCleanups(self) -> None:
    #     if os.path.isdir(self.out_dir):
    #         shutil.rmtree(self.out_dir)
コード例 #4
0
 def test_save(self):
     """Smoke test for save() on a fitted LSA learner (currently skipped)."""
     self.skipTest("_")
     preprocessor = NLTK(stopwords_removal=True)
     field_names = ["Plot"]
     # The dataset location depends on the working directory; retry with
     # the repository-relative path when the first attempt fails.
     try:
         source = JSONFile("datasets/movies_info_reduced.json")
         learner = GensimLatentSemanticAnalysis(source, preprocessor,
                                                field_names)
         learner.fit()
     except FileNotFoundError:
         source = JSONFile("../../../datasets/movies_info_reduced.json")
         learner = GensimLatentSemanticAnalysis(source, preprocessor,
                                                field_names)
         learner.fit()
     learner.save()
コード例 #5
0
    def test_extract_corpus(self):
        """extract_corpus over Title and Released returns one token list per
        movie, with both fields preprocessed (stop-word removal + stemming,
        e.g. 'stori' for 'Story') and kept in source order."""
        preprocessor = NLTK(stopwords_removal=True, stemming=True)
        fields = ["Title", "Released"]
        # Expected token lists for the 20 entries of the reduced dataset.
        expected = [['jumanji', '15', 'dec', '1995'],
                    ['grumpier', 'old', 'men', '22', 'dec', '1995'],
                    ['toy', 'stori', '22', 'nov', '1995'],
                    ['father', 'bride', 'part', 'ii', '08', 'dec', '1995'],
                    ['heat', '15', 'dec', '1995'],
                    ['tom', 'huck', '22', 'dec', '1995'],
                    ['wait', 'exhal', '22', 'dec', '1995'],
                    ['sabrina', '15', 'dec', '1995'],
                    ['dracula', ':', 'dead', 'love', '22', 'dec', '1995'],
                    ['nixon', '05', 'jan', '1996'],
                    ['american', 'presid', '17', 'nov', '1995'],
                    ['goldeney', '17', 'nov', '1995'],
                    ['balto', '22', 'dec', '1995'],
                    ['cutthroat', 'island', '22', 'dec', '1995'],
                    ['casino', '22', 'nov', '1995'],
                    ['sudden', 'death', '22', 'dec', '1995'],
                    ['sens', 'sensibl', '26', 'jan', '1996'],
                    ['four', 'room', '25', 'dec', '1995'],
                    ['money', 'train', '22', 'nov', '1995'],
                    [
                        'ace', 'ventura', ':', 'natur', 'call', '10', 'nov',
                        '1995'
                    ]]
        file_path = os.path.join(THIS_DIR,
                                 "../../../datasets/movies_info_reduced.json")
        src = JSONFile(file_path)
        learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
        generated = learner.extract_corpus()

        self.assertEqual(generated, expected)
コード例 #6
0
 def test_fit(self):
     """fit() on the JSON source must build a gensim Doc2Vec model."""
     data_path = os.path.join(THIS_DIR, "../../../datasets/d2v_test_data.json")
     learner = GensimDoc2Vec(source=JSONFile(file_path=data_path),
                             preprocessor=NLTK(),
                             field_list=["doc_field"])
     learner.fit()
     self.assertIsInstance(learner.model, gensim.models.doc2vec.Doc2Vec)
コード例 #7
0
    def test_decode_field_data_embedding(self):
        """A Title field produced with content_technique=None on embedding
        source data is decoded into an EmbeddingField wrapping an ndarray."""
        source_path = os.path.join(
            THIS_DIR, "../../datasets/test_decode/movies_title_embedding.json")
        test_dir = os.path.join(THIS_DIR, "../../datasets/test_decode/")

        ca_config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(source_path),
            id_field_name_list=['imdbID'],
            output_directory=test_dir + 'movies_embedding_')
        ca_config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[
                FieldRepresentationPipeline(content_technique=None),
            ]))
        ContentAnalyzer(config=ca_config).fit()

        # Scan for the output directory containing the configured prefix
        # (presumably suffixed at runtime by the analyzer — verify).
        for entry in os.listdir(test_dir):
            entry_path = os.path.join(test_dir, entry)
            if not os.path.isdir(entry_path):
                continue
            if 'movies_embedding_' not in str(entry):
                continue
            with lzma.open(os.path.join(entry_path, 'tt0113497.xz'),
                           'r') as file:
                content = pickle.load(file)
                representation = content.get_field("Title").get_representation('0')
                self.assertIsInstance(representation, EmbeddingField)
                self.assertIsInstance(representation.value, np.ndarray)
            break
コード例 #8
0
    def test_create_content_tfidf(self):
        """A Title field processed with SkLearnTfIdf is serialized as a
        FeaturesBagField whose value is a dict of term weights."""
        ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id='imdbID',
            output_directory="movielens_test_tfidf",
        )
        ca_config.add_multiple_config(
            field_name='Title', config_list=[FieldConfig(SkLearnTfIdf())])

        ContentAnalyzer(ca_config).fit()

        # Locate the produced output directory and inspect one content.
        for entry in os.listdir(THIS_DIR):
            entry_path = os.path.join(THIS_DIR, entry)
            if os.path.isdir(entry_path) and 'movielens_test_tfidf' in str(entry):
                with lzma.open(os.path.join(entry_path, 'tt0113497.xz'),
                               'r') as file:
                    content = pickle.load(file)
                    title_repr = content.get_field("Title")[0]
                    self.assertIsInstance(title_repr, FeaturesBagField)
                    self.assertIsInstance(title_repr.value, dict)
                break
コード例 #9
0
    def test_create_content_embedding(self):
        """A Title field processed with a Gensim word-embedding technique is
        serialized as an EmbeddingField wrapping a numpy ndarray."""
        ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory="movielens_test_embedding",
        )
        ca_config.add_multiple_config(
            field_name='Title',
            config_list=[FieldConfig(
                    WordEmbeddingTechnique(Gensim('glove-twitter-25')),
                    NLTK(lemmatization=True, stopwords_removal=True))])

        ContentAnalyzer(ca_config).fit()

        # Locate the produced output directory and inspect one content.
        for entry in os.listdir(THIS_DIR):
            entry_path = os.path.join(THIS_DIR, entry)
            if os.path.isdir(entry_path) and 'movielens_test_embedding' in str(entry):
                with lzma.open(os.path.join(entry_path, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)
                    title_repr = content.get_field("Title")[0]
                    self.assertIsInstance(title_repr, EmbeddingField)
                    self.assertIsInstance(title_repr.value, np.ndarray)
                break
    def test_produce_content(self):
        """produce_content on Title yields one SimpleField per record of the
        reduced movie dataset (20 entries)."""
        produced = self.technique.produce_content("Title", [],
                                                  JSONFile(file_path))

        self.assertEqual(len(produced), 20)
        self.assertIsInstance(produced[0], SimpleField)
コード例 #11
0
 def test_fit(self):
     """fit() on the Plot field must build a gensim LSI model."""
     data_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     learner = GensimLatentSemanticAnalysis(JSONFile(data_path),
                                            NLTK(stopwords_removal=True),
                                            ["Plot"])
     learner.fit()
     self.assertIsInstance(learner.model, gensim.models.lsimodel.LsiModel)
コード例 #12
0
    def test_produce_content(self):
        """SkLearnTfIdf yields one FeaturesBagField per source record."""
        tfidf_technique = SkLearnTfIdf()

        produced = tfidf_technique.produce_content("Title", [],
                                                   JSONFile(file_path))

        self.assertEqual(len(produced), 20)
        self.assertIsInstance(produced[0], FeaturesBagField)
コード例 #13
0
 def test_fit(self):
     """fit() must build a gensim random-projection (RP) model."""
     data_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     learner = GensimRandomIndexing(JSONFile(data_path), NLTK(),
                                    ['Genre', 'Plot'])
     learner.fit()
     self.assertIsInstance(learner.model, gensim.models.rpmodel.RpModel)
コード例 #14
0
    def test_produce_content(self):
        """PyWSDSynsetDocumentFrequency yields one FeaturesBagField per
        source record."""
        synset_technique = PyWSDSynsetDocumentFrequency()

        produced = synset_technique.produce_content("Plot", [],
                                                    JSONFile(file_path))

        self.assertEqual(len(produced), 20)
        self.assertIsInstance(produced[0], FeaturesBagField)
コード例 #15
0
 def test_fit(self):
     """fit() must build a gensim FastText model."""
     data_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     learner = GensimFastText(source=JSONFile(data_path),
                              preprocessor=NLTK(),
                              field_list=['Title', 'Year', 'Genre'])
     learner.fit()
     self.assertIsInstance(learner.model, gensim.models.fasttext.FastText)
    def test_produce_content_list(self):
        """A field holding a list — either as a stringified literal or as an
        actual JSON array — is decoded into a SimpleField whose value is a
        list."""
        for raw_field in ("['52ciao', '78999stringa']",
                          ['52ciao', '78999stringa']):
            with open(self.file_name, 'w') as f:
                json.dump([{"field": raw_field}], f)

            result = self.technique.produce_content("field", [],
                                                    JSONFile(self.file_name))

            self.assertIsInstance(result[0], SimpleField)
            self.assertIsInstance(result[0].value, list)
    def test_produce_content_float(self):
        """A field holding a float — either as a string or as a JSON number —
        is decoded into a SimpleField whose value is a float."""
        for raw_field in ('50.23', 50.23):
            with open(self.file_name, 'w') as f:
                json.dump([{"field": raw_field}], f)

            result = self.technique.produce_content("field", [],
                                                    JSONFile(self.file_name))

            self.assertIsInstance(result[0], SimpleField)
            self.assertIsInstance(result[0].value, float)
コード例 #18
0
 def test_fit(self):
     """fit() must build a gensim Word2Vec model."""
     data_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     learner = GensimWord2Vec(source=JSONFile(data_path),
                              preprocessor=NLTK(),
                              field_list=['Title', 'Year', 'Genre'])
     learner.fit()
     self.assertIsInstance(learner.model, gensim.models.word2vec.Word2Vec)
コード例 #19
0
    def test_fit(self):
        """Fitting must learn embeddings and persist a .model file."""
        # NOTE(review): os.path.join discards THIS_DIR because the second
        # component starts with '/', so the model lands at '/model_test_Lsa'.
        # Preserved as-is to keep behavior identical — confirm intended.
        model_path = os.path.join(THIS_DIR, "/model_test_Lsa")
        lsa_learner = GensimLatentSemanticAnalysis(model_path, True)
        lsa_learner.fit(source=JSONFile(file_path),
                        field_list=["Plot", "Genre"],
                        preprocessor_list=[NLTK()])
        model_path += ".model"

        # The embedding vector for a corpus word must be non-zero somewhere.
        self.assertEqual(lsa_learner.get_embedding("ace").any(), True)
        self.assertEqual(pl.Path(model_path).resolve().is_file(), True)
コード例 #20
0
    def test_fit(self):
        """Smoke test: fitting random indexing on the movie dataset must run
        without raising."""
        # Probe for the dataset relative to the test directory first, then
        # fall back to the repository-root-relative path.
        file_path = '../../../datasets/movies_info_reduced.json'
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = 'datasets/movies_info_reduced.json'

        learner = GensimRandomIndexing(JSONFile(file_path), NLTK(),
                                       ['Genre', 'Plot'])
        learner.fit()
コード例 #21
0
    def test_produce_content(self, mocked):
        """With the BabelPy client mocked to return a single entity, the
        linking technique yields one FeaturesBagField per source record."""
        mocked.return_value.entities = [{'babelSynsetID': 123,
                                         'globalScore': 0.0}]

        linking_technique = BabelPyEntityLinking()

        produced = linking_technique.produce_content("Title", [],
                                                     JSONFile(file_path))

        self.assertEqual(len(produced), 20)
        self.assertIsInstance(produced[0], FeaturesBagField)
コード例 #22
0
    def test_produce_content(self):
        """After refactoring the dataset for the Plot field, the tf-idf
        weight of a very common term ('the') must stay below 0.15."""
        tfidf_technique = SkLearnTfIdf()
        tfidf_technique.field_need_refactor = "Plot"
        tfidf_technique.pipeline_need_refactor = str(1)
        tfidf_technique.processor_list = [NLTK()]
        tfidf_technique.dataset_refactor(JSONFile(file_path), ["imdbID"])

        features_bag = tfidf_technique.produce_content("test", "tt0113497",
                                                       "Plot")
        self.assertLess(features_bag.value['the'], 0.15)
コード例 #23
0
 def test_save(self):
     """After fit(), save() must run and the model remains a Word2Vec."""
     data_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     learner = GensimWord2Vec(JSONFile(data_path),
                              NLTK(stopwords_removal=True), ["Plot"])
     learner.fit()
     learner.save()
     self.assertIsInstance(learner.model, gensim.models.word2vec.Word2Vec)
     """
コード例 #24
0
    def test_iter(self):
        """Iterating a JSONFile must yield the source records as dicts, in
        file order; the first three records are compared field-by-field."""
        # Probe for the dataset relative to the test directory first, then
        # fall back to the repository-root-relative path.
        filepath = '../../datasets/movies_info_reduced.json'
        try:
            with open(filepath):
                pass
        except FileNotFoundError:
            filepath = 'datasets/movies_info_reduced.json'

        csv = JSONFile(filepath)
        my_iter = iter(csv)
        # Expected first three records of the reduced movie dataset.
        d1 = {"Title": "Jumanji", "Year": "1995", "Rated": "PG", "Released": "15 Dec 1995", "Runtime": "104 min",
              "Genre": "Adventure, Family, Fantasy", "Director": "Joe Johnston",
              "Writer": "Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)",
              "Actors": "Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce",
              "Plot": "After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.",
              "Language": "English, French", "Country": "USA", "Awards": "4 wins & 9 nominations.",
              "Poster": "https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg",
              "Ratings": [{"Source": "Internet Movie Database", "Value": "6.9/10"},
                          {"Source": "Rotten Tomatoes", "Value": "53%"}, {"Source": "Metacritic", "Value": "39/100"}],
              "Metascore": "39", "imdbRating": "6.9", "imdbVotes": "260,909", "imdbID": "tt0113497", "Type": "movie",
              "DVD": "25 Jan 2000", "BoxOffice": "N/A", "Production": "Sony Pictures Home Entertainment",
              "Website": "N/A",
              "Response": "True"}
        d2 = {"Title": "Grumpier Old Men", "Year": "1995", "Rated": "PG-13", "Released": "22 Dec 1995",
              "Runtime": "101 min",
              "Genre": "Comedy, Romance", "Director": "Howard Deutch",
              "Writer": "Mark Steven Johnson (characters), Mark Steven Johnson",
              "Actors": "Walter Matthau, Jack Lemmon, Sophia Loren, Ann-Margret",
              "Plot": "Things don't seem to change much in Wabasha County: Max and John are still fighting after 35 years, Grandpa still drinks, smokes, and chases women , and nobody's been able to catch the fabled \"Catfish Hunter\", a gigantic catfish that actually smiles at fishermen who try to snare it. Six months ago John married the new girl in town (Ariel), and people begin to suspect that Max might be missing something similar in his life. The only joy Max claims is left in his life is fishing, but that might change with the new owner of the bait shop.",
              "Language": "English, Italian, German", "Country": "USA", "Awards": "2 wins & 2 nominations.",
              "Poster": "https://m.media-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg",
              "Ratings": [{"Source": "Internet Movie Database", "Value": "6.6/10"},
                          {"Source": "Rotten Tomatoes", "Value": "17%"}, {"Source": "Metacritic", "Value": "46/100"}],
              "Metascore": "46", "imdbRating": "6.6", "imdbVotes": "21,823", "imdbID": "tt0113228", "Type": "movie",
              "DVD": "18 Nov 1997", "BoxOffice": "N/A", "Production": "Warner Home Video", "Website": "N/A",
              "Response": "True"}
        d3 = {"Title": "Toy Story", "Year": "1995", "Rated": "G", "Released": "22 Nov 1995", "Runtime": "81 min",
              "Genre": "Animation, Adventure, Comedy, Family, Fantasy", "Director": "John Lasseter",
              "Writer": "John Lasseter (original story by), Pete Docter (original story by), Andrew Stanton (original story by), Joe Ranft (original story by), Joss Whedon (screenplay by), Andrew Stanton (screenplay by), Joel Cohen (screenplay by), Alec Sokolow (screenplay by)",
              "Actors": "Tom Hanks, Tim Allen, Don Rickles, Jim Varney",
              "Plot": "A little boy named Andy loves to be in his room, playing with his toys, especially his doll named \"Woody\". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy's family moving, and what Woody does not know is about Andy's birthday party. Woody does not realize that Andy's mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy's new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.",
              "Language": "English", "Country": "USA",
              "Awards": "Nominated for 3 Oscars. Another 23 wins & 17 nominations.",
              "Poster": "https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg",
              "Ratings": [{"Source": "Internet Movie Database", "Value": "8.3/10"},
                          {"Source": "Rotten Tomatoes", "Value": "100%"}, {"Source": "Metacritic", "Value": "95/100"}],
              "Metascore": "95", "imdbRating": "8.3", "imdbVotes": "761,649", "imdbID": "tt0114709", "Type": "movie",
              "DVD": "20 Mar 2001", "BoxOffice": "N/A", "Production": "Buena Vista",
              "Website": "http://www.disney.com/ToyStory", "Response": "True"}

        self.assertDictEqual(next(my_iter), d1)
        self.assertDictEqual(next(my_iter), d2)
        self.assertDictEqual(next(my_iter), d3)
コード例 #25
0
    def test_fit(self):
        """Smoke test: fitting Doc2Vec on the d2v dataset must run without
        raising."""
        # Probe for the dataset relative to the repository root first, then
        # fall back to the test-directory-relative path.
        path = "datasets/d2v_test_data.json"
        try:
            with open(path):
                pass
        except FileNotFoundError:
            path = "../../../datasets/d2v_test_data.json"

        learner = GensimDoc2Vec(source=JSONFile(file_path=path),
                                preprocessor=NLTK(),
                                field_list=["doc_field"])
        learner.fit()
コード例 #26
0
    def test_produce_content_str(self):
        """Each string-configured embedding technique must resolve its
        embedding_source to the right class and yield one EmbeddingField per
        record of the reduced dataset.

        Table-driven: (technique, expected source class, preprocessors).
        """
        self.skipTest("Test requires internet but is too complex to be mocked")

        cases = [
            (WordEmbeddingTechnique('glove-twitter-25'), Gensim, [NLTK()]),
            (SentenceEmbeddingTechnique('paraphrase-distilroberta-base-v1'),
             Sbert, [NLTK()]),
            (FromWordsDocumentEmbeddingTechnique('glove-twitter-25',
                                                 Centroid()), Gensim, []),
            (FromSentencesDocumentEmbeddingTechnique(
                'paraphrase-distilroberta-base-v1', Centroid()), Sbert,
             [NLTK()]),
            (FromWordsSentenceEmbeddingTechnique('glove-twitter-25',
                                                 Centroid()), Gensim, []),
        ]

        for technique, expected_source_cls, preprocessors in cases:
            self.assertIsInstance(technique.embedding_source,
                                  expected_source_cls)
            embedding_list = technique.produce_content("Plot", preprocessors,
                                                       JSONFile(file_path))
            self.assertEqual(len(embedding_list), 20)
            self.assertIsInstance(embedding_list[0], EmbeddingField)
コード例 #27
0
    def test_fit(self):
        """Smoke test: fitting FastText on the movie dataset must run
        without raising."""
        # Probe for the dataset relative to the test directory first, then
        # fall back to the repository-root-relative path.
        file_path = '../../../datasets/movies_info_reduced.json'
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = 'datasets/movies_info_reduced.json'

        learner = GensimFastText(source=JSONFile(file_path),
                                 preprocessor=NLTK(),
                                 field_list=['Title', 'Year', 'Genre'])
        learner.fit()
コード例 #28
0
 def test_create_content(self):
     """End-to-end: analyze the Plot field with BabelPy entity linking
     through the legacy pipeline-based configuration API."""
     # NOTE(review): this glob-style path is computed but never used below —
     # kept to preserve the original behavior exactly.
     file_path_content_analyzer = os.path.join(
         THIS_DIR, "../../test/content_analyzer/movielens_test*")

     plot_config = FieldConfig(None)
     plot_config.append_pipeline(
         FieldRepresentationPipeline(BabelPyEntityLinking()))

     ca_config = ContentAnalyzerConfig(
         'ITEM', JSONFile(file_path), ["imdbID"], "movielens_test")
     ca_config.append_field_config("Plot", plot_config)
     ContentAnalyzer(ca_config).fit()
     """
コード例 #29
0
    def test_fit(self):
        """Smoke test: fitting LSA on the Plot field must run without
        raising."""
        # Probe for the dataset relative to the repository root first, then
        # fall back to the test-directory-relative path.
        file_path = "datasets/movies_info_reduced.json"
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = "../../../datasets/movies_info_reduced.json"

        learner = GensimLatentSemanticAnalysis(JSONFile(file_path),
                                               NLTK(stopwords_removal=True),
                                               ["Plot"])
        learner.fit()
コード例 #30
0
    def test_produce_content(self):
        """Whoosh tf-idf: the weight of 'years' for movie tt0113497's Plot
        must match the known value; an AttributeError while building or
        reading the feature bag fails the test explicitly."""
        try:
            whoosh_technique = WhooshTfIdf()
            whoosh_technique.field_need_refactor = "Plot"
            whoosh_technique.pipeline_need_refactor = str(1)
            whoosh_technique.processor_list = [NLTK()]
            whoosh_technique.dataset_refactor(JSONFile(file_path), ["imdbID"])

            features_bag = whoosh_technique.produce_content(
                "test", "tt0113497", "Plot")
            self.assertEqual(features_bag.value['years'], 0.6989700043360189)
        except AttributeError:
            self.fail("Couldn't load feature bag!")