Beispiel #1
0
    def test_iter_no_header(self):
        """Iterate a header-less CSV and compare every row it yields.

        With ``has_header=False`` the expected rows are keyed by the column
        position rendered as a string ('0' .. '6'). The iterator must be
        exhausted after the sixth row.
        """
        source = CSVFile(self.filepath_no_header, has_header=False)

        expected_rows = [
            {'0': '01', '1': 'a', '2': '0.2333333333333333', '3': '1234567', '4': 'not so good',
             '5': 'I expected more from this product', '6': '2.0'},
            {'0': '01', '1': 'b', '2': '0.8333333333333334', '3': '1234567', '4': 'perfect',
             '5': 'I love this product', '6': '5.0'},
            {'0': '01', '1': 'c', '2': '0.8666666666666667', '3': '1234567', '4': 'awesome',
             '5': 'The perfect gift for my darling', '6': '4.0'},
            {'0': '02', '1': 'a', '2': '-0.3666666666666667', '3': '1234567', '4': 'a disaster',
             '5': 'Too much expensive ', '6': '1.0'},
            {'0': '02', '1': 'c', '2': '0.6', '3': '1234567', '4': 'really good',
             '5': 'A good compromise', '6': '3.5'},
            {'0': '03', '1': 'b', '2': '0.6666666666666666', '3': '1234567', '4': 'Awesome',
             '5': '', '6': '5.0'},
        ]

        rows = iter(source)

        # Consume exactly as many rows as are expected before checking exhaustion
        actual_rows = [next(rows) for _ in expected_rows]

        # No further row may be available
        with self.assertRaises(StopIteration):
            next(rows)

        for expected, actual in zip(expected_rows, actual_rows):
            self.assertDictEqual(expected, actual)
    def test_graph_creation_exo_missing(self):
        """Build NXFullGraph with exo representations/properties that do not exist.

        For every configuration the graph must still contain user and item
        nodes, while the set of property nodes must be empty.
        """
        # Import ratings as DataFrame
        importer = RatingsImporter(
            source=CSVFile(ratings_filename),
            from_id_column='user_id',
            to_id_column='item_id',
            score_column='points',
            timestamp_column='timestamp',
            score_processor=NumberNormalizer()
        )
        ratings_frame = importer.import_ratings()

        missing_exo_configs = (
            # Non-existent exo_properties only
            {'item_exo_properties': ['asdds', 'dsdds'],
             'user_exo_properties': ['vvvv']},
            # Non-existent exo_representations only
            {'item_exo_representation': "asdsa",
             'user_exo_representation': "dsdssd"},
            # Non-existent exo_representations together with non-existent exo_properties
            {'user_exo_representation': 'not_exist',
             'item_exo_representation': 'not_Exist2',
             'item_exo_properties': ["asdsa"],
             'user_exo_properties': ["dsdssd"]},
        )

        for exo_kwargs in missing_exo_configs:
            g = NXFullGraph(
                source_frame=ratings_frame,
                item_contents_dir=movies_dir,
                user_contents_dir=user_dir,
                **exo_kwargs
            )

            # Simple checks to make sure the graph is created, without any property node
            self.assertGreater(len(g.user_nodes), 0)
            self.assertGreater(len(g.item_nodes), 0)
            self.assertEqual(len(g.property_nodes), 0)
Beispiel #3
0
    def test_graph_creation(self):
        """Build NXFullGraph from the imported ratings in several configurations.

        Covers: exo properties without a representation, exo representations
        without properties, and the ratings frame alone.
        """
        def check_users_and_items(graph):
            # Simple checks just to make sure the graph is created
            self.assertGreater(len(graph.user_nodes), 0)
            self.assertGreater(len(graph.item_nodes), 0)

        # Import ratings as DataFrame
        ratings_frame = RatingsImporter(
            source=CSVFile(ratings_filename),
            rating_configs=[
                RatingsFieldConfig(field_name='points',
                                   processor=NumberNormalizer(min_=1, max_=5))
            ],
            from_field_name='user_id',
            to_field_name='item_id',
            timestamp_field_name='timestamp',
        ).import_ratings()

        # No representation is set: the properties 'producer' and 'starring'
        # are requested from all exo representations, since several
        # representations can contain the same properties
        by_properties = NXFullGraph(
            source_frame=ratings_frame,
            item_contents_dir=movies_dir,
            user_contents_dir=user_dir,
            item_exo_properties=['producer', 'starring'],
            # It's the column in the users DAT which identifies the gender
            user_exo_properties=['1']
        )
        check_users_and_items(by_properties)
        self.assertGreater(len(by_properties.property_nodes), 0)

        # No properties are set: ALL exo properties of representation "0" are retrieved
        by_representation = NXFullGraph(
            source_frame=ratings_frame,
            item_contents_dir=movies_dir,
            user_contents_dir=user_dir,
            item_exo_representation="0",
            user_exo_representation="0"
        )
        check_users_and_items(by_representation)
        self.assertGreater(len(by_representation.property_nodes), 0)

        # Only the ratings frame is given: no property node is expected
        ratings_only = NXFullGraph(ratings_frame)
        check_users_and_items(ratings_only)
        self.assertEqual(len(ratings_only.property_nodes), 0)
Beispiel #4
0
    def test_graph_creation(self):
        """Build NXTripartiteGraph from the imported ratings in several configurations.

        Any configuration that names an exo representation and/or exo
        properties must yield property nodes; the ratings frame alone must not.
        """
        # Import ratings as DataFrame
        ratings_frame = RatingsImporter(
            source=CSVFile(ratings_filename),
            rating_configs=[
                RatingsFieldConfig(field_name='points',
                                   processor=NumberNormalizer(min_=1, max_=5))
            ],
            from_field_name='user_id',
            to_field_name='item_id',
            timestamp_field_name='timestamp',
        ).import_ratings()

        exo_configs = (
            # Property 'starring' taken from representation '0'
            {'item_exo_representation': "0",
             'item_exo_properties': ['starring']},
            # Only the exo representation
            {'item_exo_representation': "0"},
            # Only the exo properties
            {'item_exo_properties': ['starring']},
        )

        for exo_kwargs in exo_configs:
            g = NXTripartiteGraph(ratings_frame, movies_dir, **exo_kwargs)

            # Simple checks just to make sure the graph is created
            self.assertGreater(len(g.user_nodes), 0)
            self.assertGreater(len(g.item_nodes), 0)
            self.assertGreater(len(g.property_nodes), 0)

        # Only the ratings frame is given: no property node is expected
        g = NXTripartiteGraph(ratings_frame)

        self.assertGreater(len(g.user_nodes), 0)
        self.assertGreater(len(g.item_nodes), 0)
        self.assertEqual(len(g.property_nodes), 0)
    def test_all(self):
        """Fit an EvalModel configured with a broad list of metrics.

        The test makes no assertion on the outcome: it passes as long as
        building the model and calling ``fit()`` raise no exception.
        """
        ratings_filename = os.path.join(contents_path, '..', 'datasets',
                                        'examples', 'new_ratings.csv')

        ratings_frame = RatingsImporter(
            CSVFile(ratings_filename)).import_ratings()

        rs = ContentBasedRS(
            LinearPredictor(
                {"Plot": ['tfidf', 'embedding']},
                SkLinearRegression(),
            ), ratings_frame, items_dir)

        # The catalog is made of the names (extension stripped) of the regular
        # files in items_dir whose name ends with 'xz'
        catalog = {
            os.path.splitext(f)[0] for f in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, f)) and f.endswith('xz')
        }

        em = EvalModel(rs,
                       KFoldPartitioning(),
                       metric_list=[
                           Precision(sys_average='micro'),
                           PrecisionAtK(1, sys_average='micro'),
                           RPrecision(),
                           Recall(),
                           RecallAtK(3),
                           FMeasure(1, sys_average='macro'),
                           FMeasureAtK(2, beta=1, sys_average='micro'),
                           NDCG(),
                           NDCGAtK(3),
                           MRR(),
                           MRRAtK(5),
                           Correlation('pearson', top_n=5),
                           Correlation('kendall', top_n=3),
                           Correlation('spearman', top_n=4),
                           MAE(),
                           MSE(),
                           RMSE(),
                           CatalogCoverage(catalog),
                           CatalogCoverage(catalog, k=2),
                           CatalogCoverage(catalog, top_n=3),
                           GiniIndex(),
                           GiniIndex(top_n=3),
                           DeltaGap({
                               'primo': 0.5,
                               'secondo': 0.5
                           })
                       ],
                       methodology=TestItemsMethodology())

        # The return value is not inspected; only a clean run is required
        em.fit()
Beispiel #6
0
    def test_graph_creation(self):
        """Build NXTripartiteGraph from the imported ratings in several configurations.

        The exo representation is addressed both as ``0`` and as ``"dbpedia"``.
        Graphs built with a representation and/or properties must contain
        property nodes; the graph built from the ratings alone must not.
        """
        def check_users_and_items(graph):
            # Simple checks just to make sure the graph is created
            self.assertGreater(len(graph.user_nodes), 0)
            self.assertGreater(len(graph.item_nodes), 0)

        # Import ratings as DataFrame
        importer = RatingsImporter(
            source=CSVFile(ratings_filename),
            from_id_column='user_id',
            to_id_column='item_id',
            score_column='points',
            timestamp_column='timestamp',
            score_processor=NumberNormalizer()
        )
        ratings_frame = importer.import_ratings()

        # Property 'starring' taken from representation 0 ('dbpedia')
        g = NXTripartiteGraph(
            ratings_frame,
            movies_dir,
            item_exo_representation=0,
            item_exo_properties=['starring']
        )
        check_users_and_items(g)
        self.assertGreater(len(g.property_nodes), 0)

        # Only the exo representation is specified
        g = NXTripartiteGraph(
            ratings_frame,
            movies_dir,
            item_exo_representation="dbpedia"
        )
        check_users_and_items(g)
        self.assertGreater(len(g.property_nodes), 0)

        # Only the exo properties are specified
        g = NXTripartiteGraph(
            ratings_frame,
            movies_dir,
            item_exo_properties=['starring']
        )
        check_users_and_items(g)
        self.assertGreater(len(g.property_nodes), 0)

        # Only the ratings frame is given: no property node is expected
        g = NXTripartiteGraph(ratings_frame)
        check_users_and_items(g)
        self.assertEqual(len(g.property_nodes), 0)
Beispiel #7
0
    def test_iter(self):
        """Check the first three rows yielded when iterating the reduced movies CSV.

        The dataset path is tried relative to the test directory first and,
        when that file cannot be opened, relative to the repository root.
        """
        filepath = '../../datasets/movies_info_reduced.csv'
        try:
            # Opening the file is only a probe for its existence
            with open(filepath):
                pass
        except FileNotFoundError:
            filepath = 'datasets/movies_info_reduced.csv'

        rows = iter(CSVFile(filepath))

        expected_rows = (
            {"Title": "Jumanji", "Year": "1995", "Rated": "PG", "Released": "15 Dec 1995", "Runtime": "104 min",
             "Genre": "Adventure, Family, Fantasy", "Director": "Joe Johnston",
             "Writer": "Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)",
             "Actors": "Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce",
             "Plot": "After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.",
             "Language": "English, French", "Country": "USA", "Awards": "4 wins & 9 nominations.",
             "Poster": "https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg",
             "Metascore": "39", "imdbRating": "6.9", "imdbVotes": "260,909", "imdbID": "tt0113497", "Type": "movie",
             "DVD": "25 Jan 2000", "BoxOffice": "N/A", "Production": "Sony Pictures Home Entertainment",
             "Website": "N/A",
             "Response": "True"},
            {"Title": "Grumpier Old Men", "Year": "1995", "Rated": "PG-13", "Released": "22 Dec 1995",
             "Runtime": "101 min",
             "Genre": "Comedy, Romance", "Director": "Howard Deutch",
             "Writer": "Mark Steven Johnson (characters), Mark Steven Johnson",
             "Actors": "Walter Matthau, Jack Lemmon, Sophia Loren, Ann-Margret",
             "Plot": "Things don't seem to change much in Wabasha County: Max and John are still fighting after 35 years, Grandpa still drinks, smokes, and chases women , and nobody's been able to catch the fabled \"Catfish Hunter\", a gigantic catfish that actually smiles at fishermen who try to snare it. Six months ago John married the new girl in town (Ariel), and people begin to suspect that Max might be missing something similar in his life. The only joy Max claims is left in his life is fishing, but that might change with the new owner of the bait shop.",
             "Language": "English, Italian, German", "Country": "USA", "Awards": "2 wins & 2 nominations.",
             "Poster": "https://m.media-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg",
             "Metascore": "46", "imdbRating": "6.6", "imdbVotes": "21,823", "imdbID": "tt0113228", "Type": "movie",
             "DVD": "18 Nov 1997", "BoxOffice": "N/A", "Production": "Warner Home Video", "Website": "N/A",
             "Response": "True"},
            {"Title": "Toy Story", "Year": "1995", "Rated": "G", "Released": "22 Nov 1995", "Runtime": "81 min",
             "Genre": "Animation, Adventure, Comedy, Family, Fantasy", "Director": "John Lasseter",
             "Writer": "John Lasseter (original story by), Pete Docter (original story by), Andrew Stanton (original story by), Joe Ranft (original story by), Joss Whedon (screenplay by), Andrew Stanton (screenplay by), Joel Cohen (screenplay by), Alec Sokolow (screenplay by)",
             "Actors": "Tom Hanks, Tim Allen, Don Rickles, Jim Varney",
             "Plot": "A little boy named Andy loves to be in his room, playing with his toys, especially his doll named \"Woody\". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy's family moving, and what Woody does not know is about Andy's birthday party. Woody does not realize that Andy's mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy's new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.",
             "Language": "English", "Country": "USA",
             "Awards": "Nominated for 3 Oscars. Another 23 wins & 17 nominations.",
             "Poster": "https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg",
             "Metascore": "95", "imdbRating": "8.3", "imdbVotes": "761,649", "imdbID": "tt0114709", "Type": "movie",
             "DVD": "20 Mar 2001", "BoxOffice": "N/A", "Production": "Buena Vista",
             "Website": "http://www.disney.com/ToyStory", "Response": "True"},
        )

        # Rows are compared in file order, one fetch per expected dictionary
        for expected in expected_rows:
            self.assertDictEqual(next(rows), expected)
from orange_cb_recsys.evaluation.ranking_metrics import NDCG, Correlation
from orange_cb_recsys.recsys import CosineSimilarity, ClassifierRecommender
from orange_cb_recsys.recsys.ranking_algorithms.classifier import SVM
from orange_cb_recsys.recsys.config import RecSysConfig

# Every path below is anchored to the directory containing this file, so it
# does not depend on the working directory the tests are launched from.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
contents_path = os.path.join(THIS_DIR, "../../contents")
datasets_path = os.path.join(THIS_DIR, "../../datasets")
ratings_filename = os.path.join(datasets_path, "examples/new_ratings.csv")
# Directories of the example user and item contents
users_dir = os.path.join(contents_path,
                         "examples/ex_1/users_1600355755.1935306")
items_dir = os.path.join(contents_path,
                         "examples/ex_1/movies_1600355972.49884")

# Ratings are imported once, when this module is imported.
# The 'points' field is passed through NumberNormalizer(min_=1, max_=5);
# NOTE(review): presumably this rescales scores from the 1-5 range - confirm
# against the NumberNormalizer implementation.
t_ratings = RatingsImporter(
    source=CSVFile(ratings_filename),
    rating_configs=[
        RatingsFieldConfig(field_name='points',
                           processor=NumberNormalizer(min_=1, max_=5))
    ],
    from_field_name='user_id',
    to_field_name='item_id',
    timestamp_field_name='timestamp',
).import_ratings()


class TestRankingEvalModel(TestCase):
    def test_fit(self):

        recsys_config = RecSysConfig(users_directory=users_dir,
                                     items_directory=items_dir,
Beispiel #9
0
import lzma
import pandas as pd

# old_id -> new_id, matched by querying on the title
from orange_cb_recsys.content_analyzer.raw_information_source import DATFile, JSONFile, CSVFile

# Input / output dataset locations (relative paths)
movies_info_filename = '../../../datasets/movies_info.json'
movies_filename = '../../../datasets/ml-1m/movies.csv'

ratings_filename = '../../../datasets/ml-1m/ratings.dat'
new_ratings_filename = '../../../datasets/new_ratings_full.csv'

# Raw sources, one per dataset
movies_info = JSONFile(movies_info_filename)
movies = CSVFile(movies_filename)
ratings = DATFile(ratings_filename)

# dict1: "<Title> (<Year>)" -> imdbID, built from the movies info source
dict1 = {}
for film1 in movies_info:
    imdb_id, title, year = film1['imdbID'], film1['Title'], film1['Year']
    print('imdbID: {} |Title: {} ({})'.format(imdb_id, title, year))
    dict1['{} ({})'.format(title, year)] = imdb_id

# dict2: title -> movieId, built from the movies source
dict2 = {}
for film2 in movies:
    movie_id, movie_title = film2['movieId'], film2['title']
    print('movieId: {} |Title: {}'.format(movie_id, movie_title))
    dict2['{}'.format(movie_title)] = movie_id

# dict3: movieId -> imdbID, for every title key present in both mappings
dict3 = {}
for k in dict1:
    if k not in dict2:
        continue
    dict3[dict2[k]] = dict1[k]
Beispiel #10
0
    def test_fit(self):
        """Fit a RankingAlgEvalModel that ranks with CentroidVector on the example ratings.

        TypeError and ValueError raised while building or fitting the model
        are tolerated, so the test only fails on any other exception.
        """
        # Paths are relative to the working directory the tests are run from
        ratings_filename = 'datasets/examples/new_ratings.csv'
        t_ratings = RatingsImporter(
            source=CSVFile(ratings_filename),
            rating_configs=[
                RatingsFieldConfig(field_name='points',
                                   processor=NumberNormalizer(min_=1, max_=5))
            ],
            from_field_name='user_id',
            to_field_name='item_id',
            timestamp_field_name='timestamp',
        ).import_ratings()
        print(t_ratings)

        recsys_config = RecSysConfig(
            users_directory='contents/examples/ex_1/users_1600355755.1935306',
            items_directory='contents/examples/ex_1/movies_1600355972.49884',
            score_prediction_algorithm=None,
            ranking_algorithm=CentroidVector(item_field='Plot',
                                             field_representation='1',
                                             similarity=CosineSimilarity()),
            rating_frame=t_ratings)

        try:
            RankingAlgEvalModel(config=recsys_config,
                                partitioning=KFoldPartitioning(),
                                metric_list=[
                                    Precision(0.4),
                                    Recall(0.4),
                                    FNMeasure(1, 0.4),
                                    MRR(0.4),
                                    NDCG({
                                        0: (-1, 0),
                                        1: (0, 1)
                                    }),
                                    Correlation('pearson'),
                                    Correlation('kendall'),
                                    Correlation('spearman')
                                ]).fit()
        except (TypeError, ValueError):
            # Deliberately tolerated, as in the original test.
            # NOTE(review): a genuine failure of either type goes unnoticed
            # here - confirm that swallowing them is still intended.
            pass