コード例 #1
0
    def load_dataset(self):
        """Load the ratings, split them and populate the train/test containers.

        ``self.__load_rate`` returns the train and test frames.  The distinct
        user and item ids are then read from ``self.data`` (presumably set by
        that loader -- it is not assigned in this method).  The container
        class is picked from ``self.type``; when the type matches none of the
        three known values no container is created here.
        """
        print('loading ratings from path: ' + self.rates_file_dir)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_dir, self.split_ratio)

        ratings = self.data
        self.user_ids = ratings['User ID'].drop_duplicates().tolist()
        self.item_ids = ratings['Item ID'].drop_duplicates().tolist()

        # Choose the container class and the two arguments that follow the
        # dataset name: the ndarray container receives counts, the other two
        # receive the id lists themselves.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)

        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done.')
コード例 #2
0
    def __init__(self,
                 name='MovieLensDataset100K',
                 type=DataType.ndarray,
                 split_ratio=0.8,
                 dataset_path='../data/MovieLens/ml-100k'):
        """Load user/item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: stored on the instance for the later rating split.
            dataset_path: directory holding ``u.user``, ``u.item``, ``u.data``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'u.user')
        self.items_info_path = os.path.join(dataset_path, 'u.item')
        self.rates_file_path = os.path.join(dataset_path, 'u.data')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Number the occupations in the order the groupby reports its keys.
        occupations = self.users.groupby(by='Occupation').groups.keys()
        self.occupation2id = {
            occupation: index
            for index, occupation in enumerate(list(occupations))
        }
        self.gender2id = {'M': 0, 'F': 1}

        # Id lists are taken before 'ID' is moved into the index below.
        self.user_ids = self.users['ID'].drop_duplicates().tolist()
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        self.user_num = len(self.user_ids)
        self.item_num = len(self.item_ids)
        self.user_occupation_num = len(self.occupation2id)

        self.users = self.users.set_index('ID').sort_index()
        self.items = self.items.set_index('ID').sort_index()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)
コード例 #3
0
    def __init__(self,
                 name='BookCrossingDataset',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/Book-crossing/BX-CSV-Dump'):
        """Read metadata and ratings, then create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: stored on the instance for the later rating split.
            dataset_path: directory holding the three ``BX-*.csv`` files.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'BX-Users.csv')
        self.items_info_path = os.path.join(dataset_path, 'BX-Books.csv')
        self.rates_file_path = os.path.join(dataset_path,
                                            'BX-Book-Ratings.csv')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Ratings are read eagerly; the first line of the file is skipped and
        # the column labels are supplied here instead.
        rating_columns = ['User ID', 'Item ID', 'Rating']
        rating_types = {
            'User ID': str,
            'Item ID': str,
            'Rating': np.float16
        }
        self.data = pd.read_csv(self.rates_file_path,
                                sep=';',
                                skiprows=[0],
                                names=rating_columns,
                                dtype=rating_types,
                                encoding='latin1')
        self.user_ids = self.data['User ID'].drop_duplicates().tolist()
        self.item_ids = self.data['Item ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)
コード例 #4
0
    def __init__(self,
                 name='MovieLensDataset100K',
                 type=DataType.ndarray,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-100k'):
        """Load user/item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: stored on the instance for the later rating split.
            dataset_path: directory holding ``u.user``, ``u.item``, ``u.data``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'u.user')
        self.items_info_path = os.path.join(dataset_path, 'u.item')
        self.rates_file_path = os.path.join(dataset_path, 'u.data')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)
        self.user_ids = self.users['ID'].drop_duplicates().tolist()
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)
コード例 #5
0
    def __init__(self,
                 name='MovieLensDataset1M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-1m'):
        """Load user/item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: stored on the instance for the later rating split.
            dataset_path: directory holding ``users.dat``, ``movies.dat`` and
                ``ratings.dat``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'users.dat')
        self.items_info_path = os.path.join(dataset_path, 'movies.dat')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Id ranges are hard-coded rather than derived from the loaded files.
        self.user_ids = list(range(1, 6041))
        self.item_ids = list(range(1, 3953))

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)
コード例 #6
0
    def __init__(self,
                 name='MovieLensDataset25M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-25m'):
        """Load item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: stored on the instance for the later rating split.
            dataset_path: directory holding ``movies.csv`` and ``ratings.csv``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio
        # The user count is hard-coded; no user file is read by this method.
        self.user_num = 162541

        self.items_info_path = os.path.join(dataset_path, 'movies.csv')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.csv')

        self.items = self.__load_item_info(self.items_info_path)
        self.user_ids = list(range(1, self.user_num + 1))
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)
コード例 #7
0
class NetflixDataset():
    """Rating dataset assembled from a directory of per-id rating files.

    ``__init__`` only reads the item metadata; the rating files are read and
    split when ``load_dataset`` is called.
    """

    def __init__(self,
                 name='NetflixDataset',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/netflix-prize-data'):
        """Store the configuration and load the item metadata.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: fraction of rating rows sampled into the train set.
            dataset_path: directory holding ``movie_titles.txt`` and the
                ``training_set`` sub-directory.
        """
        self.name = name
        self.type = type
        self.items_info_path = os.path.join(dataset_path, 'movie_titles.txt')
        self.rates_file_dir = os.path.join(dataset_path, 'training_set')

        self.split_ratio = split_ratio
        self.items = self.__load_item_info(self.items_info_path)

    def load_dataset(self):
        """Read all ratings, split them and fill the train/test containers.

        Sets ``self.data``, ``self.user_ids``, ``self.item_ids`` and, when
        ``self.type`` is one of the three known ``DataType`` members,
        ``self.train_dataset`` / ``self.test_dataset``.
        """
        print('loading ratings from path: ' + self.rates_file_dir)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_dir, self.split_ratio)

        self.user_ids = self.data['User ID'].drop_duplicates().tolist()
        self.item_ids = self.data['Item ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)

        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done.')

    def __load_item_info(self, file_path):
        """Return the item table with columns ID, Year of Release, Title."""
        return pd.read_csv(file_path,
                           sep=',',
                           names=['ID', 'Year of Release', 'Title'],
                           dtype={'ID': np.uint32},
                           encoding='latin1')

    def __load_rate(self, file_dir, split_ratio):
        """Read every rating file in ``file_dir`` and split the rows.

        The integer after the underscore in each file name is stored in the
        'User ID' column; the first line of each file is skipped and the
        remaining columns are labelled 'Item ID', 'Rating', 'Timestamp'.

        NOTE(review): in the public Netflix Prize dump the training_set files
        are per movie and each row starts with a customer id.  If that is the
        layout used here, the 'User ID' and 'Item ID' labels are swapped --
        confirm before relying on them.

        Args:
            file_dir: directory containing the rating files.
            split_ratio: fraction of rows sampled into the train frame.

        Returns:
            ``(train_data, test_data)``; the full frame is kept in
            ``self.data``.
        """
        columns = ['User ID', 'Item ID', 'Rating', 'Timestamp']
        frames = []

        # sorted(): os.listdir returns names in arbitrary order, which would
        # make the seeded split below differ between machines.
        for file in sorted(os.listdir(file_dir)):
            user_id = int(file.split('.')[0].split('_')[1])
            if user_id % 100 == 0:
                print('loading file: ' + file)

            rating_data = pd.read_csv(os.path.join(file_dir, file),
                                      sep=',',
                                      skiprows=[0],
                                      encoding='latin1',
                                      names=['Item ID', 'Rating', 'Timestamp'],
                                      dtype={
                                          'Item ID': np.uint32,
                                          'Rating': np.float16
                                      })
            rating_data['User ID'] = np.uint32(user_id)
            frames.append(rating_data)

        if frames:
            # One concat instead of DataFrame.append per file (append copies
            # the whole frame each time and no longer exists in pandas 2.x).
            # ignore_index gives every row a unique label; with the per-file
            # labels repeated, the index-based test split below would drop
            # every row sharing a label with a sampled row.
            self.data = pd.concat(frames, ignore_index=True)[columns]
        else:
            self.data = pd.DataFrame(columns=columns)

        train_data = self.data.sample(frac=split_ratio, random_state=0, axis=0)
        test_data = self.data[~self.data.index.isin(train_data.index)]
        return train_data, test_data
コード例 #8
0
class PersonalityDataset():
    """Rating dataset with a per-user personality table.

    ``__init__`` reads the user table; ratings are read and split when
    ``load_dataset`` is called.
    """

    def __init__(self,
                 name='PersonalityDataset',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/Personality-isf2018'):
        """Store the configuration and load the user table.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: fraction of rating rows sampled into the train set.
            dataset_path: directory holding ``personality-data.csv`` and
                ``ratings.csv``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path,
                                            'personality-data.csv')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.csv')

        self.users = self.__load_user_info(self.users_info_path)

    def load_dataset(self):
        """Read the ratings, split them and fill the train/test containers."""
        print('loading rating from path: ' + self.rates_file_path)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_path, self.split_ratio)

        ratings = self.data
        self.user_ids = ratings['User ID'].drop_duplicates().tolist()
        self.item_ids = ratings['Item ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)

        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done.')

    def __load_user_info(self, file_path):
        """Return the user table; the file's first line is skipped."""
        columns = [
            'User ID', 'Openness', 'Agreeableness', 'Emotional Stability',
            'Conscientiousness', 'Extraversion', 'Assigned Metric',
            'Assigned Condition'
        ]
        # Twelve (movie, predicted rating) column pairs, numbered from 1.
        for slot in range(1, 13):
            columns.append('Movie_%d' % slot)
            columns.append('Predicted_rating_%d' % slot)
        columns.extend(['Is Personalized', 'Enjoy Watching'])

        return pd.read_csv(file_path,
                           sep=',',
                           skiprows=[0],
                           names=columns,
                           dtype={'User ID': str})

    def __load_rate(self, file_path, split_ratio):
        """Read the ratings into ``self.data`` and return (train, test)."""
        column_types = {
            'User ID': str,
            'Item ID': np.uint32,
            'Rating': np.float16
        }
        self.data = pd.read_csv(
            file_path,
            sep=',',
            skiprows=[0],
            names=['User ID', 'Item ID', 'Rating', 'Timestamp'],
            dtype=column_types,
        )
        train_data = self.data.sample(frac=split_ratio, random_state=0, axis=0)
        # Everything not sampled into the train frame becomes the test frame.
        held_out = ~self.data.index.isin(train_data.index)
        return train_data, self.data[held_out]
コード例 #9
0
class MovieLensDataset1M():
    """Loader for '::'-separated user, movie and rating files.

    ``__init__`` reads the metadata and creates empty train/test containers;
    ``load_dataset`` reads the ratings and fills them.
    """

    def __init__(self,
                 name='MovieLensDataset1M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-1m'):
        """Load user/item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: fraction of rating rows sampled into the train set.
            dataset_path: directory holding ``users.dat``, ``movies.dat`` and
                ``ratings.dat``.
        """
        self.name = name
        self.type = type
        self.split_ratio = split_ratio

        self.users_info_path = os.path.join(dataset_path, 'users.dat')
        self.items_info_path = os.path.join(dataset_path, 'movies.dat')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Id ranges are hard-coded rather than derived from the loaded files.
        self.user_ids = list(range(1, 6041))
        self.item_ids = list(range(1, 3953))

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)

    def load_dataset(self):
        """Read and split the ratings, then fill the two containers."""
        print('loading rating from path: ' + self.rates_file_path)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_path, self.split_ratio)
        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done')

    def __load_user_info(self, file_path):
        """Return the user table (ID, Gender, Age, Occupation, Code)."""
        columns = ['ID', 'Gender', 'Age', 'Occupation', 'Code']
        column_types = {'ID': np.uint16, 'Age': np.uint8}
        # engine='python' because the separator is longer than one character.
        return pd.read_csv(file_path,
                           sep='::',
                           names=columns,
                           dtype=column_types,
                           encoding='utf-8',
                           engine='python')

    def __load_item_info(self, file_path):
        """Return the item table (ID, Title, Tag)."""
        columns = ['ID', 'Title', 'Tag']
        return pd.read_csv(file_path,
                           sep='::',
                           names=columns,
                           dtype={'ID': np.uint16},
                           encoding='latin1',
                           engine='python')

    def __load_rate(self, file_path, split_ratio):
        """Read the rating file and return ``(train_data, test_data)``."""
        column_types = {
            'User ID': np.uint16,
            'Item ID': np.uint16,
            'Rating': np.float16,
            'Timestamp': np.uint32
        }
        ratings = pd.read_csv(
            file_path,
            sep='::',
            names=['User ID', 'Item ID', 'Rating', 'Timestamp'],
            dtype=column_types,
            engine='python')
        train_data = ratings.sample(frac=split_ratio, random_state=0, axis=0)
        # Everything not sampled into the train frame becomes the test frame.
        held_out = ~ratings.index.isin(train_data.index)
        return train_data, ratings[held_out]
コード例 #10
0
class MovieLensDataset25M():
    """Loader for comma-separated ``movies.csv`` / ``ratings.csv`` files.

    ``__init__`` reads the item metadata and creates empty train/test
    containers; ``load_dataset`` reads the ratings and fills them.
    """

    def __init__(self,
                 name='MovieLensDataset25M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-25m'):
        """Load item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: fraction of rating rows sampled into the train set.
            dataset_path: directory holding ``movies.csv`` and ``ratings.csv``.
        """
        self.name = name
        self.type = type
        self.items_info_path = os.path.join(dataset_path, 'movies.csv')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.csv')
        # The user count is hard-coded; no user file is read by this class.
        self.user_num = 162541
        self.split_ratio = split_ratio
        self.items = self.__load_item_info(self.items_info_path)
        self.user_ids = list(range(1, self.user_num + 1))
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)

    def load_dataset(self):
        """Read and split the ratings, then fill the two containers."""
        print('loading rating from path: ' + self.rates_file_path)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_path, self.split_ratio)
        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done')

    def __load_item_info(self, file_path):
        """Return the item table (ID, Title, Tag); the header line is skipped."""
        return pd.read_csv(file_path,
                           sep=',',
                           skiprows=[0],
                           names=['ID', 'Title', 'Tag'],
                           # uint32 rather than uint16: item ids are not
                           # limited to 65535 (the published ml-25m catalogue
                           # is believed to contain larger ids -- TODO confirm
                           # against the dataset README).
                           dtype={'ID': np.uint32},
                           encoding='utf-8')

    def __load_rate(self, file_path, split_ratio):
        """Read the rating file and return ``(train_data, test_data)``.

        Args:
            file_path: path of the comma-separated rating file; its first
                line is skipped.
            split_ratio: fraction of rows sampled into the train frame.
        """
        data = pd.read_csv(file_path,
                           sep=',',
                           skiprows=[0],
                           names=['User ID', 'Item ID', 'Rating', 'Timestamp'],
                           dtype={
                               # This class declares 162541 users, which does
                               # not fit in uint16 (max 65535).
                               'User ID': np.uint32,
                               # Same width as the item table's 'ID' column.
                               'Item ID': np.uint32,
                               'Rating': np.float16,
                               # int16 tops out at 32767; timestamps are
                               # presumably Unix seconds -- TODO confirm.
                               'Timestamp': np.int64
                           })
        train_data = data.sample(frac=split_ratio, random_state=0, axis=0)
        test_data = data[~data.index.isin(train_data.index)]
        return train_data, test_data
コード例 #11
0
class MovieLensDataset100K():
    """Loader for ``u.user`` / ``u.item`` / ``u.data`` files.

    ``__init__`` reads the metadata and creates empty train/test containers;
    ``load_dataset`` reads the ratings and fills them.
    """

    def __init__(self,
                 name='MovieLensDataset100K',
                 type=DataType.ndarray,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/MovieLens/ml-100k'):
        """Load user/item metadata and create empty train/test containers.

        Args:
            name: prefix used when naming the train/test containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: fraction of rating rows sampled into the train set.
            dataset_path: directory holding ``u.user``, ``u.item``, ``u.data``.
        """
        self.name = name
        self.type = type
        self.users_info_path = os.path.join(dataset_path, 'u.user')
        self.items_info_path = os.path.join(dataset_path, 'u.item')
        self.rates_file_path = os.path.join(dataset_path, 'u.data')

        self.split_ratio = split_ratio
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)
        self.user_ids = self.users['ID'].drop_duplicates().tolist()
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        # The ndarray container receives counts, the other two the id lists.
        kind = self.type
        if kind == DataType.ndarray:
            container = RatingNDArray
            extent = (len(self.user_ids), len(self.item_ids))
        elif kind == DataType.dataframe:
            container = RatingDataFrame
            extent = (self.user_ids, self.item_ids)
        elif kind == DataType.dictionary:
            container = RatingDictionary
            extent = (self.user_ids, self.item_ids)
        else:
            container = None

        if container is not None:
            stem = self.name + '_' + kind.name
            self.train_dataset = container(stem + '_trainset', *extent)
            self.test_dataset = container(stem + '_testset', *extent)

    def load_dataset(self):
        """Read and split the ratings, then fill the two containers."""
        print('loading rating from path: ' + self.rates_file_path)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_path, self.split_ratio)
        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done')

    def __load_user_info(self, file_path):
        """Return the user table (ID, Age, Gender, Occupation, Code)."""
        return pd.read_csv(file_path,
                           sep='|',
                           names=['ID', 'Age', 'Gender', 'Occupation', 'Code'],
                           dtype={
                               'ID': np.uint16,
                               'Age': np.uint8
                           },
                           encoding='utf-8')

    def __load_item_info(self, file_path):
        """Return the item table read from a '|'-separated file.

        NOTE: the labels 'Adveenture' and 'Wstern' are misspelled but are
        kept unchanged, because code elsewhere may select columns by these
        exact names.
        """
        return pd.read_csv(
            file_path,
            sep='|',
            names=[
                'ID', 'Title', 'Release date', 'Video release date', 'URL',
                'Unknown', 'Action', 'Adveenture', 'Animation', "Children's",
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                'Sci-Fi', 'Thriller', 'War', 'Wstern'
            ],
            dtype={
                'ID': np.uint16,
                # The key must match the label in ``names`` exactly; the
                # previous 'video release data' matched no column, so the
                # str dtype was never applied.
                'Video release date': str
            },
            encoding='latin1')

    def __load_rate(self, file_path, split_ratio):
        """Read the tab-separated rating file; return ``(train, test)``.

        Args:
            file_path: path of the rating file.
            split_ratio: fraction of rows sampled into the train frame.
        """
        data = pd.read_csv(file_path,
                           sep='\t',
                           names=['User ID', 'Item ID', 'Rating', 'Timestamp'])
        train_data = data.sample(frac=split_ratio, random_state=0, axis=0)
        test_data = data[~data.index.isin(train_data.index)]
        return train_data, test_data
コード例 #12
0
class BookCrossingDataset:
    """Book-Crossing ratings with a seeded train/test split.

    The constructor reads the user, book and rating CSV files located under
    ``dataset_path`` and creates the train and test rating containers, whose
    class is selected by ``type``. :meth:`load_dataset` performs the split
    and passes the two parts to those containers.
    """

    def __init__(self,
                 name='BookCrossingDataset',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='G:/dataset/RS/Book-crossing/BX-CSV-Dump'):
        """Read the CSV files and create the rating containers.

        Args:
            name: Prefix used when naming the two containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: Fraction of ratings sampled into the training part.
            dataset_path: Directory holding ``BX-Users.csv``,
                ``BX-Books.csv`` and ``BX-Book-Ratings.csv``.
        """
        self.name = name
        self.type = type

        users_csv = os.path.join(dataset_path, 'BX-Users.csv')
        books_csv = os.path.join(dataset_path, 'BX-Books.csv')
        ratings_csv = os.path.join(dataset_path, 'BX-Book-Ratings.csv')
        self.users_info_path = users_csv
        self.items_info_path = books_csv
        self.rates_file_path = ratings_csv

        self.split_ratio = split_ratio
        self.users = self.__load_user_info(users_csv)
        self.items = self.__load_item_info(books_csv)

        # The first line of the ratings file is skipped and replaced by the
        # explicit column names below.
        rating_columns = ['User ID', 'Item ID', 'Rating']
        rating_types = {
            'User ID': str,
            'Item ID': str,
            'Rating': np.float16
        }
        self.data = pd.read_csv(ratings_csv,
                                sep=';',
                                skiprows=[0],
                                names=rating_columns,
                                dtype=rating_types,
                                encoding='latin1')

        # Distinct ids in order of first appearance in the ratings table.
        distinct_users = self.data['User ID'].drop_duplicates()
        distinct_items = self.data['Item ID'].drop_duplicates()
        self.user_ids = distinct_users.tolist()
        self.item_ids = distinct_items.tolist()

        kind = self.type
        if kind == DataType.ndarray:
            # The array-backed container receives the counts, not the ids.
            self.__create_containers(RatingNDArray, len(self.user_ids),
                                     len(self.item_ids))
        elif kind == DataType.dataframe:
            self.__create_containers(RatingDataFrame, self.user_ids,
                                     self.item_ids)
        elif kind == DataType.dictionary:
            self.__create_containers(RatingDictionary, self.user_ids,
                                     self.item_ids)

    def __create_containers(self, container_cls, users, items):
        """Build the train and the test container from ``container_cls``."""
        prefix = self.name + '_' + self.type.name
        self.train_dataset = container_cls(prefix + '_trainset', users, items)
        self.test_dataset = container_cls(prefix + '_testset', users, items)

    def load_dataset(self):
        """Split the ratings and pass both parts to the containers.

        Both containers are filled with ``time_stamp=False``; the ratings
        table read in the constructor has no timestamp column.
        """
        source = self.rates_file_path
        print('loading rating from path: ' + source)
        train_part, test_part = self.__load_rate(source, self.split_ratio)
        self.train_dataset.load_dataset(train_part, time_stamp=False)
        self.test_dataset.load_dataset(test_part, time_stamp=False)
        print('Done.')

    def __load_user_info(self, file_path):
        """Read the semicolon-separated user table at ``file_path``.

        The first line is skipped; the columns are named ``User ID``
        (read as ``np.uint32``), ``Location`` and ``Age``.
        """
        column_names = ['User ID', 'Location', 'Age']
        users = pd.read_csv(file_path,
                            sep=';',
                            skiprows=[0],
                            names=column_names,
                            dtype={'User ID': np.uint32},
                            encoding='latin1')
        return users

    def __load_item_info(self, file_path):
        """Read the semicolon-separated book table at ``file_path``.

        The first line is skipped; ``Item ID`` is read as ``str``.
        """
        column_names = [
            'Item ID', 'Book Title', 'Book Author', 'Year of Publication',
            'Publisher', 'Image URL S', 'Image URL M', 'Image URL L'
        ]
        books = pd.read_csv(file_path,
                            sep=';',
                            skiprows=[0],
                            names=column_names,
                            dtype={'Item ID': str},
                            encoding='latin1')
        return books

    def __load_rate(self, file_path, split_ratio):
        """Split the ratings already held in ``self.data``.

        ``file_path`` is accepted but not read here. A fraction
        ``split_ratio`` of the rows is sampled with ``random_state=0``; the
        rows whose index was not sampled form the second part.

        Returns:
            A ``(train, test)`` tuple of DataFrames.
        """
        ratings = self.data
        train_part = ratings.sample(frac=split_ratio, random_state=0, axis=0)
        held_out = ~ratings.index.isin(train_part.index)
        return train_part, ratings[held_out]
コード例 #13
0
class MovieLensDataset100K():
    """MovieLens 100K ratings together with user and movie side information.

    The constructor reads the user and item tables under ``dataset_path``,
    builds id mappings for occupation and gender, and creates the train and
    test rating containers whose class is selected by ``type``.
    :meth:`load_dataset` reads the ratings file, splits it and passes both
    parts to the containers. :meth:`train` and :meth:`evaluate` iterate over
    the containers and attach the side information to every rating.
    """

    def __init__(self,
                 name='MovieLensDataset100K',
                 type=DataType.ndarray,
                 split_ratio=0.8,
                 dataset_path='../data/MovieLens/ml-100k'):
        """Read the side-information tables and create the containers.

        Args:
            name: Prefix used when naming the two containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: Fraction of ratings sampled into the training part.
            dataset_path: Directory holding ``u.user``, ``u.item`` and
                ``u.data``.
        """
        self.name = name
        self.type = type
        self.users_info_path = os.path.join(dataset_path, 'u.user')
        self.items_info_path = os.path.join(dataset_path, 'u.item')
        self.rates_file_path = os.path.join(dataset_path, 'u.data')

        self.split_ratio = split_ratio
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        # Occupation labels are numbered in the order of the groupby keys.
        occupations = list(
            self.users.groupby(by='Occupation').groups.keys())
        self.occupation2id = {
            occupation: index
            for index, occupation in enumerate(occupations)
        }
        self.gender2id = {'M': 0, 'F': 1}

        self.user_ids = self.users['ID'].drop_duplicates().tolist()
        self.item_ids = self.items['ID'].drop_duplicates().tolist()

        self.user_num = len(self.user_ids)
        self.item_num = len(self.item_ids)
        self.user_occupation_num = len(self.occupation2id)

        # Index both tables by id so rows can be fetched with ``.loc``.
        self.users = self.users.set_index('ID').sort_index()
        self.items = self.items.set_index('ID').sort_index()

        if self.type == DataType.ndarray:
            # The array-backed container receives the counts, not the ids.
            self.__create_containers(RatingNDArray, self.user_num,
                                     self.item_num)
        elif self.type == DataType.dataframe:
            self.__create_containers(RatingDataFrame, self.user_ids,
                                     self.item_ids)
        elif self.type == DataType.dictionary:
            self.__create_containers(RatingDictionary, self.user_ids,
                                     self.item_ids)

    def __create_containers(self, container_cls, users, items):
        """Build the train and the test container from ``container_cls``."""
        prefix = self.name + '_' + self.type.name
        self.train_dataset = container_cls(prefix + '_trainset', users, items)
        self.test_dataset = container_cls(prefix + '_testset', users, items)

    def load_dataset(self):
        """Read and split the ratings file, then fill both containers."""
        print('loading rating from path: ' + self.rates_file_path)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_path, self.split_ratio)
        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done')

    def train(self):
        """Yield one feature tuple per rating in the training container.

        Yields:
            ``(user_id, user_age_id, user_gender_id, user_occupation_id,
            item_id, item_category_one_hot, rating)``.
        """
        yield from self.__samples(self.train_dataset)

    def evaluate(self):
        """Yield one feature tuple per rating in the test container.

        Yields:
            ``(user_id, user_age_id, user_gender_id, user_occupation_id,
            item_id, item_category_one_hot, rating)``.
        """
        yield from self.__samples(self.test_dataset)

    def __samples(self, dataset):
        """Attach user and item features to each triple from ``dataset``."""
        for user_id, item_id, rating in dataset:
            # user_info -> (id, age, gender, occupation); item_info ->
            # (id, one-hot); concatenated they give the documented order.
            yield (self.user_info(user_id) + self.item_info(item_id) +
                   (rating, ))

    def user_info(self, user_id):
        """Return the encoded features of one user.

        Returns:
            ``(user_id, user_age_id, user_gender_id, user_occupation_id)``
            where the age id is the age integer-divided by ten and the other
            two ids come from ``gender2id`` and ``occupation2id``.
        """
        user_info = self.users.loc[user_id]
        user_age_id = user_info['Age'] // 10
        user_gender_id = self.gender2id[user_info['Gender']]
        user_occupation_id = self.occupation2id[user_info['Occupation']]
        return user_id, user_age_id, user_gender_id, user_occupation_id

    def item_info(self, item_id):
        """Return ``(item_id, genre_flags)`` for one item.

        ``genre_flags`` is a ``float32`` array built from every column after
        the first four of the id-indexed item row (``Title``,
        ``Release date``, ``Video release date``, ``URL``), i.e. the genre
        columns.
        """
        item_info = self.items.loc[item_id]
        # ``.iloc`` makes the positional slice explicit.
        item_category_one_hot = np.array(item_info.iloc[4:],
                                         dtype=np.float32)
        return item_id, item_category_one_hot

    def __load_user_info(self, file_path):
        """Read the pipe-separated user table at ``file_path``.

        Columns are named ``ID`` (``np.uint16``), ``Age`` (``np.uint8``),
        ``Gender``, ``Occupation`` and ``Code``.
        """
        return pd.read_csv(file_path,
                           sep='|',
                           names=['ID', 'Age', 'Gender', 'Occupation', 'Code'],
                           dtype={
                               'ID': np.uint16,
                               'Age': np.uint8
                           },
                           encoding='utf-8')

    def __load_item_info(self, file_path):
        """Read the pipe-separated item table at ``file_path``.

        Columns are ``ID``, ``Title``, ``Release date``,
        ``Video release date`` and ``URL`` followed by 19 genre columns
        (``Unknown`` through ``Western``).
        """
        return pd.read_csv(
            file_path,
            sep='|',
            # 'Adventure' and 'Western' were previously misspelled.
            names=[
                'ID', 'Title', 'Release date', 'Video release date', 'URL',
                'Unknown', 'Action', 'Adventure', 'Animation', "Children's",
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                'Sci-Fi', 'Thriller', 'War', 'Western'
            ],
            dtype={
                'ID': np.uint16,
                # The key has to equal the column label; the former key
                # 'video release data' did not match any column name.
                'Video release date': str
            },
            encoding='latin1')

    def __load_rate(self, file_path, split_ratio):
        """Read the tab-separated ratings file and split it in two.

        A fraction ``split_ratio`` of the rows is sampled with
        ``random_state=0``; the rows whose index was not sampled form the
        second part.

        Returns:
            A ``(train, test)`` tuple of DataFrames with the columns
            ``User ID``, ``Item ID``, ``Rating`` and ``Timestamp``.
        """
        data = pd.read_csv(file_path,
                           sep='\t',
                           names=['User ID', 'Item ID', 'Rating', 'Timestamp'])
        train_data = data.sample(frac=split_ratio, random_state=0, axis=0)
        test_data = data[~data.index.isin(train_data.index)]
        return train_data, test_data
コード例 #14
0
    def __init__(self,
                 name='MovieLensDataset1M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='../data/MovieLens/ml-1m'):
        """Read the side-information tables and create the containers.

        Args:
            name: Prefix used when naming the two containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: Fraction of ratings sampled into the training part.
            dataset_path: Directory holding ``users.dat``, ``movies.dat``
                and ``ratings.dat``.
        """
        self.name = name
        self.type = type
        self.users_info_path = os.path.join(dataset_path, 'users.dat')
        self.items_info_path = os.path.join(dataset_path, 'movies.dat')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

        self.split_ratio = split_ratio
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        self.gender2id = {'M': 0, 'F': 1}
        # Category label -> position, numbered in the order listed.
        categories = ('Action', 'Adventure', 'Animation', "Children's",
                      'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                      'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                      'Sci-Fi', 'Thriller', 'War', 'Western')
        self.category2id = {
            category: index
            for index, category in enumerate(categories)
        }

        self.user_occupation_num = 21

        # Index both tables by id so rows can be fetched with ``.loc``.
        self.users = self.users.set_index('ID').sort_index()
        self.items = self.items.set_index('ID').sort_index()

        # The id lists are fixed contiguous ranges, independent of the file
        # contents. An earlier computation from the 'ID' columns was always
        # overwritten by these assignments and has been removed.
        self.user_ids = list(range(1, 6041))
        self.item_ids = list(range(1, 3953))

        self.user_num = len(self.user_ids)
        self.item_num = len(self.item_ids)

        def build_pair(container_cls, users, items):
            """Return a (train, test) pair of ``container_cls`` instances."""
            prefix = self.name + '_' + self.type.name
            return (container_cls(prefix + '_trainset', users, items),
                    container_cls(prefix + '_testset', users, items))

        if self.type == DataType.ndarray:
            # The array-backed container receives the counts, not the ids.
            self.train_dataset, self.test_dataset = build_pair(
                RatingNDArray, self.user_num, self.item_num)
        elif self.type == DataType.dataframe:
            self.train_dataset, self.test_dataset = build_pair(
                RatingDataFrame, self.user_ids, self.item_ids)
        elif self.type == DataType.dictionary:
            self.train_dataset, self.test_dataset = build_pair(
                RatingDictionary, self.user_ids, self.item_ids)
コード例 #15
0
class MovieLensDataset1M():
    """MovieLens 1M ratings together with user and movie side information.

    The constructor reads the user and movie tables under ``dataset_path``
    and creates the train and test rating containers whose class is selected
    by ``type``. :meth:`load_dataset` reads the ratings file, splits it and
    passes both parts to the containers. :meth:`train` and :meth:`evaluate`
    iterate over the containers and attach the side information to every
    rating.
    """

    def __init__(self,
                 name='MovieLensDataset1M',
                 type=DataType.dictionary,
                 split_ratio=0.8,
                 dataset_path='../data/MovieLens/ml-1m'):
        """Read the side-information tables and create the containers.

        Args:
            name: Prefix used when naming the two containers.
            type: ``DataType`` member selecting the container class.
            split_ratio: Fraction of ratings sampled into the training part.
            dataset_path: Directory holding ``users.dat``, ``movies.dat``
                and ``ratings.dat``.
        """
        self.name = name
        self.type = type
        self.users_info_path = os.path.join(dataset_path, 'users.dat')
        self.items_info_path = os.path.join(dataset_path, 'movies.dat')
        self.rates_file_path = os.path.join(dataset_path, 'ratings.dat')

        self.split_ratio = split_ratio
        self.users = self.__load_user_info(self.users_info_path)
        self.items = self.__load_item_info(self.items_info_path)

        self.gender2id = {'M': 0, 'F': 1}
        self.category2id = {
            'Action': 0,
            'Adventure': 1,
            'Animation': 2,
            "Children's": 3,
            'Comedy': 4,
            'Crime': 5,
            'Documentary': 6,
            'Drama': 7,
            'Fantasy': 8,
            'Film-Noir': 9,
            'Horror': 10,
            'Musical': 11,
            'Mystery': 12,
            'Romance': 13,
            'Sci-Fi': 14,
            'Thriller': 15,
            'War': 16,
            'Western': 17
        }

        self.user_occupation_num = 21

        # Index both tables by id so rows can be fetched with ``.loc``.
        self.users = self.users.set_index('ID').sort_index()
        self.items = self.items.set_index('ID').sort_index()

        # The id lists are fixed contiguous ranges, independent of the file
        # contents. An earlier computation from the 'ID' columns was always
        # overwritten by these assignments and has been removed.
        self.user_ids = list(range(1, 6041))
        self.item_ids = list(range(1, 3953))

        self.user_num = len(self.user_ids)
        self.item_num = len(self.item_ids)

        if self.type == DataType.ndarray:
            # The array-backed container receives the counts, not the ids.
            self.__create_containers(RatingNDArray, self.user_num,
                                     self.item_num)
        elif self.type == DataType.dataframe:
            self.__create_containers(RatingDataFrame, self.user_ids,
                                     self.item_ids)
        elif self.type == DataType.dictionary:
            self.__create_containers(RatingDictionary, self.user_ids,
                                     self.item_ids)

    def __create_containers(self, container_cls, users, items):
        """Build the train and the test container from ``container_cls``."""
        prefix = self.name + '_' + self.type.name
        self.train_dataset = container_cls(prefix + '_trainset', users, items)
        self.test_dataset = container_cls(prefix + '_testset', users, items)

    def load_dataset(self):
        """Read and split the ratings file, then fill both containers."""
        print('loading rating from path: ' + self.rates_file_path)
        train_data_frame, test_data_frame = self.__load_rate(
            self.rates_file_path, self.split_ratio)
        self.train_dataset.load_dataset(train_data_frame)
        self.test_dataset.load_dataset(test_data_frame)
        print('Done')

    def train(self):
        """Yield one feature tuple per rating in the training container.

        Yields:
            ``(user_id, user_age_id, user_gender_id, user_occupation_id,
            item_id, item_category_one_hot, rating)``.
        """
        yield from self.__samples(self.train_dataset)

    def evaluate(self):
        """Yield one feature tuple per rating in the test container.

        Yields:
            ``(user_id, user_age_id, user_gender_id, user_occupation_id,
            item_id, item_category_one_hot, rating)``.
        """
        yield from self.__samples(self.test_dataset)

    def __samples(self, dataset):
        """Attach user and item features to each triple from ``dataset``."""
        for user_id, item_id, rating in dataset:
            # user_info -> (id, age, gender, occupation); item_info ->
            # (id, one-hot); concatenated they give the documented order.
            yield (self.user_info(user_id) + self.item_info(item_id) +
                   (rating, ))

    def user_info(self, user_id):
        """Return the encoded features of one user.

        Returns:
            ``(user_id, user_age_id, user_gender_id, user_occupation_id)``
            where the age id is the ``Age`` value integer-divided by ten,
            the gender id comes from ``gender2id`` and the occupation id is
            the ``Occupation`` value as stored in the user table.
        """
        user_info = self.users.loc[user_id]
        user_age_id = user_info['Age'] // 10
        user_gender_id = self.gender2id[user_info['Gender']]
        user_occupation_id = user_info['Occupation']
        return user_id, user_age_id, user_gender_id, user_occupation_id

    def item_info(self, item_id):
        """Return ``(item_id, category_one_hot)`` for one movie."""
        item_info = self.items.loc[item_id]
        item_category_one_hot = self.category2onehot(item_info['Tag'])
        return item_id, item_category_one_hot

    def category2onehot(self, category):
        """Encode a ``'|'``-separated category string as a multi-hot vector.

        Args:
            category: Category labels joined by ``'|'``; each label must be
                a key of ``self.category2id``.

        Returns:
            A ``float32`` array of length ``len(self.category2id) + 1`` with
            a 1 at the position of every listed category.

        Raises:
            KeyError: If a label is not present in ``self.category2id``.
        """
        # NOTE(review): the vector has one more slot than there are
        # categories, so its last entry is never set here; presumably this
        # keeps the width at 19 like the 100K genre flags. Confirm.
        onehot = np.zeros(len(self.category2id) + 1, dtype=np.float32)
        for label in category.split('|'):
            onehot[self.category2id[label]] = 1
        return onehot

    def __load_user_info(self, file_path):
        """Read the ``::``-separated user table at ``file_path``.

        Columns are named ``ID`` (``np.uint32``), ``Gender``, ``Age`` and
        ``Occupation`` (both ``np.int64``) and ``Code``. The python parser
        engine is used for the two-character separator.
        """
        return pd.read_csv(file_path,
                           sep='::',
                           names=['ID', 'Gender', 'Age', 'Occupation', 'Code'],
                           dtype={
                               'ID': np.uint32,
                               'Age': np.int64,
                               'Occupation': np.int64
                           },
                           encoding='utf-8',
                           engine='python')

    def __load_item_info(self, file_path):
        """Read the ``::``-separated movie table at ``file_path``.

        Columns are named ``ID`` (``np.uint32``), ``Title`` and ``Tag``.
        """
        return pd.read_csv(file_path,
                           sep='::',
                           names=['ID', 'Title', 'Tag'],
                           dtype={'ID': np.uint32},
                           encoding='latin1',
                           engine='python')

    def __load_rate(self, file_path, split_ratio):
        """Read the ``::``-separated ratings file and split it in two.

        A fraction ``split_ratio`` of the rows is sampled with
        ``random_state=0``; the rows whose index was not sampled form the
        second part.

        Returns:
            A ``(train, test)`` tuple of DataFrames with the columns
            ``User ID``, ``Item ID``, ``Rating`` and ``Timestamp``.
        """
        data = pd.read_csv(file_path,
                           sep='::',
                           names=['User ID', 'Item ID', 'Rating', 'Timestamp'],
                           dtype={
                               'User ID': np.uint16,
                               'Item ID': np.uint16,
                               'Rating': np.float16,
                               'Timestamp': np.uint32
                           },
                           engine='python')
        train_data = data.sample(frac=split_ratio, random_state=0, axis=0)
        test_data = data[~data.index.isin(train_data.index)]
        return train_data, test_data