# Module-level imports inferred from the code below. ``_common`` provides
# ``get_data``; the ``_read_raw_data``, ``_parse``, ``_get_dimensions``,
# ``_build_interaction_matrix`` and ``_parse_item_metadata`` helpers are
# assumed to be defined alongside these functions.
import os
import zipfile

import numpy as np
import scipy.sparse as sp

from lightfm.datasets import _common


def fetch_stackexchange(dataset, test_set_fraction=0.2,
                        min_training_interactions=1,
                        data_home=None,
                        indicator_features=True, tag_features=False,
                        download_if_missing=True):
    """
    Fetch a dataset from the `StackExchange network <http://stackexchange.com/>`_.

    The datasets contain users answering questions: an interaction is defined
    as a user answering a given question.

    The following datasets from the StackExchange network are available:

    - CrossValidated: From stats.stackexchange.com. Approximately 9000 users,
      72000 questions, and 70000 answers.
    - StackOverflow: From stackoverflow.com. Approximately 1.3M users, 11M
      questions, and 18M answers.

    Parameters
    ----------

    dataset: string, one of ('crossvalidated', 'stackoverflow')
        The part of the StackExchange network for which to fetch the dataset.
    test_set_fraction: float, optional
        The fraction of the dataset used for testing. Splitting into the
        train and test set is done in a time-based fashion: all interactions
        before a certain time are in the train set and all interactions after
        that time are in the test set.
    min_training_interactions: int, optional
        Only include users with more than this number of interactions in the
        training set.
    data_home: path, optional
        Path to the directory in which the downloaded data should be placed.
        Defaults to ``~/lightfm_data/``.
    indicator_features: bool, optional
        Use an [n_items, n_items] identity matrix for item features. When
        True with tag_features, indicator and tag features are concatenated
        into a single feature matrix of shape [n_items, n_items + n_tags].
    tag_features: bool, optional
        Use a [n_items, n_tags] matrix for item features. When True with
        indicator_features, indicator and tag features are concatenated into
        a single feature matrix of shape [n_items, n_items + n_tags].
    download_if_missing: bool, optional
        Download the data if not present. Raises an IOError if False and data
        is missing.

    Notes
    -----

    The return value is a dictionary containing the following keys:

    Returns
    -------

    train: sp.coo_matrix of shape [n_users, n_items]
         Contains training set interactions.
    test: sp.coo_matrix of shape [n_users, n_items]
         Contains testing set interactions.
    item_features: sp.csr_matrix of shape [n_items, n_item_features]
         Contains item features.
    item_feature_labels: np.array of strings of shape [n_item_features,]
         Labels of item features.
    """

    if not (indicator_features or tag_features):
        raise ValueError('At least one of indicator_features '
                         'or tag_features must be True')

    if dataset not in ('crossvalidated', 'stackoverflow'):
        raise ValueError('Unknown dataset')

    if not (0.0 < test_set_fraction < 1.0):
        raise ValueError('Test set fraction must be between 0 and 1')

    urls = {
        'crossvalidated': ('https://github.com/maciejkula/lightfm_datasets/releases/'
                           'download/v0.1.0/stackexchange_crossvalidated.npz'),
        'stackoverflow': ('https://github.com/maciejkula/lightfm_datasets/releases/'
                          'download/v0.1.0/stackexchange_stackoverflow.npz')
    }

    path = _common.get_data(data_home,
                            urls[dataset],
                            os.path.join('stackexchange', dataset),
                            'data.npz',
                            download_if_missing)

    data = np.load(path)

    interactions = sp.coo_matrix(
        (data['interactions_data'],
         (data['interactions_row'], data['interactions_col'])),
        shape=data['interactions_shape'].flatten())
    tag_features_mat = sp.coo_matrix(
        (data['features_data'],
         (data['features_row'], data['features_col'])),
        shape=data['features_shape'].flatten())
    tag_labels = data['labels']

    # Interaction values are answer timestamps: sort them and pick the
    # timestamp below which (1 - test_set_fraction) of interactions fall.
    test_cutoff_index = int(len(interactions.data) * (1.0 - test_set_fraction))
    test_cutoff_timestamp = np.sort(interactions.data)[test_cutoff_index]
    in_train = interactions.data < test_cutoff_timestamp
    in_test = np.logical_not(in_train)

    train = sp.coo_matrix(
        (np.ones(in_train.sum(), dtype=np.float32),
         (interactions.row[in_train], interactions.col[in_train])),
        shape=interactions.shape)
    test = sp.coo_matrix(
        (np.ones(in_test.sum(), dtype=np.float32),
         (interactions.row[in_test], interactions.col[in_test])),
        shape=interactions.shape)

    if min_training_interactions > 0:
        include = np.squeeze(np.array(
            train.getnnz(axis=1))) > min_training_interactions

        train = train.tocsr()[include].tocoo()
        test = test.tocsr()[include].tocoo()

    if indicator_features and not tag_features:
        features = sp.identity(train.shape[1], format='csr', dtype=np.float32)
        labels = np.array(
            ['question_id:{}'.format(x) for x in range(train.shape[1])])
    elif not indicator_features and tag_features:
        features = tag_features_mat.tocsr()
        labels = tag_labels
    else:
        id_features = sp.identity(train.shape[1],
                                  format='csr', dtype=np.float32)
        features = sp.hstack([id_features, tag_features_mat]).tocsr()
        labels = np.concatenate([
            np.array(['question_id:{}'.format(x)
                      for x in range(train.shape[1])]),
            tag_labels
        ])

    return {
        'train': train,
        'test': test,
        'item_features': features,
        'item_feature_labels': labels
    }
def fetch_movielens(
    data_home=None,
    indicator_features=True,
    genre_features=False,
    min_rating=0.0,
    download_if_missing=True,
):
    """
    Fetch the `Movielens 100k dataset <http://grouplens.org/datasets/movielens/100k/>`_.

    The dataset contains 100,000 interactions from 1000 users on 1700 movies,
    and is exhaustively described in its
    `README <http://files.grouplens.org/datasets/movielens/ml-100k-README.txt>`_.

    Parameters
    ----------

    data_home: path, optional
        Path to the directory in which the downloaded data should be placed.
        Defaults to ``~/lightfm_data/``.
    indicator_features: bool, optional
        Use an [n_items, n_items] identity matrix for item features. When True
        with genre_features, indicator and genre features are concatenated
        into a single feature matrix of shape [n_items, n_items + n_genres].
    genre_features: bool, optional
        Use a [n_items, n_genres] matrix for item features. When True with
        indicator_features, indicator and genre features are concatenated into
        a single feature matrix of shape [n_items, n_items + n_genres].
    min_rating: float, optional
        Minimum rating to include in the interaction matrix.
    download_if_missing: bool, optional
        Download the data if not present. Raises an IOError if False and data
        is missing.

    Notes
    -----

    The return value is a dictionary containing the following keys:

    Returns
    -------

    train: sp.coo_matrix of shape [n_users, n_items]
         Contains training set interactions.
    test: sp.coo_matrix of shape [n_users, n_items]
         Contains testing set interactions.
    item_features: sp.csr_matrix of shape [n_items, n_item_features]
         Contains item features.
    item_feature_labels: np.array of strings of shape [n_item_features,]
         Labels of item features.
    item_labels: np.array of strings of shape [n_items,]
         Items' titles.
    """

    if not (indicator_features or genre_features):
        raise ValueError(
            "At least one of indicator_features or genre_features must be True"
        )

    zip_path = _common.get_data(
        data_home,
        (
            "https://github.com/maciejkula/"
            "lightfm_datasets/releases/"
            "download/v0.1.0/movielens.zip"
        ),
        "movielens100k",
        "movielens.zip",
        download_if_missing,
    )

    # Load raw data
    try:
        (train_raw, test_raw, item_metadata_raw, genres_raw) = _read_raw_data(zip_path)
    except zipfile.BadZipFile:
        # The download was corrupted: get rid of the partially downloaded
        # file so that we re-download on the next try.
        os.unlink(zip_path)
        raise ValueError(
            "Corrupted Movielens download. Check your "
            "internet connection and try again."
        )

    # Figure out the dimensions
    num_users, num_items = _get_dimensions(_parse(train_raw), _parse(test_raw))

    # Load train interactions
    train = _build_interaction_matrix(
        num_users, num_items, _parse(train_raw), min_rating
    )
    # Load test interactions
    test = _build_interaction_matrix(num_users, num_items, _parse(test_raw), min_rating)

    assert train.shape == test.shape

    # Load metadata features
    (
        id_features,
        id_feature_labels,
        genre_features_matrix,
        genre_feature_labels,
    ) = _parse_item_metadata(num_items, item_metadata_raw, genres_raw)

    assert id_features.shape == (num_items, len(id_feature_labels))
    assert genre_features_matrix.shape == (num_items, len(genre_feature_labels))

    if indicator_features and not genre_features:
        features = id_features
        feature_labels = id_feature_labels
    elif genre_features and not indicator_features:
        features = genre_features_matrix
        feature_labels = genre_feature_labels
    else:
        features = sp.hstack([id_features, genre_features_matrix]).tocsr()
        feature_labels = np.concatenate((id_feature_labels, genre_feature_labels))

    data = {
        "train": train,
        "test": test,
        "item_features": features,
        "item_feature_labels": feature_labels,
        "item_labels": id_feature_labels,
    }

    return data
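# Illustrative addition, not part of the original module: a minimal sketch of
# training on the Movielens split returned above and measuring precision@10.
# The helper name _demo_movielens is hypothetical, and the sketch assumes the
# lightfm package is installed.
def _demo_movielens():
    from lightfm import LightFM
    from lightfm.evaluation import precision_at_k

    # Keep only interactions with a rating of at least 4.0 as positives.
    data = fetch_movielens(min_rating=4.0)

    model = LightFM(loss="warp")
    model.fit(data["train"], item_features=data["item_features"], epochs=10)

    return precision_at_k(
        model, data["test"], item_features=data["item_features"], k=10
    ).mean()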
def fetch_movielens_with_new_user(data_home=None, indicator_features=True,
                                  genre_features=False, min_rating=0.0,
                                  download_if_missing=True):
    """
    Variant of ``fetch_movielens`` that interactively prompts for a new
    user's ratings on the console and appends them to the training data
    before building the interaction matrix.

    The new user is given id 944 (one past the last Movielens 100k user id).
    No test matrix is built: the 'test' key of the returned dictionary is an
    empty list.

    Parameters
    ----------

    data_home: path, optional
        Path to the directory in which the downloaded data should be placed.
        Defaults to ``~/lightfm_data/``.
    indicator_features: bool, optional
        Use an [n_items, n_items] identity matrix for item features. When True
        with genre_features, indicator and genre features are concatenated
        into a single feature matrix of shape [n_items, n_items + n_genres].
    genre_features: bool, optional
        Use a [n_items, n_genres] matrix for item features. When True with
        indicator_features, indicator and genre features are concatenated into
        a single feature matrix of shape [n_items, n_items + n_genres].
    min_rating: float, optional
        Minimum rating to include in the interaction matrix.
    download_if_missing: bool, optional
        Download the data if not present. Raises an IOError if False and data
        is missing.

    Returns
    -------

    A dictionary with the same keys as ``fetch_movielens``, except that
    'test' is an empty list.
    """

    if not (indicator_features or genre_features):
        raise ValueError('At least one of indicator_features '
                         'or genre_features must be True')

    zip_path = _common.get_data(data_home,
                                ('https://github.com/maciejkula/'
                                 'lightfm_datasets/releases/'
                                 'download/v0.1.0/movielens.zip'),
                                'movielens100k',
                                'movielens.zip',
                                download_if_missing)

    # Load raw data.
    #
    # train_raw and test_raw are lines of 'user id | item id | rating |
    # timestamp', e.g. '487\t684\t5\t883446543'.
    #
    # item_metadata_raw is lines of 'movie id | movie title | release date |
    # video release date | IMDb URL' followed by 19 binary genre fields
    # (a movie can belong to several genres at once), e.g.
    # '1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/...|0|0|0|1|1|1|...'.
    #
    # genres_raw is lines of 'genre name|genre id', e.g. 'Action|1'.
    (train_raw, test_raw, item_metadata_raw, genres_raw) = _read_raw_data(zip_path)

    # Figure out the dimensions before adding the new user:
    # num_users = 943, num_items = 1682.
    num_users, num_items = _get_dimensions(_parse(train_raw))

    # Load metadata features.
    #
    # id_features is a [1682, 1682] identity matrix in CSR format;
    # id_feature_labels is an array of movie titles, e.g.
    # ['Toy Story (1995)', 'GoldenEye (1995)', ...];
    # genre_features_matrix is a [1682, 19] binary CSR matrix;
    # genre_feature_labels is ['genre:unknown', 'genre:Action', ...].
    (id_features, id_feature_labels, genre_features_matrix,
     genre_feature_labels) = _parse_item_metadata(num_items,
                                                  item_metadata_raw,
                                                  genres_raw)

    # Insert the new user by collecting five ratings on the console,
    # paginating through the movie titles twenty at a time. (For quick
    # testing, ratings can instead be appended manually, e.g.
    # train_raw.append('944\t684\t5\t883446543').)
    #
    # The timestamp is arbitrary: it is never used downstream.
    new_user_id = 944
    timestamp = 883446543
    page_size = 20
    page_start = 0
    ratings_collected = 0

    while ratings_collected < 5:
        # Show a page of titles, numbered from 1.
        page = id_feature_labels[page_start:page_start + page_size]
        for position, title in enumerate(page, start=1):
            print({position: title})

        choice = int(input("Choose a movie, or press -1 to change movieset: "))
        if choice == -1:
            # Advance to the next page of titles, resetting at the end.
            page_start += page_size
            if page_start >= num_items:
                page_start = 0
            continue

        title = page[choice - 1]
        item_id = id_feature_labels.tolist().index(title) + 1
        print("You selected the movie: {} with ID: {}".format(title, item_id))

        rating = input("Rate the movie: ")
        train_raw.append('{}\t{}\t{}\t{}'.format(new_user_id, item_id,
                                                 rating, timestamp))
        ratings_collected += 1

    # Figure out the dimensions again, now including the new user:
    # num_users = 944, num_items = 1682.
    num_users, num_items = _get_dimensions(_parse(train_raw))

    # Load train interactions. User and item ids are zero-based in the
    # resulting matrix, so the new user 944 lives at row 943.
    train = _build_interaction_matrix(num_users, num_items,
                                      _parse(train_raw), min_rating)

    # No test matrix is built in this variant.
    test = []

    assert id_features.shape == (num_items, len(id_feature_labels))
    assert genre_features_matrix.shape == (num_items,
                                           len(genre_feature_labels))

    if indicator_features and not genre_features:
        features = id_features
        feature_labels = id_feature_labels
    elif genre_features and not indicator_features:
        features = genre_features_matrix
        feature_labels = genre_feature_labels
    else:
        features = sp.hstack([id_features, genre_features_matrix]).tocsr()
        feature_labels = np.concatenate(
            (id_feature_labels, genre_feature_labels))

    data = {
        'train': train,
        'test': test,
        'item_features': features,
        'item_feature_labels': feature_labels,
        'item_labels': id_feature_labels
    }

    return data
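# Illustrative addition, not part of the original module: a minimal sketch of
# producing recommendations for the interactively added user. The helper name
# _demo_new_user_recommendations is hypothetical; it assumes the lightfm
# package is installed and will trigger the console prompts above when called.
def _demo_new_user_recommendations():
    from lightfm import LightFM

    data = fetch_movielens_with_new_user()

    model = LightFM(loss='warp')
    model.fit(data['train'], epochs=10)

    # User ids are zero-based in the interaction matrix, so the new
    # user 944 lives at row 943.
    n_items = data['train'].shape[1]
    scores = model.predict(943, np.arange(n_items))

    # Highest-scoring titles first.
    return data['item_labels'][np.argsort(-scores)][:10]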