def test_get_full_dataset_6():
    """Download the full ml-1m dataset from scratch and verify row count,
    column count and the first record."""
    # Remove any cached copy so the download/extract path is actually exercised.
    try:
        shutil.rmtree(os.path.join(os.path.expanduser('~') + '/.DRecPy_data/', 'ml-1m'))
    except FileNotFoundError:
        pass
    ret = get_full_dataset('ml-1m')
    # open() raises FileNotFoundError if the download failed; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(os.path.expanduser('~') + '/.DRecPy_data/ml-1m/ratings.dat', 'r') as f:
        assert f is not None
    assert (len(ret), len(ret.columns)) == (1000209, 5)
    assert next(ret.values()) == {'interaction': 5, 'user': 1, 'item': 1193, 'rid': 0, 'timestamp': 978300760}
def test_get_full_dataset_1():
    """Download the full ml-100k dataset from scratch and verify row count,
    column count and the first record."""
    # Remove any cached copy so the download/extract path is actually exercised.
    try:
        shutil.rmtree(os.path.join(os.path.expanduser('~') + '/.DRecPy_data/', 'ml-100k'))
    except FileNotFoundError:
        pass
    ret = get_full_dataset('ml-100k')
    # open() raises FileNotFoundError if the download failed; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(os.path.expanduser('~') + '/.DRecPy_data/ml-100k/ua.base', 'r') as f:
        assert f is not None
    assert (len(ret), len(ret.columns)) == (100000, 5)
    assert next(ret.values()) == {'interaction': 3, 'user': 196, 'item': 242, 'rid': 0, 'timestamp': 881250949}
def test_get_full_dataset_5():
    """Download the full bx (Book-Crossing) dataset from scratch and verify
    row count, column count (no timestamp column) and the first record."""
    # Remove any cached copy so the download/extract path is actually exercised.
    try:
        shutil.rmtree(os.path.join(os.path.expanduser('~') + '/.DRecPy_data/', 'bx'))
    except FileNotFoundError:
        pass
    ret = get_full_dataset('bx')
    # open() raises FileNotFoundError if the download failed; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(os.path.expanduser('~') + '/.DRecPy_data/bx/BX-Book-Ratings.csv', 'r') as f:
        assert f is not None
    assert (len(ret), len(ret.columns)) == (1149780, 4)
    assert next(ret.values()) == {'interaction': 0, 'user': 276725, 'item': '034545104X', 'rid': 0}
def test_get_full_dataset_4():
    """Verify that the DATA_FOLDER environment variable overrides the default
    download location (files land in the current directory)."""
    try:
        os.environ['DATA_FOLDER'] = os.path.curdir
        # Remove any cached copy so the download/extract path is exercised.
        try:
            shutil.rmtree(os.path.join(os.environ.get('DATA_FOLDER'), 'ml-100k'))
        except FileNotFoundError:
            pass
        ret = get_full_dataset('ml-100k')
        # open() raises FileNotFoundError if the download failed; use a context
        # manager so the file handle is closed (the original leaked it).
        with open(os.path.curdir + '/ml-100k/ua.base', 'r') as f:
            assert f is not None
        assert (len(ret), len(ret.columns)) == (100000, 5)
        assert next(ret.values()) == {'interaction': 3, 'user': 196, 'item': 242, 'rid': 0, 'timestamp': 881250949}
        # Clean up the downloaded copy from the working directory.
        shutil.rmtree(os.path.join(os.environ.get('DATA_FOLDER'), 'ml-100k'))
    finally:
        # Always restore the environment so other tests see the default folder.
        del os.environ['DATA_FOLDER']
from DRecPy.Recommender.Baseline import UserKNN
from DRecPy.Dataset import get_full_dataset
from DRecPy.Evaluation.Processes import ranking_evaluation
from DRecPy.Evaluation.Splits import matrix_split
from DRecPy.Evaluation.Metrics import Precision
from DRecPy.Evaluation.Metrics import Recall
from DRecPy.Evaluation.Metrics import NDCG

# Load ml-100k and hold out 20% of users/items for testing (fixed seed).
ds = get_full_dataset('ml-100k')
ds_train, ds_test = matrix_split(ds, user_test_ratio=0.2, item_test_ratio=0.2, seed=0, verbose=False)

# Fit and evaluate a UserKNN model for each similarity metric. The original
# duplicated this whole fit/evaluate/print pipeline per metric; the loop
# runs the identical steps in the identical order.
for sim_metric, label in (('cosine_cf', 'cosine sim'), ('jaccard', 'jaccard sim')):
    knn = UserKNN(k=10, m=0, sim_metric=sim_metric, shrinkage=None, seed=15,
                  use_averages=False, verbose=True)
    knn.fit(ds_train)
    evaluation = ranking_evaluation(knn, ds_test, interaction_threshold=5,
                                    k=list(range(1, 11)),
                                    generate_negative_pairs=False,
                                    n_pos_interactions=None,
                                    n_neg_interactions=None,
                                    seed=15, verbose=True,
                                    metrics=[Precision(), Recall(), NDCG()])
    print(label, evaluation)
from DRecPy.Evaluation.Splits import leave_k_out
from DRecPy.Dataset import get_full_dataset
import time

# Load the full ml-100k interaction dataset.
interaction_ds = get_full_dataset("ml-100k")
print('Full dataset', interaction_ds)

# Split by leaving k interactions per user out of the train set.
# Users with fewer than k interactions keep everything in the train set,
# and users with fewer than min_user_interactions are dropped from both sets.
split_start = time.time()
train_ds, test_ds = leave_k_out(interaction_ds, k=10, min_user_interactions=20)
print(f'Splitting complete. Took: {time.time() - split_start}s')

print('Train dataset', train_ds)
print('Test dataset', test_ds)
def test_get_full_dataset_3():
    """Forcing out-of-memory storage should yield a database-backed dataset."""
    dataset = get_full_dataset('ml-100k', force_out_of_memory=True)
    assert isinstance(dataset, DatabaseInteractionDataset)
def test_get_full_dataset_2():
    """The default storage backend should be the in-memory dataset type."""
    dataset = get_full_dataset('ml-100k')
    assert isinstance(dataset, MemoryInteractionDataset)
def test_get_full_dataset_0():
    """An unknown dataset name should raise a descriptive FileNotFoundError."""
    try:
        get_full_dataset('')
    except FileNotFoundError as e:
        assert str(e) == '"" is not a valid dataset. Supported datasets: ml-100k, ml-1m, ml-10m, ml-20m, bx.'
    else:
        # The original silently passed when no exception was raised at all;
        # make the missing-exception case an explicit failure.
        raise AssertionError('get_full_dataset("") should raise FileNotFoundError')
from DRecPy.Dataset import get_train_dataset
from DRecPy.Dataset import get_test_dataset
from DRecPy.Dataset import get_full_dataset
from DRecPy.Dataset import available_datasets

print('Available datasets', available_datasets())

# Load the ml-100k full dataset plus its prebuilt train/test splits (in memory).
print('ml-100k full dataset', get_full_dataset('ml-100k'))
print('ml-100k train dataset', get_train_dataset('ml-100k'))
print('ml-100k test dataset', get_test_dataset('ml-100k'))

# Same for ml-1m, but backed by out-of-memory (database) storage.
for label, loader in (('ml-1m full dataset', get_full_dataset),
                      ('ml-1m train dataset', get_train_dataset),
                      ('ml-1m test dataset', get_test_dataset)):
    print(label, loader('ml-1m', force_out_of_memory=True))

# Showcase a few dataset operations on ml-100k.
movie_ds = get_full_dataset('ml-100k')
print('Minimum rating value:', movie_ds.min('interaction'))
print('Unique rating values:', movie_ds.unique('interaction').values_list())
# Standardize ratings into [0, 1] by dividing by the maximum rating value.
movie_ds.apply('interaction', lambda rating: rating / movie_ds.max('interaction'))
print('New values', movie_ds.values_list()[:5])