def test_get_full_dataset_6():
    """Download the full ml-1m dataset from scratch and verify row count,
    column count and the first record."""
    # Remove any cached copy so the download/extract path is actually exercised.
    try:
        shutil.rmtree(os.path.join(os.path.expanduser('~') + '/.DRecPy_data/', 'ml-1m'))
    except FileNotFoundError:
        pass
    ret = get_full_dataset('ml-1m')
    # open() raises FileNotFoundError if the download failed; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(os.path.expanduser('~') + '/.DRecPy_data/ml-1m/ratings.dat', 'r') as f:
        assert f is not None
    assert (len(ret), len(ret.columns)) == (1000209, 5)
    assert next(ret.values()) == {'interaction': 5, 'user': 1, 'item': 1193, 'rid': 0, 'timestamp': 978300760}
def test_get_full_dataset_1():
    """Download the full ml-100k dataset from scratch and verify row count,
    column count and the first record."""
    # Remove any cached copy so the download/extract path is actually exercised.
    try:
        shutil.rmtree(os.path.join(os.path.expanduser('~') + '/.DRecPy_data/', 'ml-100k'))
    except FileNotFoundError:
        pass
    ret = get_full_dataset('ml-100k')
    # open() raises FileNotFoundError if the download failed; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(os.path.expanduser('~') + '/.DRecPy_data/ml-100k/ua.base', 'r') as f:
        assert f is not None
    assert (len(ret), len(ret.columns)) == (100000, 5)
    assert next(ret.values()) == {'interaction': 3, 'user': 196, 'item': 242, 'rid': 0, 'timestamp': 881250949}
def test_get_full_dataset_5():
    """Download the full bx (Book-Crossing) dataset from scratch and verify
    row count, column count (no timestamp column) and the first record."""
    # Remove any cached copy so the download/extract path is actually exercised.
    try:
        shutil.rmtree(os.path.join(os.path.expanduser('~') + '/.DRecPy_data/', 'bx'))
    except FileNotFoundError:
        pass
    ret = get_full_dataset('bx')
    # open() raises FileNotFoundError if the download failed; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(os.path.expanduser('~') + '/.DRecPy_data/bx/BX-Book-Ratings.csv', 'r') as f:
        assert f is not None
    assert (len(ret), len(ret.columns)) == (1149780, 4)
    assert next(ret.values()) == {'interaction': 0, 'user': 276725, 'item': '034545104X', 'rid': 0}
def test_get_full_dataset_4():
    """Verify that the DATA_FOLDER environment variable overrides the default
    download location (files land in the current directory)."""
    try:
        os.environ['DATA_FOLDER'] = os.path.curdir
        # Remove any cached copy so the download/extract path is exercised.
        try:
            shutil.rmtree(os.path.join(os.environ.get('DATA_FOLDER'), 'ml-100k'))
        except FileNotFoundError:
            pass
        ret = get_full_dataset('ml-100k')
        # open() raises FileNotFoundError if the download failed; use a context
        # manager so the file handle is closed (the original leaked it).
        with open(os.path.curdir + '/ml-100k/ua.base', 'r') as f:
            assert f is not None
        assert (len(ret), len(ret.columns)) == (100000, 5)
        assert next(ret.values()) == {'interaction': 3, 'user': 196, 'item': 242, 'rid': 0, 'timestamp': 881250949}
        # Clean up the downloaded copy from the working directory.
        shutil.rmtree(os.path.join(os.environ.get('DATA_FOLDER'), 'ml-100k'))
    finally:
        # Always restore the environment so other tests see the default folder.
        del os.environ['DATA_FOLDER']
from DRecPy.Recommender.Baseline import UserKNN
from DRecPy.Dataset import get_full_dataset
from DRecPy.Evaluation.Processes import ranking_evaluation
from DRecPy.Evaluation.Splits import matrix_split
from DRecPy.Evaluation.Metrics import Precision
from DRecPy.Evaluation.Metrics import Recall
from DRecPy.Evaluation.Metrics import NDCG

# Load ml-100k and hold out 20% of users/items for testing (fixed seed).
ds = get_full_dataset('ml-100k')
ds_train, ds_test = matrix_split(ds, user_test_ratio=0.2, item_test_ratio=0.2, seed=0, verbose=False)

# Fit and evaluate a UserKNN model for each similarity metric. The original
# duplicated this whole fit/evaluate/print pipeline per metric; the loop
# runs the identical steps in the identical order.
for sim_metric, label in (('cosine_cf', 'cosine sim'), ('jaccard', 'jaccard sim')):
    knn = UserKNN(k=10, m=0, sim_metric=sim_metric, shrinkage=None, seed=15,
                  use_averages=False, verbose=True)
    knn.fit(ds_train)
    evaluation = ranking_evaluation(knn, ds_test, interaction_threshold=5,
                                    k=list(range(1, 11)),
                                    generate_negative_pairs=False,
                                    n_pos_interactions=None,
                                    n_neg_interactions=None,
                                    seed=15, verbose=True,
                                    metrics=[Precision(), Recall(), NDCG()])
    print(label, evaluation)
from DRecPy.Evaluation.Splits import leave_k_out
from DRecPy.Dataset import get_full_dataset
import time

# Load the full ml-100k interaction dataset.
interaction_ds = get_full_dataset("ml-100k")
print('Full dataset', interaction_ds)

# Split by leaving k interactions per user out of the train set.
# Users with fewer than k interactions keep everything in the train set,
# and users with fewer than min_user_interactions are dropped from both sets.
split_start = time.time()
train_ds, test_ds = leave_k_out(interaction_ds, k=10, min_user_interactions=20)
print(f'Splitting complete. Took: {time.time() - split_start}s')

print('Train dataset', train_ds)
print('Test dataset', test_ds)
def test_get_full_dataset_3():
    """Forcing out-of-memory storage should yield a database-backed dataset."""
    dataset = get_full_dataset('ml-100k', force_out_of_memory=True)
    assert isinstance(dataset, DatabaseInteractionDataset)
def test_get_full_dataset_2():
    """The default storage backend should be the in-memory dataset type."""
    dataset = get_full_dataset('ml-100k')
    assert isinstance(dataset, MemoryInteractionDataset)
def test_get_full_dataset_0():
    """An unknown dataset name should raise a descriptive FileNotFoundError."""
    try:
        get_full_dataset('')
    except FileNotFoundError as e:
        assert str(e) == '"" is not a valid dataset. Supported datasets: ml-100k, ml-1m, ml-10m, ml-20m, bx.'
    else:
        # The original silently passed when no exception was raised at all;
        # make the missing-exception case an explicit failure.
        raise AssertionError('get_full_dataset("") should raise FileNotFoundError')
from DRecPy.Dataset import get_train_dataset
from DRecPy.Dataset import get_test_dataset
from DRecPy.Dataset import get_full_dataset
from DRecPy.Dataset import available_datasets

print('Available datasets', available_datasets())

# Load the ml-100k full dataset plus its prebuilt train/test splits (in memory).
print('ml-100k full dataset', get_full_dataset('ml-100k'))
print('ml-100k train dataset', get_train_dataset('ml-100k'))
print('ml-100k test dataset', get_test_dataset('ml-100k'))

# Same for ml-1m, but backed by out-of-memory (database) storage.
for label, loader in (('ml-1m full dataset', get_full_dataset),
                      ('ml-1m train dataset', get_train_dataset),
                      ('ml-1m test dataset', get_test_dataset)):
    print(label, loader('ml-1m', force_out_of_memory=True))

# Showcase a few dataset operations on ml-100k.
movie_ds = get_full_dataset('ml-100k')
print('Minimum rating value:', movie_ds.min('interaction'))
print('Unique rating values:', movie_ds.unique('interaction').values_list())
# Standardize ratings into [0, 1] by dividing by the maximum rating value.
movie_ds.apply('interaction', lambda rating: rating / movie_ds.max('interaction'))
print('New values', movie_ds.values_list()[:5])