Example #1
0
def test_import_dense_type_mat():
    """
    Test that the dtype is preserved after saving and re-loading a dense
    array through the .mat format.
    """
    x = np.random.rand(3, 2)
    export_data('/tmp/test.mat', x)
    assert x.dtype == import_data('/tmp/test.mat').dtype
Example #2
0
def test_import_sparse_values_mat():
    """
    Test that values survive a save/load round trip of a sparse matrix
    through the .mat format.
    """
    original = sps.csr_matrix(np.random.rand(3, 2))
    export_data('/tmp/test.mat', original)
    reloaded = import_data('/tmp/test.mat')
    assert np.array_equal(original.toarray(), reloaded.toarray())
Example #3
0
def test_import_dense_values_mat():
    """
    Test that values survive a save/load round trip of a dense array
    through the .mat format.
    """
    x = np.random.rand(3, 2)
    export_data('/tmp/test.mat', x)
    assert np.array_equal(x, import_data('/tmp/test.mat'))
Example #4
0
def test_import_type_sparsetxt():
    """
    Test that the dtype is preserved after saving and re-loading a sparse
    matrix through the .sparsetxt format.
    """
    x = sps.csr_matrix(np.random.rand(3, 2))
    export_data('/tmp/test.sparsetxt', x)
    assert x.dtype == import_data('/tmp/test.sparsetxt').dtype
Example #5
0
def test_import_type_densetxt():
    """
    Test that the dtype is preserved after saving and re-loading a dense
    array through the .densetxt format.
    """
    x = np.random.rand(7, 11)
    export_data('/tmp/test.densetxt', x)
    assert x.dtype == import_data('/tmp/test.densetxt').dtype
Example #6
0
def test_import_values_dense():
    """
    Test that values survive a save/load round trip of a dense array
    through the .dense format.
    """
    matrix = np.random.rand(7, 11)
    export_data('/tmp/test.dense', matrix)
    roundtrip = import_data('/tmp/test.dense')
    assert np.array_equal(matrix, roundtrip)
            # NOTE(review): this fragment begins inside a loop over user
            # records; the enclosing function/with-block (and the definitions
            # of `line`, `ages`, `sexes`, `occupations`, `zipcodes`, `occmap`,
            # `occlist`, `loader`, `args`) lies above the visible excerpt.
            # `line` looks like a '|'-split record of an ml-100k-style
            # u.user file — confirm against the caller.
            # age: second field, stored as a float =================
            ages.append(float(line[1]))
            # sex: encode 'M' as 1, anything else as 0 =============
            if line[2] == 'M':
                sexes.append(1)
            else:
                sexes.append(0)
            # occupation: map the occupation string to an integer id
            occupations.append(occmap[line[3]])
            # first three zip code digits: section center facility (regional mail sorting centers)
            if line[4][0:3].isdigit():
                zipcodes.append(int(line[4][0:3]))
            else:
                # non-numeric zip prefixes bucket to 0
                zipcodes.append(0)

    # Persist each feature: categorical features as one-hot index vectors
    # (HotIndex with the category count), age as a dense matrix.
    loader.export_data(args.outpath + 'features_sex.index', loader.HotIndex(numpy.array(sexes), 2))
    loader.export_data(args.outpath + 'features_occ.index', loader.HotIndex(numpy.array(occupations), len(occlist) -1))
    loader.export_data(args.outpath + 'features_zip.index', loader.HotIndex(numpy.array(zipcodes), 1000))
    loader.export_data(args.outpath + 'features_age.mat', numpy.array(ages))

    # inspect processed data
    sex = loader.imatload(args.outpath + 'features_sex.index')
    print('sex:')
    print(sex.vec.shape, sex.dim)
    print(sex.vec[0:10])
    print('\n')

    occ = loader.imatload(args.outpath + 'features_occ.index')
    print('occ:')
    print(occ.vec.shape, occ.dim)
    print(occ.vec[0:10])
Example #8
0
# NOTE(review): `order`, `sparse`, `num_ratings`, `loader`, and `args` are
# defined above this excerpt — `sparse` is presumably a (num_ratings, 3)
# array of (user, item, rating) rows and `order` a permutation of its row
# indices; confirm against the surrounding file.
print(len(order))
random.shuffle(order)  # shuffle the ratings in place before splitting
sparse = sparse[order, :]
# time = loader.import_data(args.readpath + 'features_time.densetxt')
# time = time[order]

# 2.5% of the ratings go to each of the dev and test splits
split_size = int(round(num_ratings * .025))

# the max id doubles as the category count (the -1 below treats ids as 1-based)
num_users = sparse[:, 0].max()
num_items = sparse[:, 1].max()

# convert 1-based ids to 0-based indices; column 2 holds the rating values
rows = sparse[:, 0] - 1
cols = sparse[:, 1] - 1
vals = sparse[:, 2]

# dev split: first `split_size` shuffled ratings
loader.export_data(args.writepath + '500k/dev/features_user.index',
                   loader.HotIndex(rows[0:split_size], num_users))
loader.export_data(args.writepath + '500k/dev/features_item.index',
                   loader.HotIndex(cols[0:split_size], num_items))
loader.export_data(args.writepath + '500k/dev/labels_ratings.mat',
                   vals[0:split_size])

# test split: second `split_size` shuffled ratings
loader.export_data(args.writepath + '500k/test/features_user.index',
                   loader.HotIndex(rows[split_size:2 * split_size], num_users))
loader.export_data(args.writepath + '500k/test/features_item.index',
                   loader.HotIndex(cols[split_size:2 * split_size], num_items))
loader.export_data(args.writepath + '500k/test/labels_ratings.mat',
                   vals[split_size:2 * split_size])

# train split: the next 500000 ratings (the matching item/ratings exports
# presumably follow past the end of this excerpt)
loader.export_data(
    args.writepath + '500k/train/features_user.index',
    loader.HotIndex(rows[2 * split_size:2 * split_size + 500000], num_users))
        # NOTE(review): this fragment begins inside a with-block; `first`,
        # `infile`, `monthmap`, `month`, `year`, `genre`, `loader`, and
        # `args` are defined above the visible excerpt.
        first = False
        lines = infile.readlines()
        for line in lines:
            # each record looks like a '|'-delimited ml-100k u.item row —
            # TODO confirm against the input file
            line = line.split('|')
            # month released: middle token of a DD-MMM-YYYY date, mapped
            # through `monthmap` to an integer =====================
            date = line[2].split('-')
            month.append(monthmap[date[1]])
            # year released, stored as a float =====================
            year.append(float(int(date[2])))
            # remaining fields: binary genre flags, converted to floats
            genres = line[5:len(line)]
            for i in range(len(genres)):
                genres[i] = float(genres[i])
            genre.append(genres)

    # print(month_matrix.vec.shape, month_matrix.dim, genre_matrix.vec.shape, genre_matrix.dim, year_matrix.shape)
    # Persist: month as a one-hot index over 12 categories, year and the
    # genre flag matrix as dense matrices.
    loader.export_data(args.outpath + 'features_month.index', loader.HotIndex(numpy.array(month), 12))
    loader.export_data(args.outpath + 'features_year.mat', numpy.array(year))
    loader.export_data(args.outpath + 'features_genre.mat', numpy.array(genre))

    # inspect processed data
    month = loader.import_data(args.outpath + 'features_month.index')
    print('month:')
    print(month.vec.shape, month.dim)
    print(month.vec[0:20])
    print('\n')

    year = loader.import_data(args.outpath + 'features_year.mat')
    print('year:')
    print(year.shape)
    print(year[0:10])
    print('\n')
Example #10
0
# Shuffle the ratings, then carve out dev/test/train splits and export each
# split's user indices, item indices, and rating values.
# (`order`, `sparse`, `num_ratings`, `loader`, `args` come from code above.)
print(len(order))
random.shuffle(order)
sparse = sparse[order, :]

# ten percent of the ratings go to each of the dev and test splits
split_size = int(round(num_ratings * .1))

# ids are 1-based in the raw data, so the max id is the category count
num_users = sparse[:, 0].max()
num_items = sparse[:, 1].max()

rows = sparse[:, 0] - 1  # 0-based user indices
cols = sparse[:, 1] - 1  # 0-based item indices
vals = sparse[:, 2]      # rating values

# dev = first split_size rows, test = next split_size, train = the rest
splits = (
    ('dev/', slice(0, split_size)),
    ('test/', slice(split_size, split_size * 2)),
    ('train/', slice(split_size * 2, sparse.shape[0])),
)
for subdir, span in splits:
    loader.export_data(args.writepath + subdir + 'features_user.index',
                       loader.HotIndex(rows[span], num_users))
    loader.export_data(args.writepath + subdir + 'features_item.index',
                       loader.HotIndex(cols[span], num_items))
    loader.export_data(args.writepath + subdir + 'labels_ratings.mat',
                       vals[span])

Example #11
0
# NOTE(review): `loader`, `norm`, `genres`, `dev_item`, `test_item`, and
# `train_item` are defined above this excerpt; the *_item objects appear to
# be HotIndex instances whose `.vec` holds item indices — confirm.
words = loader.import_data(
    '/home/aarontuor/data/ml100k/item/features_bin_doc_term.mat')

# NOTE(review): dividing by the row norm twice normalizes by the SQUARED
# norm; if unit-norm rows were intended, a single factor would do — confirm
# intent. Also `genre_dist` is never used in this excerpt.
genre_dist = genres / (norm(genres, axis=1, keepdims=True) *
                       norm(genres, axis=1, keepdims=True))
# gather per-split genre and bag-of-words rows via each split's item indices
devgenre = genres[dev_item.vec]
testgenre = genres[test_item.vec]
traingenre = genres[train_item.vec]
devwords = words[dev_item.vec]
testwords = words[test_item.vec]
trainwords = words[train_item.vec]
print(devgenre.shape)
print(testgenre.shape)
print(traingenre.shape)

print(devwords.shape)
print(testwords.shape)
print(trainwords.shape)

# export genre labels and word features for each split
loader.export_data('/home/aarontuor/data/ml100k/dev/labels_genre.mat',
                   devgenre)
loader.export_data('/home/aarontuor/data/ml100k/train/labels_genre.mat',
                   traingenre)
loader.export_data('/home/aarontuor/data/ml100k/test/labels_genre.mat',
                   testgenre)
loader.export_data('/home/aarontuor/data/ml100k/dev/features_words.mat',
                   devwords)
loader.export_data('/home/aarontuor/data/ml100k/train/features_words.mat',
                   trainwords)
loader.export_data('/home/aarontuor/data/ml100k/test/features_words.mat',
                   testwords)