Beispiel #1
0
            # age==================================
            ages.append(float(line[1]))
            # sex==================================
            if line[2] == 'M':
                sexes.append(1)
            else:
                sexes.append(0)
            # occupation==================================
            occupations.append(occmap[line[3]])
            # first three zip code digits: section center facility (regional mail sorting centers)
            if line[4][0:3].isdigit():
                zipcodes.append(int(line[4][0:3]))
            else:
                zipcodes.append(0)

    loader.export_data(args.outpath + 'features_sex.index', loader.HotIndex(numpy.array(sexes), 2))
    loader.export_data(args.outpath + 'features_occ.index', loader.HotIndex(numpy.array(occupations), len(occlist) -1))
    loader.export_data(args.outpath + 'features_zip.index', loader.HotIndex(numpy.array(zipcodes), 1000))
    loader.export_data(args.outpath + 'features_age.mat', numpy.array(ages))

    # inspect processed data
    sex = loader.imatload(args.outpath + 'features_sex.index')
    print('sex:')
    print(sex.vec.shape, sex.dim)
    print(sex.vec[0:10])
    print('\n')

    occ = loader.imatload(args.outpath + 'features_occ.index')
    print('occ:')
    print(occ.vec.shape, occ.dim)
    print(occ.vec[0:10])
Beispiel #2
0
def test_HotIndex_to_one_hot():
    """A HotIndex built from a CSR one-hot matrix converts back to that matrix via hot()."""
    dense = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    index = loader.HotIndex(sps.csr_matrix(dense))
    # hot() is expected to return a sparse matrix; densify it for comparison.
    round_tripped = index.hot().toarray()
    assert np.array_equal(dense, round_tripped)
Beispiel #3
0
# Randomly permute the rating rows before carving out the splits.
random.shuffle(order)
sparse = sparse[order, :]
# Time features are currently disabled:
# time = loader.import_data(args.readpath + 'features_time.densetxt')
# time = time[order]

# Dev and test each take 2.5% of the ratings.
split_size = int(round(num_ratings * 0.025))

# One-hot dimensions are taken from the largest value found in columns 0 and 1.
num_users = sparse[:, 0].max()
num_items = sparse[:, 1].max()

# Columns 0 and 1 are shifted down by one
# (presumably converting 1-based ids to 0-based indices -- TODO confirm).
rows = sparse[:, 0] - 1
cols = sparse[:, 1] - 1
vals = sparse[:, 2]

# Row ranges of the three splits: dev, then test, then up to 500000 training rows.
dev_part = slice(0, split_size)
test_part = slice(split_size, 2 * split_size)
train_part = slice(2 * split_size, 2 * split_size + 500000)

# --- dev ---
loader.export_data(
    args.writepath + '500k/dev/features_user.index',
    loader.HotIndex(rows[dev_part], num_users),
)
loader.export_data(
    args.writepath + '500k/dev/features_item.index',
    loader.HotIndex(cols[dev_part], num_items),
)
loader.export_data(
    args.writepath + '500k/dev/labels_ratings.mat',
    vals[dev_part],
)

# --- test ---
loader.export_data(
    args.writepath + '500k/test/features_user.index',
    loader.HotIndex(rows[test_part], num_users),
)
loader.export_data(
    args.writepath + '500k/test/features_item.index',
    loader.HotIndex(cols[test_part], num_items),
)
loader.export_data(
    args.writepath + '500k/test/labels_ratings.mat',
    vals[test_part],
)

# --- train ---
loader.export_data(
    args.writepath + '500k/train/features_user.index',
    loader.HotIndex(rows[train_part], num_users),
)
Beispiel #4
0
def test_HotIndex_step_slicing():
    """Slicing with a step of 2 equals a HotIndex built from rows 0 and 2."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    every_other = loader.HotIndex(one_hot_rows)[0:4:2]
    expected = loader.HotIndex(np.array([[1, 0, 0], [0, 1, 0]]))
    assert every_other == expected
Beispiel #5
0
def test_HotIndex_indexing():
    """Integer indexing yields the hot column of that row (row 3 -> column 2)."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    index = loader.HotIndex(one_hot_rows)
    last_entry = index[3]
    assert last_entry == 2
Beispiel #6
0
def test_HotIndex_slicing_right_index():
    """A slice with only a stop bound covering every row equals the original."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    index = loader.HotIndex(one_hot_rows)
    everything = index[:4]
    assert everything == index
Beispiel #7
0
def test_HotIndex_slicing_two_indices():
    """A start:stop slice equals a HotIndex built from the selected rows (1 and 2)."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    middle = loader.HotIndex(one_hot_rows)[1:3]
    expected = loader.HotIndex(np.array([[1, 0, 0], [0, 1, 0]]))
    assert middle == expected
Beispiel #8
0
def test_dim():
    """dim is 3 for a HotIndex built from a three-column one-hot matrix."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    index = loader.HotIndex(one_hot_rows)
    assert index.dim == 3
Beispiel #9
0
def test_eq():
    """Two HotIndex objects built from equal one-hot matrices compare equal."""
    one_hot_rows = [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]
    # Build each operand from its own array so equality is by value, not identity.
    first = loader.HotIndex(np.array(one_hot_rows))
    second = loader.HotIndex(np.array(one_hot_rows))
    assert first == second
Beispiel #10
0
def test_vec():
    """vec holds, per row, the column position of the hot entry."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    index = loader.HotIndex(one_hot_rows)
    expected_positions = np.array([0, 0, 1, 2])
    assert np.array_equal(index.vec, expected_positions)
Beispiel #11
0
def test_HotIndex_iteration():
    """Iterating a HotIndex yields the hot column of each row, in row order."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    index = loader.HotIndex(one_hot_rows)
    expected_positions = [0, 0, 1, 2]
    for position, hot_column in enumerate(index):
        assert expected_positions[position] == hot_column
Beispiel #12
0
def test_init_from_vec():
    """A HotIndex built from an index vector equals one built from the matching one-hot matrix."""
    one_hot_rows = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    from_matrix = loader.HotIndex(one_hot_rows)
    from_vector = loader.HotIndex(np.array([0, 0, 1, 2]))
    assert from_vector == from_matrix
Beispiel #13
0
        first = False
        lines = infile.readlines()
        for line in lines:
            line = line.split('|')
            # month released==================================
            date = line[2].split('-')
            month.append(monthmap[date[1]])
            # year released==================================
            year.append(float(int(date[2])))
            genres = line[5:len(line)]
            for i in range(len(genres)):
                genres[i] = float(genres[i])
            genre.append(genres)

    # print(month_matrix.vec.shape, month_matrix.dim, genre_matrix.vec.shape, genre_matrix.dim, year_matrix.shape)
    loader.export_data(args.outpath + 'features_month.index', loader.HotIndex(numpy.array(month), 12))
    loader.export_data(args.outpath + 'features_year.mat', numpy.array(year))
    loader.export_data(args.outpath + 'features_genre.mat', numpy.array(genre))

    # inspect processed data
    month = loader.import_data(args.outpath + 'features_month.index')
    print('month:')
    print(month.vec.shape, month.dim)
    print(month.vec[0:20])
    print('\n')

    year = loader.import_data(args.outpath + 'features_year.mat')
    print('year:')
    print(year.shape)
    print(year[0:10])
    print('\n')
Beispiel #14
0
print(len(order))
# Randomly permute the rating rows before carving out the splits.
random.shuffle(order)
sparse = sparse[order, :]
# Time features are currently disabled:
# time = loader.import_data(args.readpath + 'features_time.densetxt')
# time = time[order]

# Dev and test each take 10% of the ratings; train receives the remainder.
split_size = int(round(num_ratings * 0.1))

# One-hot dimensions are taken from the largest value found in columns 0 and 1.
num_users = sparse[:, 0].max()
num_items = sparse[:, 1].max()

# Columns 0 and 1 are shifted down by one
# (presumably converting 1-based ids to 0-based indices -- TODO confirm).
rows = sparse[:, 0] - 1
cols = sparse[:, 1] - 1
vals = sparse[:, 2]

# Row ranges of the three splits.
dev_part = slice(0, split_size)
test_part = slice(split_size, split_size * 2)
train_part = slice(split_size * 2, sparse.shape[0])

# --- dev ---
# loader.export_data(args.writepath + 'dev/features_time.mat', time[0:split_size])
loader.export_data(
    args.writepath + 'dev/features_user.index',
    loader.HotIndex(rows[dev_part], num_users),
)
loader.export_data(
    args.writepath + 'dev/features_item.index',
    loader.HotIndex(cols[dev_part], num_items),
)
loader.export_data(args.writepath + 'dev/labels_ratings.mat', vals[dev_part])

# --- test ---
# loader.export_data(args.writepath + 'test/features_time.mat', time[split_size:split_size*2])
loader.export_data(
    args.writepath + 'test/features_user.index',
    loader.HotIndex(rows[test_part], num_users),
)
loader.export_data(
    args.writepath + 'test/features_item.index',
    loader.HotIndex(cols[test_part], num_items),
)
loader.export_data(args.writepath + 'test/labels_ratings.mat', vals[test_part])

# --- train ---
# loader.export_data(args.writepath + 'train/features_time.mat', time[split_size*2:time.shape[0]])
loader.export_data(
    args.writepath + 'train/features_user.index',
    loader.HotIndex(rows[train_part], num_users),
)
loader.export_data(
    args.writepath + 'train/features_item.index',
    loader.HotIndex(cols[train_part], num_items),
)
loader.export_data(args.writepath + 'train/labels_ratings.mat', vals[train_part])