# NOTE(review): whitespace-collapsed fragment of a user-feature preprocessing
# script. The first part reads fields of a split record `line` and appends to
# `ages` (field 1 as float), `sexes` (1 when field 2 is 'M', else 0),
# `occupations` (field 3 mapped through `occmap`) and `zipcodes` (first three
# characters of field 4 as int, or 0 when they are not all digits). The second
# part exports the collected features via loader.export_data and reloads the
# sex and occupation indices to print their shape, dim and first ten entries.
# The loop header that binds `line` is not visible here -- presumably a
# per-record loop over the input file; TODO confirm before restructuring.
# age================================== ages.append(float(line[1])) # sex================================== if line[2] == 'M': sexes.append(1) else: sexes.append(0) # occupation================================== occupations.append(occmap[line[3]]) # first three zip code digits: section center facility (regional mail sorting centers) if line[4][0:3].isdigit(): zipcodes.append(int(line[4][0:3])) else: zipcodes.append(0) loader.export_data(args.outpath + 'features_sex.index', loader.HotIndex(numpy.array(sexes), 2)) loader.export_data(args.outpath + 'features_occ.index', loader.HotIndex(numpy.array(occupations), len(occlist) -1)) loader.export_data(args.outpath + 'features_zip.index', loader.HotIndex(numpy.array(zipcodes), 1000)) loader.export_data(args.outpath + 'features_age.mat', numpy.array(ages)) # inspect processed data sex = loader.imatload(args.outpath + 'features_sex.index') print('sex:') print(sex.vec.shape, sex.dim) print(sex.vec[0:10]) print('\n') occ = loader.imatload(args.outpath + 'features_occ.index') print('occ:') print(occ.vec.shape, occ.dim) print(occ.vec[0:10])
def test_HotIndex_to_one_hot():
    """A HotIndex built from a CSR one-hot matrix converts back to that matrix."""
    dense = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    hot_index = loader.HotIndex(sps.csr_matrix(dense))
    round_trip = hot_index.hot().toarray()
    assert np.array_equal(dense, round_trip)
# Shuffle the record order, then reorder the rating rows accordingly so the
# dev / test / train partitions below are drawn from a random permutation.
random.shuffle(order)
sparse = sparse[order, :]

# Timestamp features are currently disabled:
# time = loader.import_data(args.readpath + 'features_time.densetxt')
# time = time[order]

# Dev and test each receive 2.5% of the ratings.
split_size = int(round(num_ratings * 0.025))

# Column 0 feeds the user index, column 1 the item index, column 2 the labels.
user_col = sparse[:, 0]
item_col = sparse[:, 1]
num_users = user_col.max()
num_items = item_col.max()
# Shift ids down by one (presumably 1-based ids -> 0-based indices; confirm).
rows = user_col - 1
cols = item_col - 1
vals = sparse[:, 2]

dev_span = slice(0, split_size)
test_span = slice(split_size, 2 * split_size)
# The training partition is capped at 500000 ratings after dev and test.
train_span = slice(2 * split_size, 2 * split_size + 500000)

# --- dev ---
loader.export_data(
    args.writepath + '500k/dev/features_user.index',
    loader.HotIndex(rows[dev_span], num_users),
)
loader.export_data(
    args.writepath + '500k/dev/features_item.index',
    loader.HotIndex(cols[dev_span], num_items),
)
loader.export_data(
    args.writepath + '500k/dev/labels_ratings.mat',
    vals[dev_span],
)

# --- test ---
loader.export_data(
    args.writepath + '500k/test/features_user.index',
    loader.HotIndex(rows[test_span], num_users),
)
loader.export_data(
    args.writepath + '500k/test/features_item.index',
    loader.HotIndex(cols[test_span], num_items),
)
loader.export_data(
    args.writepath + '500k/test/labels_ratings.mat',
    vals[test_span],
)

# --- train ---
loader.export_data(
    args.writepath + '500k/train/features_user.index',
    loader.HotIndex(rows[train_span], num_users),
)
def test_HotIndex_step_slicing():
    """Slicing 0:4 with a step of 2 equals a HotIndex of rows 0 and 2."""
    full = loader.HotIndex(np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]))
    every_other = full[0:4:2]
    expected = loader.HotIndex(np.array([
        [1, 0, 0],
        [0, 1, 0],
    ]))
    assert every_other == expected
def test_HotIndex_indexing():
    """Integer indexing returns the hot position of that row (row 3 -> 2)."""
    one_hot_rows = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    hot_index = loader.HotIndex(one_hot_rows)
    last_entry = hot_index[3]
    assert last_entry == 2
def test_HotIndex_slicing_right_index():
    """A slice with only a stop bound covering every row equals the original."""
    one_hot_rows = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    hot_index = loader.HotIndex(one_hot_rows)
    up_to_four = hot_index[:4]
    assert up_to_four == hot_index
def test_HotIndex_slicing_two_indices():
    """Slicing 1:3 equals a HotIndex built from rows 1 and 2 only."""
    full = loader.HotIndex(np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]))
    middle = full[1:3]
    expected = loader.HotIndex(np.array([
        [1, 0, 0],
        [0, 1, 0],
    ]))
    assert middle == expected
def test_dim():
    """The dim of a HotIndex built from a 4x3 one-hot matrix is 3."""
    one_hot_rows = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    hot_index = loader.HotIndex(one_hot_rows)
    dimension = hot_index.dim
    assert dimension == 3
def test_eq():
    """Two HotIndex objects built from equal one-hot matrices compare equal."""
    left = loader.HotIndex(np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]))
    right = loader.HotIndex(np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]))
    assert left == right
def test_vec():
    """The vec attribute holds the hot position of each row: [0, 0, 1, 2]."""
    one_hot_rows = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    hot_index = loader.HotIndex(one_hot_rows)
    expected_positions = np.array([0, 0, 1, 2])
    assert np.array_equal(hot_index.vec, expected_positions)
def test_HotIndex_iteration():
    """Iterating a HotIndex yields the hot position of each row in order."""
    hot_index = loader.HotIndex(np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]))
    expected_positions = [0, 0, 1, 2]
    for position, value in enumerate(hot_index):
        assert expected_positions[position] == value
def test_init_from_vec():
    """A HotIndex built from an index vector equals one built from the matrix."""
    from_matrix = loader.HotIndex(np.array([
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]))
    from_vector = loader.HotIndex(np.array([0, 0, 1, 2]))
    assert from_vector == from_matrix
first = False
lines = infile.readlines()

# Parse one '|'-separated record per line.
for line in lines:
    line = line.split('|')

    # Field 2 is a '-'-separated release date: part 1 is the month key
    # (looked up in monthmap), part 2 is the year.
    date = line[2].split('-')
    month.append(monthmap[date[1]])
    year.append(float(int(date[2])))

    # Every field from index 5 onward is converted to a float genre value.
    genres = [float(flag) for flag in line[5:]]
    genre.append(genres)

# Write the collected features to disk.
month_path = args.outpath + 'features_month.index'
year_path = args.outpath + 'features_year.mat'
loader.export_data(month_path, loader.HotIndex(numpy.array(month), 12))
loader.export_data(year_path, numpy.array(year))
loader.export_data(args.outpath + 'features_genre.mat', numpy.array(genre))

# Read the exported files back and print a short summary of each.
month = loader.import_data(month_path)
print('month:')
print(month.vec.shape, month.dim)
print(month.vec[0:20])
print('\n')

year = loader.import_data(year_path)
print('year:')
print(year.shape)
print(year[0:10])
print('\n')
print(len(order))

# Shuffle the record order, then reorder the rating rows accordingly so the
# dev / test / train partitions below are drawn from a random permutation.
random.shuffle(order)
sparse = sparse[order, :]

# Timestamp features are currently disabled (see the commented exports below):
# time = loader.import_data(args.readpath + 'features_time.densetxt')
# time = time[order]

# Dev and test each receive 10% of the ratings; train gets the remainder.
split_size = int(round(num_ratings * 0.1))

# Column 0 feeds the user index, column 1 the item index, column 2 the labels.
user_col = sparse[:, 0]
item_col = sparse[:, 1]
num_users = user_col.max()
num_items = item_col.max()
# Shift ids down by one (presumably 1-based ids -> 0-based indices; confirm).
rows = user_col - 1
cols = item_col - 1
vals = sparse[:, 2]

dev_span = slice(0, split_size)
test_span = slice(split_size, split_size * 2)
train_span = slice(split_size * 2, sparse.shape[0])

# --- dev ---
# loader.export_data(args.writepath + 'dev/features_time.mat', time[0:split_size])
loader.export_data(
    args.writepath + 'dev/features_user.index',
    loader.HotIndex(rows[dev_span], num_users),
)
loader.export_data(
    args.writepath + 'dev/features_item.index',
    loader.HotIndex(cols[dev_span], num_items),
)
loader.export_data(
    args.writepath + 'dev/labels_ratings.mat',
    vals[dev_span],
)

# --- test ---
# loader.export_data(args.writepath + 'test/features_time.mat', time[split_size:split_size*2])
loader.export_data(
    args.writepath + 'test/features_user.index',
    loader.HotIndex(rows[test_span], num_users),
)
loader.export_data(
    args.writepath + 'test/features_item.index',
    loader.HotIndex(cols[test_span], num_items),
)
loader.export_data(
    args.writepath + 'test/labels_ratings.mat',
    vals[test_span],
)

# --- train ---
# loader.export_data(args.writepath + 'train/features_time.mat', time[split_size*2:time.shape[0]])
loader.export_data(
    args.writepath + 'train/features_user.index',
    loader.HotIndex(rows[train_span], num_users),
)
loader.export_data(
    args.writepath + 'train/features_item.index',
    loader.HotIndex(cols[train_span], num_items),
)
loader.export_data(
    args.writepath + 'train/labels_ratings.mat',
    vals[train_span],
)