Code Example #1
File: utils.py Project: twkillian/endomondo_fun
def load_data(args):
    raw_data, x_data, y_data = get_npy_data(args)
    train_idx, val_idx, test_idx, context = get_metadata(args)
    train_idx, val_idx, test_idx = map_data(train_idx, val_idx, test_idx,
                                            context, raw_data)

    if args.task == 'forecasting':
        dataset = TSData(x_data, y_data, args.step)
    elif args.task == 'prediction':
        dataset = ClassData(x_data, y_data)
    else:
        raise ValueError('Unknown task: %s' % args.task)  # guard against NameError below

    trainset = Subset(dataset, train_idx)
    valset = Subset(dataset, val_idx)
    testset = Subset(dataset, test_idx)

    trainloader = DataLoader(trainset, batch_size=args.batch_size)
    valloader = DataLoader(valset, batch_size=args.eval_batch_size)
    testloader = DataLoader(testset, batch_size=args.eval_batch_size)
    return trainloader, valloader, testloader
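
A hedged usage sketch for load_data above: the helpers (get_npy_data, get_metadata, map_data) and the TSData/ClassData classes live in the same project and are assumed importable; every field value below is illustrative, not a project default.

# Hypothetical usage of load_data; all argument values are assumptions.
from argparse import Namespace

args = Namespace(
    task='forecasting',    # or 'prediction'
    step=10,               # forecasting horizon consumed by TSData
    batch_size=32,
    eval_batch_size=64,
)

trainloader, valloader, testloader = load_data(args)
for batch in trainloader:
    pass  # feed each batch to the model

Code Example #2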
def load_official_trainvaltest_split(dataset, testing=False):
    """
    Loads the official train/test split and uses 20% of the training samples for validation.
    For each split, computes 1-of-num_classes labels. Also computes the training
    adjacency matrix. Assumes flattening happens everywhere in row-major fashion.
    """

    sep = '\t'

    # Check if files exist and download otherwise
    files = ['/u1.base', '/u1.test', '/u.item', '/u.user']
    fname = dataset
    data_dir = 'data/' + fname

    download_dataset(fname, files, data_dir)

    dtypes = {
        'u_nodes': np.int32,
        'v_nodes': np.int32,
        'ratings': np.float32,
        'timestamp': np.float64
    }

    filename_train = 'data/' + dataset + '/u1.base'
    filename_test = 'data/' + dataset + '/u1.test'

    data_train = pd.read_csv(
        filename_train,
        sep=sep,
        header=None,
        names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'],
        dtype=dtypes)

    data_test = pd.read_csv(
        filename_test,
        sep=sep,
        header=None,
        names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'],
        dtype=dtypes)

    # pandas removed DataFrame.as_matrix(); .values is the supported replacement
    data_array_train = np.array(data_train.values.tolist())
    data_array_test = np.array(data_test.values.tolist())

    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
    ratings = data_array[:, 2].astype(dtypes['ratings'])

    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

    u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(
        np.int64), v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.float64)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {
        r: i
        for i, r in enumerate(np.sort(np.unique(ratings)).tolist())
    }

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    for i in range(len(u_nodes)):
        assert (labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_train = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    for i in range(len(ratings)):
        assert (labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    idx_nonzero_train = idx_nonzero[0:num_train + num_val]
    idx_nonzero_test = idx_nonzero[num_train + num_val:]

    pairs_nonzero_train = pairs_nonzero[0:num_train + num_val]
    pairs_nonzero_test = pairs_nonzero[num_train + num_val:]

    # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))  # list() so np.random.shuffle can permute it in place
    np.random.seed(42)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test],
                                   axis=0)

    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val:num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val:]

    assert (len(test_idx) == num_test)

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val:]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    rating_mx_train = sp.csr_matrix(
        rating_mx_train.reshape(num_users, num_items))

    class_values = np.sort(np.unique(ratings))

    if dataset == 'ml_100k':

        # movie features (genres)
        sep = r'|'
        movie_file = 'data/' + dataset + '/u.item'
        movie_headers = [
            'movie id', 'movie title', 'release date', 'video release date',
            'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
            'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
            'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
            'Thriller', 'War', 'Western'
        ]
        movie_df = pd.read_csv(movie_file,
                               sep=sep,
                               header=None,
                               names=movie_headers,
                               engine='python')

        genre_headers = movie_df.columns.values[6:]
        num_genres = genre_headers.shape[0]

        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(),
                                   movie_df[genre_headers].values.tolist()):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                v_features[v_dict[movie_id], :] = g_vec

        # user features

        sep = r'|'
        users_file = 'data/' + dataset + '/u.user'
        users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
        users_df = pd.read_csv(users_file,
                               sep=sep,
                               header=None,
                               names=users_headers,
                               engine='python')

        occupation = set(users_df['occupation'].values.tolist())

        age = users_df['age'].values
        age_max = age.max()

        gender_dict = {'M': 0., 'F': 1.}
        occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

        num_feats = 2 + len(occupation_dict)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user id']
            if u_id in u_dict.keys():
                # age
                u_features[u_dict[u_id], 0] = row['age'] / float(age_max)  # np.float alias was removed from NumPy
                # gender
                u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
                # occupation
                u_features[u_dict[u_id],
                           occupation_dict[row['occupation']]] = 1.

    elif dataset == 'ml_1m':

        # load movie features
        movies_file = 'data/' + dataset + '/movies.dat'

        movies_headers = ['movie_id', 'title', 'genre']
        movies_df = pd.read_csv(movies_file,
                                sep=sep,
                                header=None,
                                names=movies_headers,
                                engine='python')

        # extracting all genres
        genres = []
        for s in movies_df['genre'].values:
            genres.extend(s.split('|'))

        genres = list(set(genres))
        num_genres = len(genres)

        genres_dict = {g: idx for idx, g in enumerate(genres)}

        # creating 0 or 1 valued features for all genres
        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, s in zip(movies_df['movie_id'].values.tolist(),
                               movies_df['genre'].values.tolist()):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                gen = s.split('|')
                for g in gen:
                    v_features[v_dict[movie_id], genres_dict[g]] = 1.

        # load user features
        users_file = 'data/' + dataset + '/users.dat'
        users_headers = ['user_id', 'gender', 'age', 'occupation', 'zip-code']
        users_df = pd.read_csv(users_file,
                               sep=sep,
                               header=None,
                               names=users_headers,
                               engine='python')

        # extracting all features
        cols = users_df.columns.values[1:]

        cntr = 0
        feat_dicts = []
        for header in cols:
            d = dict()
            feats = np.unique(users_df[header].values).tolist()
            d.update({f: i for i, f in enumerate(feats, start=cntr)})
            feat_dicts.append(d)
            cntr += len(d)

        num_feats = sum(len(d) for d in feat_dicts)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user_id']
            if u_id in u_dict.keys():
                for k, header in enumerate(cols):
                    u_features[u_dict[u_id], feat_dicts[k][row[header]]] = 1.
    else:
        raise ValueError('Invalid dataset option %s' % dataset)

    u_features = sp.csr_matrix(u_features)
    v_features = sp.csr_matrix(v_features)

    print("User features shape: " + str(u_features.shape))
    print("Item features shape: " + str(v_features.shape))

    return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
        val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values
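
The "row-major fashion" assumption in the docstring is what makes the u * num_items + v indexing above valid: flattening the (num_users, num_items) label matrix in C order sends entry (u, v) to flat position u * num_items + v. A small self-contained check:

import numpy as np

num_users, num_items = 4, 5
labels = np.arange(num_users * num_items).reshape(num_users, num_items)

u, v = 2, 3
flat_idx = u * num_items + v   # same formula as idx_nonzero above
flat = labels.reshape(-1)      # NumPy reshapes in row-major (C) order by default

assert flat[flat_idx] == labels[u, v]
# recovering the pair from a flat index
assert divmod(flat_idx, num_items) == (u, v)
assert np.unravel_index(flat_idx, (num_users, num_items)) == (u, v)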
Code Example #3
File: preprocessing.py Project: itayl13/Seminar
def load_data_books(testing=False):
    if not os.path.exists(
            os.path.join('data', 'book_crossing_edited',
                         'BX-Book-Ratings_filtered.csv')):
        edit_book_files()
    dtypes = {'u_nodes': np.int32, 'v_nodes': str, 'ratings': np.int32}  # np.str alias was removed from NumPy

    # pass the path directly so pandas manages (and closes) the file handle
    matrix_source = np.array(
        pd.read_csv(
            os.path.join('data', 'book_crossing_edited',
                         'BX-Book-Ratings_filtered.csv')))

    np.random.seed(42)
    test_indices = np.random.choice(np.arange(matrix_source.shape[0]),
                                    matrix_source.shape[0] // 10,
                                    replace=False)
    mask = np.array([(i in test_indices)
                     for i in np.arange(matrix_source.shape[0])])

    data_train = matrix_source[~mask, :]
    data_test = matrix_source[mask, :]

    data_array_train = data_train.tolist()
    data_array_train = np.array(data_array_train)
    data_array_test = data_test.tolist()
    data_array_test = np.array(data_array_test)

    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
    ratings = data_array[:, 2].astype(dtypes['ratings'])

    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

    u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(
        np.int64), v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.int32)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {
        r: i
        for i, r in enumerate(np.sort(np.unique(ratings)).tolist())
    }

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    for i in range(len(u_nodes)):
        assert (labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_train = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    for i in range(len(ratings)):
        assert (labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    idx_nonzero_train = idx_nonzero[0:num_train + num_val]
    idx_nonzero_test = idx_nonzero[num_train + num_val:]

    pairs_nonzero_train = pairs_nonzero[0:num_train + num_val]
    pairs_nonzero_test = pairs_nonzero[num_train + num_val:]

    # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))  # list() so np.random.shuffle can permute it in place
    np.random.seed(42)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test],
                                   axis=0)

    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val:num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val:]

    assert (len(test_idx) == num_test)

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val:]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    rating_mx_train = sp.csr_matrix(
        rating_mx_train.reshape(num_users, num_items))

    class_values = np.sort(np.unique(ratings))

    # Side information features
    # book features
    book_df = pd.read_csv(
        os.path.join('data', 'book_crossing_edited',
                     'BX-Books_filtered.csv'))

    author_dict = {
        f: i
        for i, f in enumerate(set(book_df['Book-Author'].values.tolist()),
                              start=2)
    }
    year = book_df['Year-Of-Publication'].values
    year_max = year.max()

    num_book_feats = 1 + len(
        author_dict)  # Year of publication (normed), Author (binary by name).

    v_features = np.zeros((num_items, num_book_feats), dtype=np.float32)
    for _, row in book_df.iterrows():
        v_id = row['ISBN']
        # check if book_id was listed in ratings file and therefore in mapping dictionary
        if v_id in v_dict.keys():
            # year
            v_features[v_dict[v_id],
                       0] = row['Year-Of-Publication'] / float(year_max)
            # author
            v_features[v_dict[v_id], author_dict[row['Book-Author']]] = 1.

    # user features
    users_df = pd.read_csv(
        os.path.join('data', 'book_crossing_edited',
                     'BX-Users_filtered.csv'))

    age = users_df['Age'].values
    age_max = age.max()

    u_features = np.zeros((num_users, 1), dtype=np.float32)
    for _, row in users_df.iterrows():
        u_id = row['User-ID']
        if u_id in u_dict.keys():
            u_features[u_dict[u_id], 0] = row['Age'] / float(age_max)

    u_features = sp.csr_matrix(u_features)
    v_features = sp.csr_matrix(v_features)

    print("User features shape: " + str(u_features.shape))
    print("Item features shape: " + str(v_features.shape))

    return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
           val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values
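
The boolean test mask above runs a Python-level membership test per row, which is O(rows x |test_indices|); np.isin computes the same mask vectorized. A minimal equivalent sketch (n_rows stands in for matrix_source.shape[0]):

import numpy as np

np.random.seed(42)
n_rows = 1000
test_indices = np.random.choice(np.arange(n_rows), n_rows // 10, replace=False)

# vectorized equivalent of: np.array([(i in test_indices) for i in np.arange(n_rows)])
mask = np.isin(np.arange(n_rows), test_indices)

assert mask.sum() == n_rows // 10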
Code Example #4
def new_train_split():
    # Tell numpy to print all elements of an array instead of truncating
    np.set_printoptions(threshold=np.inf)

    # Convert the data once before feeding it to the GCN; build the dataframe manually
    u_nodes, v_nodes, ratings = [], [], []
    i = 0
    # Note: switch to the bipartite-graph output here
    #with open('C:/Users/Administrator/Desktop/HybridRecommendGCN/gcn/toGcn.csv', 'r') as f:
    # with open('gcn/toGcn.csv', 'r') as f:
    with open('file_saved/toGcn.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            u_nodes.append(int(row[0]))
            v_nodes.append(int(row[1]))
            ratings.append(int(row[2]))

    # Build ID-mapping dictionaries: map long IDs to dense integers starting at 0
    uSuperDict = {r: i for i, r in enumerate(list(set(u_nodes)))}
    vSuperDict = {r: i for i, r in enumerate(list(set(v_nodes)))}

    # Build the reverse user dictionary, mapping the integers back to IDs
    u_listKey = []
    u_listValue = []
    for key in uSuperDict:
        u_listKey.append(uSuperDict[key])
        u_listValue.append(key)

    u_dictr = dict(zip(u_listKey, u_listValue))

    # Build the reverse course dictionary
    v_listKey = []
    v_listValue = []
    for key in vSuperDict:
        v_listKey.append(vSuperDict[key])
        v_listValue.append(key)
    v_dictr = dict(zip(v_listKey, v_listValue))

    # Save the reverse dictionaries
    np.save('u_dictr.npy', u_dictr)
    np.save('v_dictr.npy', v_dictr)

    # Extract the mapped IDs to use as the system input
    new_u_nodes, new_v_nodes = [], []
    for uid in u_nodes:
        new_u_nodes.append(uSuperDict[uid])
    for vid in v_nodes:
        new_v_nodes.append(vSuperDict[vid])
    u_nodes, v_nodes = new_u_nodes, new_v_nodes

    data_dict = {
        'u_nodes': np.int64(u_nodes),
        'v_nodes': np.int64(v_nodes),
        'ratings': np.float32(ratings)
    }
    # Rebuild the rating table from the converted IDs
    data_array = pd.DataFrame(data=data_dict)

    # Convert to a 2-D array of triples, each row [rating, UID, VID].
    # Select the columns explicitly so the layout no longer depends on the
    # pandas version (the removed as_matrix() relied on dict-key column order).
    data_array = data_array[['ratings', 'u_nodes', 'v_nodes']].values
    # print(data_array)

    # Define the dtype dictionary
    dtypes = {'u_nodes': np.int64, 'v_nodes': np.int64, 'ratings': np.float32}

    # Split out three vectors: user IDs, item IDs, ratings
    u_nodes_ratings = data_array[:, 1].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 2].astype(dtypes['v_nodes'])
    ratings = data_array[:, 0].astype(dtypes['ratings'])
    # print(u_nodes_ratings, v_nodes_ratings, ratings)

    # Count the number of users
    # These dicts duplicate the ones built above and are unused; slated for removal
    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    # print(u_nodes_ratings, u_dict)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)
    print("num_users = {}".format(num_users))
    print("num_item = {}".format(num_items))

    # Convert the data types
    u_nodes_ratings = u_nodes_ratings.astype(np.int64)
    v_nodes_ratings = v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.float64)

    # Use the converted IDs as the input
    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    # Assume every rating level has at least one user-course interaction
    # Deduplicate and sort to extract the rating levels
    rating_dict = {
        r: i
        for i, r in enumerate(np.sort(np.unique(ratings)).tolist())
    }
    # Initialize labels: a num_users x num_items matrix filled with neutral_rating
    # labels is the data source for the later train/test split
    neutral_rating = -1
    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    # Assign values from ratings to build the rating matrix
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    # Verify the values match
    for i in range(len(u_nodes)):
        assert (labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    # Flatten into a vector
    labels = labels.reshape([-1])
    print(labels.shape)

    # Split off training and validation sets; the validation set is 1/5 of the data
    num_train = data_array.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    # Create the user-course pairs
    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    # Position of each pair once the matrix is flattened into a vector, i.e. its index into labels
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    # Verify the conversion is correct
    for i in range(len(ratings)):
        assert (labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    # Keep the training-set indices and pairs
    idx_nonzero_train = idx_nonzero[0:num_train]
    # print(idx_nonzero_train.shape)
    pairs_nonzero_train = pairs_nonzero[0:num_train]

    # Shuffle the training set: the user-course pairs, and with them the rating positions in labels
    rand_idx = list(range(len(idx_nonzero_train)))  # list() so np.random.shuffle can permute it in place
    # print(rand_idx)
    np.random.seed(42)
    np.random.shuffle(rand_idx)
    # print(idx_nonzero_train)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    # print(idx_nonzero_train)
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    # Merge the shuffled training set and the test set back into one collection
    # With the current handling the test set is empty, so this is just the shuffled training set
    idx_nonzero = idx_nonzero_train
    pairs_nonzero = pairs_nonzero_train

    # Take the positions in labels for the validation/training elements, plus their user-course pairs
    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val:num_train + num_val]

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]

    # Transpose to split users and courses into separate vectors
    u_train_idx, v_train_idx = train_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()

    # Apply the same split to the labels vector that stores the ratings
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    # print(train_labels)

    # Build the rating vector and reshape it into a matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.

    # Compress to a sparse matrix, i.e. triples (i, j, Rij)
    rating_mx_train = sp.csr_matrix(
        rating_mx_train.reshape(num_users, num_items))

    # print(rating_mx_train)

    # Deduplicate to extract the rating levels
    class_values = np.sort(np.unique(ratings))
    print(class_values)

    # Build user and course feature vectors from their attributes, then sparsify
    u_features, v_features = makeFeature()
    u_features = sp.csr_matrix(u_features)
    v_features = sp.csr_matrix(v_features)

    print("User features shape: " + str(u_features.shape))
    print("Item features shape: " + str(v_features.shape))

    # Finally, return all the data
    return u_features, v_features, rating_mx_train, train_labels, \
        u_train_idx, v_train_idx, val_labels, u_val_idx, v_val_idx, \
        class_values, uSuperDict, vSuperDict
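
The forward/reverse ID dictionaries above (uSuperDict plus u_dictr) can be built and round-tripped in a couple of lines; a minimal sketch with made-up IDs (sorted() is added for determinism, where the original iterates a set in hash order):

raw_ids = [100234, 42, 100234, 7]   # stands in for u_nodes

forward = {r: i for i, r in enumerate(sorted(set(raw_ids)))}  # raw ID -> dense index
reverse = {i: r for r, i in forward.items()}                  # dense index -> raw ID

assert all(reverse[forward[r]] == r for r in raw_ids)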
Code Example #5
def load_official_trainvaltest_split(dataset, testing=False):
    """
    Loads the official train/test split and uses 20% of the training samples for validation.
    For each split, computes 1-of-num_classes labels. Also computes the training
    adjacency matrix. Assumes flattening happens everywhere in row-major fashion.
    """
    sep = ','

    # Check if files exist and download otherwise
    files = ['/u1.base', '/u1.test', '/u.item', '/u.user']
    fname = dataset
    data_dir = 'data/' + fname

    ### the download step is disabled here; local files are used instead
    # download_dataset(fname, files, data_dir)

    dtypes = {'u_nodes': np.int64, 'v_nodes': np.int64, 'ratings': np.float32}

    # filename_train = 'mat.csv'
    # filename_test = 'mat.csv'

    # Convert the data once before feeding it to the GCN; build the dataframe manually
    u_nodes, v_nodes, ratings = [], [], []
    i = 0
    with open('mat.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            u_nodes.append(int(row[0]))
            v_nodes.append(int(row[1]))
            ratings.append(int(row[2]))

    uSuperDict = {r: i for i, r in enumerate(list(set(u_nodes)))}
    vSuperDict = {r: i for i, r in enumerate(list(set(v_nodes)))}
    print(uSuperDict)
    # Save the dictionaries
    u_listKey = []
    u_listValue = []
    for key in uSuperDict:
        u_listKey.append(uSuperDict[key])
        u_listValue.append(key)

    u_dictr = dict(zip(u_listKey, u_listValue))

    v_listKey = []
    v_listValue = []
    for key in vSuperDict:
        v_listKey.append(vSuperDict[key])
        v_listValue.append(key)
    v_dictr = dict(zip(v_listKey, v_listValue))

    np.save('u_dictr.npy', u_dictr)
    np.save('v_dictr.npy', v_dictr)

    new_u_nodes, new_v_nodes = [], []
    for uid in u_nodes:
        new_u_nodes.append(uSuperDict[uid])
    for vid in v_nodes:
        new_v_nodes.append(vSuperDict[vid])
    u_nodes, v_nodes = new_u_nodes, new_v_nodes

    data_dict = {
        'u_nodes': np.int64(u_nodes),
        'v_nodes': np.int64(v_nodes),
        'ratings': np.float32(ratings)
    }
    data_train = pd.DataFrame(data=data_dict)
    data_test = pd.DataFrame(data=data_dict)

    # data_train = pd.read_csv(
    #     filename_train, sep=sep, header=None,
    #     names=['u_nodes', 'v_nodes', 'ratings'], dtype=dtypes)
    #
    # data_test = pd.read_csv(
    #     filename_test, sep=sep, header=None,
    #     names=['u_nodes', 'v_nodes', 'ratings'], dtype=dtypes)
    '''
    sep = '/t'
    # Check if files exist and download otherwise
    # files = ['/u1.base', '/u1.test', '/u.item', '/u.user']
    # fname = dataset
    # data_dir = 'data/' + fname
    # here we make this download operation unavailable, use local files instead
    # download_dataset(fname, files, data_dir)
    dtypes = {
        'u_nodes': np.int64, 'v_nodes': np.int32,
        'ratings': np.float32}
    filename_train = 'u1.base'
    filename_test = 'u1.test'
    data_train = pd.read_csv(
        filepath_or_buffer=filename_train, sep=sep, header=None,
        names=['u_nodes', 'v_nodes', 'ratings'], dtype=dtypes)
    data_test = pd.read_csv(
        filename_test, sep=sep, header=None,
        names=['u_nodes', 'v_nodes', 'ratings'], dtype=dtypes,
        engine='python')
    '''

    # Select the columns explicitly so the layout is [rating, UID, VID] regardless
    # of pandas version (the removed as_matrix() relied on dict-key column order)
    data_array_train = data_train[['ratings', 'u_nodes', 'v_nodes']].values
    data_array_test = data_test[['ratings', 'u_nodes', 'v_nodes']].values

    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 1].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 2].astype(dtypes['v_nodes'])
    ratings = data_array[:, 0].astype(dtypes['ratings'])

    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)
    print("num_users = {}".format(num_users))
    print("num_item = {}".format(num_items))

    u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(
        np.int64), v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.float64)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {
        r: i
        for i, r in enumerate(np.sort(np.unique(ratings)).tolist())
    }
    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)

    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    for i in range(len(u_nodes)):
        assert (labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_train = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    for i in range(len(ratings)):
        assert (labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    idx_nonzero_train = idx_nonzero[0:num_train + num_val]
    idx_nonzero_test = idx_nonzero[num_train + num_val:]

    pairs_nonzero_train = pairs_nonzero[0:num_train + num_val]
    pairs_nonzero_test = pairs_nonzero[num_train + num_val:]

    # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))  # list() so np.random.shuffle can permute it in place
    np.random.seed(42)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test],
                                   axis=0)

    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val:num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val:]

    assert (len(test_idx) == num_test)

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val:]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    rating_mx_train = sp.csr_matrix(
        rating_mx_train.reshape(num_users, num_items))

    class_values = np.sort(np.unique(ratings))

    if dataset == 'fshl':
        '''
        # movie features (genres)
        sep = r'|'
        movie_file = 'data/' + dataset.replace('_', '-') + '/u.item'
        movie_headers = ['movie id', 'movie title', 'release date', 'video release date',
                         'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
                         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                         'Thriller', 'War', 'Western']
        movie_df = pd.read_csv(movie_file, sep=sep, header=None,
                               names=movie_headers, engine='python')
        genre_headers = movie_df.columns.values[6:]
        num_genres = genre_headers.shape[0]
        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                v_features[v_dict[movie_id], :] = g_vec
        # user features
        sep = r'|'
        users_file = 'data/' + dataset.replace('_', '-') + '/u.user'
        users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
        users_df = pd.read_csv(users_file, sep=sep, header=None,
                               names=users_headers, engine='python')
        occupation = set(users_df['occupation'].values.tolist())
        age = users_df['age'].values
        age_max = age.max()
        gender_dict = {'M': 0., 'F': 1.}
        occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}
        num_feats = 2 + len(occupation_dict)
        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user id']
            if u_id in u_dict.keys():
                # age
                u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)
                # gender
                u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
                # occupation
                u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.
        '''
        u_features, v_features = makeFeature()
    else:
        raise ValueError('Invalid dataset option %s' % dataset)

    u_features = sp.csr_matrix(u_features)
    v_features = sp.csr_matrix(v_features)

    print("User features shape: " + str(u_features.shape))
    print("Item features shape: " + str(v_features.shape))

    return u_features, v_features, rating_mx_train, train_labels, \
        u_train_idx, v_train_idx, val_labels, u_val_idx, v_val_idx, \
        test_labels, u_test_idx, v_test_idx, class_values, uSuperDict, vSuperDict
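
Both this example and #4 persist the reverse-lookup dictionaries with np.save, which pickles the dict into a 0-d object array; reading it back on recent NumPy needs allow_pickle=True and .item() to unwrap the dict. A small round-trip sketch:

import numpy as np

u_dictr = {0: 100234, 1: 42}   # made-up dense-index -> raw-ID mapping
np.save('u_dictr.npy', u_dictr)

# np.load defaults to allow_pickle=False (NumPy >= 1.16.3); .item() unwraps
# the 0-d object array back into the original dict
loaded = np.load('u_dictr.npy', allow_pickle=True).item()
assert loaded == u_dictr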
Code Example #6
def load_official_trainvaltest_split(
    dataset,
    testing=False,
    rating_map=None,
    post_rating_map=None,
    ratio=1.0,
    is_cmf=False,
    is_debug=False,
):
    """
    Loads the official train/test split and uses 20% of the training samples for validation.
    For each split, computes 1-of-num_classes labels. Also computes the training
    adjacency matrix. Assumes flattening happens everywhere in row-major fashion.
    """

    sep = "\t"

    # Check if files exist and download otherwise
    files = ["/u1.base", "/u1.test", "/u.item", "/u.user"]
    fname = dataset
    data_dir = "raw_data/" + fname

    download_dataset(fname, files, data_dir)

    dtypes = {
        "u_nodes": np.int32,
        "v_nodes": np.int32,
        "ratings": np.float32,
        "timestamp": np.float64,
    }

    filename_train = "raw_data/" + dataset + "/u1.base"
    filename_test = "raw_data/" + dataset + "/u1.test"

    data_train = pd.read_csv(
        filename_train,
        sep=sep,
        header=None,
        names=["u_nodes", "v_nodes", "ratings", "timestamp"],
        dtype=dtypes,
    )

    data_test = pd.read_csv(
        filename_test,
        sep=sep,
        header=None,
        names=["u_nodes", "v_nodes", "ratings", "timestamp"],
        dtype=dtypes,
    )

    data_array_train = data_train.values.tolist()
    data_array_train = np.array(data_array_train)
    data_array_test = data_test.values.tolist()
    data_array_test = np.array(data_array_test)

    if ratio < 1.0:
        data_array_train = data_array_train[
            data_array_train[:, -1].argsort()[: int(ratio * len(data_array_train))]
        ]

    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 0].astype(dtypes["u_nodes"])
    v_nodes_ratings = data_array[:, 1].astype(dtypes["v_nodes"])
    ratings = data_array[:, 2].astype(dtypes["ratings"])
    if rating_map is not None:
        for i, x in enumerate(ratings):
            ratings[i] = rating_map[x]

    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

    u_nodes_ratings, v_nodes_ratings = (
        u_nodes_ratings.astype(np.int64),
        v_nodes_ratings.astype(np.int32),
    )
    ratings = ratings.astype(np.float64)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    for i in range(len(u_nodes)):
        assert labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]]

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_train = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    for i in range(len(ratings)):
        assert labels[idx_nonzero[i]] == rating_dict[ratings[i]]

    idx_nonzero_train = idx_nonzero[0 : num_train + num_val]
    idx_nonzero_test = idx_nonzero[num_train + num_val :]

    pairs_nonzero_train = pairs_nonzero[0 : num_train + num_val]
    pairs_nonzero_test = pairs_nonzero[num_train + num_val :]

    # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))
    np.random.seed(42)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val : num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val :]

    assert len(test_idx) == num_test

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val : num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val :]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])

    class_values = np.sort(np.unique(ratings))

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    if post_rating_map is None:
        rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.0
    else:
        rating_mx_train[train_idx] = (
            np.array([post_rating_map[r] for r in class_values[labels[train_idx]]])
            + 1.0
        )
    rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

    if dataset == "ml_100k":

        # movie features (genres)
        sep = r"|"
        movie_file = "raw_data/" + dataset + "/u.item"
        movie_headers = [
            "movie id",
            "movie title",
            "release date",
            "video release date",
            "IMDb URL",
            "unknown",
            "Action",
            "Adventure",
            "Animation",
            "Childrens",
            "Comedy",
            "Crime",
            "Documentary",
            "Drama",
            "Fantasy",
            "Film-Noir",
            "Horror",
            "Musical",
            "Mystery",
            "Romance",
            "Sci-Fi",
            "Thriller",
            "War",
            "Western",
        ]
        movie_df = pd.read_csv(
            movie_file, sep=sep, header=None, names=movie_headers, engine="python"
        )

        genre_headers = movie_df.columns.values[5:]
        num_genres = genre_headers.shape[0]

        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, g_vec in zip(
            movie_df["movie id"].values.tolist(),
            movie_df[genre_headers].values.tolist(),
        ):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                v_features[v_dict[movie_id], :] = g_vec

        # user features

        sep = r"|"
        users_file = "raw_data/" + dataset + "/u.user"
        users_headers = ["user id", "age", "gender", "occupation", "zip code"]
        users_df = pd.read_csv(
            users_file, sep=sep, header=None, names=users_headers, engine="python"
        )

        occupation = set(users_df["occupation"].values.tolist())

        age = users_df["age"].values
        age_max = age.max()

        gender_dict = {"M": 0.0, "F": 1.0}
        occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

        num_feats = 2 + len(occupation_dict)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row["user id"]
            if u_id in u_dict.keys():
                # age
                u_features[u_dict[u_id], 0] = row["age"] / float(age_max)
                # gender
                u_features[u_dict[u_id], 1] = gender_dict[row["gender"]]
                # occupation
                u_features[u_dict[u_id], occupation_dict[row["occupation"]]] = 1.0

    elif dataset == "ml_1m":

        # load movie features
        movies_file = "raw_data/" + dataset + "/movies.dat"

        movies_headers = ["movie_id", "title", "genre"]
        movies_df = pd.read_csv(
            movies_file, sep=sep, header=None, names=movies_headers, engine="python"
        )

        # extracting all genres
        genres = []
        for s in movies_df["genre"].values:
            genres.extend(s.split("|"))

        genres = list(set(genres))
        num_genres = len(genres)

        genres_dict = {g: idx for idx, g in enumerate(genres)}

        # creating 0 or 1 valued features for all genres
        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, s in zip(
            movies_df["movie_id"].values.tolist(), movies_df["genre"].values.tolist()
        ):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                gen = s.split("|")
                for g in gen:
                    v_features[v_dict[movie_id], genres_dict[g]] = 1.0

        # load user features
        users_file = "raw_data/" + dataset + "/users.dat"
        users_headers = ["user_id", "gender", "age", "occupation", "zip-code"]
        users_df = pd.read_csv(
            users_file, sep=sep, header=None, names=users_headers, engine="python"
        )

        # extracting all features
        cols = users_df.columns.values[1:]

        cntr = 0
        feat_dicts = []
        for header in cols:
            d = dict()
            feats = np.unique(users_df[header].values).tolist()
            d.update({f: i for i, f in enumerate(feats, start=cntr)})
            feat_dicts.append(d)
            cntr += len(d)

        num_feats = sum(len(d) for d in feat_dicts)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row["user_id"]
            if u_id in u_dict.keys():
                for k, header in enumerate(cols):
                    u_features[u_dict[u_id], feat_dicts[k][row[header]]] = 1.0
    else:
        raise ValueError("Invalid dataset option %s" % dataset)

    u_features = sp.csr_matrix(u_features)
    v_features = sp.csr_matrix(v_features)

    print("User features shape: " + str(u_features.shape))
    print("Item features shape: " + str(v_features.shape))

    return (
        u_features,
        v_features,
        rating_mx_train,
        train_labels,
        u_train_idx,
        v_train_idx,
        val_labels,
        u_val_idx,
        v_val_idx,
        test_labels,
        u_test_idx,
        v_test_idx,
        class_values,
    )
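
A hypothetical call exercising the extra knobs of this variant. The half-star rating_map built here is an assumed illustration of how such a remapping might look (e.g. for 0.5-step rating scales), not a mapping taken from the project; ml_100k ratings are already integers, so rating_map is left as None in the call.

import numpy as np

# Assumed example mappings; not taken from the original project.
rating_map = {x: int(np.ceil(x)) for x in np.arange(0.5, 5.5, 0.5)}  # 0.5..5.0 -> 1..5
post_rating_map = {r: i for i, r in enumerate(range(1, 6))}          # rating value -> 0-based class

out = load_official_trainvaltest_split(
    'ml_100k',
    testing=True,
    rating_map=None,             # ml_100k ratings are already integer-valued
    post_rating_map=post_rating_map,
    ratio=1.0,                   # keep the full training set
)
u_features, v_features, rating_mx_train = out[:3]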