def preprocess_movielens(
    power: float = 1.0, seed: int = 12345
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Load and preprocess ML 100K.

    Builds a semi-synthetic click/conversion dataset from the ML-100K
    ratings in four stages:

    1. Fit an MF model to the observed (user, item, rating) triples and use
       its clipped, rescaled predictions as conversion probabilities (CVR).
    2. Sample binary conversions ``cv`` from those probabilities.
    3. Fit a logistic MF model on positive-vs-unlabeled click data and use
       ``sigmoid(preds) ** power`` as click probabilities (CTR).
    4. Sample binary clicks ``ct`` and Bernoulli-split them into
       train/validation parts using ``val_size`` from ``../config.yaml``.

    Args:
        power: exponent applied to the predicted CTR; larger values skew the
            simulated exposure distribution.
        seed: seed shared by the numpy and tensorflow RNGs.

    Returns:
        ``(train, val, test)`` numpy arrays.  ``train`` rows are
        ``(user, item, click*conversion)``; ``val`` and ``test`` additionally
        carry the click/conversion indicators and the ground-truth ctr/cvr
        columns (see the ``np.c_`` constructions at the end).

    NOTE(review): the original return annotation ``Dict[str, np.ndarray]``
    did not match the actual tuple return value; only the annotation was
    changed here.
    """
    np.random.seed(seed)
    # Experiment-wide settings live in the repo-level config file.
    with open("../config.yaml", "rb") as f:
        config = yaml.safe_load(f)
    val_size = config["val_size"]
    hyperparams = config["mf_hyperparams"]
    with codecs.open(f"../data/ml-100k/ml-100k.data", "r", "utf-8",
                     errors="ignore") as f:
        # Keep only (user, item, rating); the timestamp column is dropped.
        data = pd.read_csv(f, delimiter="\t", header=None).loc[:, :2]
        data.rename(columns={0: "user", 1: "item", 2: "rate"}, inplace=True)
    # Convert 1-based ids to 0-based indices.
    data.user, data.item = data.user - 1, data.item - 1
    data = data.values
    num_users, num_items = data[:, 0].max() + 1, data[:, 1].max() + 1
    # Dense (user, item) index for every cell of the user-item matrix;
    # `stack()` on a zero matrix is just a trick to enumerate all pairs.
    user_item_ = (pd.DataFrame(np.zeros(
        (num_users, num_items))).stack().reset_index().values[:, :2])
    # generate CVR by MF.
    ops.reset_default_graph()
    sess = tf.Session()
    tf.set_random_seed(seed)
    # MF is a project-local TF1 model; dim/eta/lam are the latent dimension,
    # learning rate and L2 weight from the config file.
    model = MF(
        num_users=num_users,
        num_items=num_items,
        dim=hyperparams["dim"],
        eta=hyperparams["eta"],
        lam=hyperparams["lam"],
    )
    # initialise all the TF variables
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    # Mini-batch SGD on the squared error; pscore is fed all-ones, i.e. no
    # inverse-propensity weighting during this fit.
    for _ in np.arange(hyperparams["iters"]):
        idx = np.random.choice(np.arange(data.shape[0]),
                               size=hyperparams["batch_size"])
        _ = sess.run(
            model.apply_grads_mse,
            feed_dict={
                model.users: data[idx, 0],
                model.items: data[idx, 1],
                model.labels: np.expand_dims(data[idx, 2], 1),
                model.pscore: np.ones((hyperparams["batch_size"], 1)),
            },
        )
    # Predict a rating for every (user, item) pair of the dense matrix.
    cvr = sess.run(
        model.preds,
        feed_dict={
            model.users: user_item_[:, 0],
            model.items: user_item_[:, 1]
        },
    )
    cvr = np.clip(cvr.flatten(), 1, 5)
    # transform_rating is a project helper; presumably it maps the 1-5 scale
    # into (eps, 1-eps) probabilities — TODO confirm against its definition.
    cvr = transform_rating(cvr, eps=0.1)
    # Sample one binary conversion per (user, item) cell.
    cv = np.random.binomial(n=1, p=cvr)
    # generate CTR by logistic MF.
    all_data = (pd.DataFrame(np.zeros(
        (num_users, num_items))).stack().reset_index().values[:, :2])
    pos_data = data[:, :2]
    # Unlabeled pairs = all pairs minus the observed (positive) ones.
    unlabeled_data = np.array(
        list(set(map(tuple, all_data)) - set(map(tuple, pos_data))),
        dtype=int)
    # Positive-vs-unlabeled training set: label 1 for observed, 0 otherwise.
    data = np.r_[np.c_[pos_data, np.ones(pos_data.shape[0])],
                 np.c_[unlabeled_data, np.zeros(unlabeled_data.shape[0])], ]
    ops.reset_default_graph()
    sess = tf.Session()
    tf.set_random_seed(seed)
    model = MF(
        num_users=num_users,
        num_items=num_items,
        dim=hyperparams["dim"],
        eta=hyperparams["eta"],
        lam=hyperparams["lam"],
    )
    # initialise all the TF variables
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    # Same SGD loop but with the cross-entropy objective (logistic MF).
    for _ in np.arange(hyperparams["iters"]):
        idx = np.random.choice(np.arange(data.shape[0]),
                               size=hyperparams["batch_size"])
        _ = sess.run(
            model.apply_grads_ce,
            feed_dict={
                model.users: data[idx, 0],
                model.items: data[idx, 1],
                model.labels: np.expand_dims(data[idx, 2], 1),
                model.pscore: np.ones((hyperparams["batch_size"], 1)),
            },
        )
    ctr = sess.run(
        model.preds,
        feed_dict={
            model.users: user_item_[:, 0],
            model.items: user_item_[:, 1]
        },
    )
    # `power` sharpens/flattens the exposure distribution.
    ctr = sigmoid(ctr.flatten())**power
    ct = np.random.binomial(n=1, p=ctr)
    # Bernoulli split of the clicked cells into train/validation.
    train_indicator = np.random.binomial(n=1,
                                         p=(1.0 - val_size),
                                         size=ct.shape[0])
    ct_train, ct_val = ct * train_indicator, ct * (1 - train_indicator)
    # train: (user, item, click*conversion).
    train = np.c_[user_item_, ct_train * cv]
    # val: propensity is ctr * val_size since only that fraction of clicks
    # lands in the validation split — presumably; verify against consumers.
    val = np.c_[user_item_, ct_val * cv, ct_val, cv, ctr * val_size, cvr]
    # test: full click/conversion indicators plus ground-truth ctr/cvr.
    test = np.c_[user_item_, ct * cv, ct, cv, ctr, cvr]
    return train, val, test
latent_dim = 32 # 隐藏单元维度 # use bias use_bias = True learning_rate = 0.001 batch_size = 512 epochs = 10 # ========================== Create dataset ======================= feature_columns, train, test = create_explicit_ml_1m_dataset( file, latent_dim, test_size) train_X, train_y = train test_X, test_y = test # ============================Build Model========================== model = MF(feature_columns, use_bias) model.summary() # ============================model checkpoint====================== # check_path = '../save/mf_weights.epoch_{epoch:04d}.val_loss_{val_loss:.4f}.ckpt' # checkpoint = tf.keras.callbacks.ModelCheckpoint(check_path, save_weights_only=True, # verbose=1, period=5) # ============================Compile============================ model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse']) # ==============================Fit============================== model.fit( train_X, train_y, epochs=epochs, # callbacks=[checkpoint],
def preprocess_yahoo_coat(
    data: str, val_ratio: float = 0.3, seed: int = 12345
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Load and preprocess Yahoo! R3 and Coat datasets.

    Both datasets ship an MNAR training split and a (near-)uniformly
    collected test split.  This routine binarizes the ratings, filters
    users, augments the observed data with unobserved (zero-labeled)
    user-item pairs, and estimates two quantities on the validation split
    via logistic MF: a propensity score (probability of observation) and a
    relevance parameter gamma.

    Args:
        data: dataset name, ``"yahoo"`` or ``"coat"``.
        val_ratio: fraction of the augmented training data held out as
            validation.
        seed: seed shared by the numpy and tensorflow RNGs.

    Returns:
        ``(train, val, test)`` numpy arrays.  Rows start with
        ``(user, item, rate, obs_flag)``; ``val`` additionally carries the
        estimated propensity and gamma columns, ``test`` a constant
        average-propensity column.
    """
    np.random.seed(seed)
    with open("../config.yaml", "rb") as f:
        hyperparams = yaml.safe_load(f)["mf_hyperparams"]
    if data == "yahoo":
        cols = {0: "user", 1: "item", 2: "rate"}
        with codecs.open(f"../data/yahoo/train.txt", "r", "utf-8",
                         errors="ignore") as f:
            train_ = pd.read_csv(f, delimiter="\t", header=None)
            train_.rename(columns=cols, inplace=True)
        with codecs.open(f"../data/yahoo/test.txt", "r", "utf-8",
                         errors="ignore") as f:
            test_ = pd.read_csv(f, delimiter="\t", header=None)
            test_.rename(columns=cols, inplace=True)
        # Convert 1-based ids to 0-based indices.
        for data_ in [train_, test_]:
            data_.user, data_.item = data_.user - 1, data_.item - 1
    elif data == "coat":
        # Coat is stored as a dense user-x-item ascii matrix; `stack()`
        # turns it into (level_0=user, level_1=item, 0=rate) triples, hence
        # both keys 2 and 0 mapping to "rate" below.
        cols = {"level_0": "user", "level_1": "item", 2: "rate", 0: "rate"}
        with codecs.open(f"../data/coat/train.ascii", "r", "utf-8",
                         errors="ignore") as f:
            train_ = pd.read_csv(f, delimiter=" ", header=None)
            train_ = train_.stack().reset_index().rename(columns=cols)
            # A zero entry means "not rated"; drop those cells.
            train_ = train_[train_.rate != 0].reset_index(drop=True)
        with codecs.open(f"../data/coat/test.ascii", "r", "utf-8",
                         errors="ignore") as f:
            test_ = pd.read_csv(f, delimiter=" ", header=None)
            test_ = test_.stack().reset_index().rename(columns=cols)
            test_ = test_[test_.rate != 0].reset_index(drop=True)
    # binarize ratings: >= 4 counts as relevant.
    for data_ in [train_, test_]:
        data_.rate = np.array(data_.rate >= 4, dtype=int)
    # estimate propensity score by MF
    train, test = train_.values, test_.values
    pos_train = train_[train_.rate == 1].values
    pos_test = test_[test_.rate == 1].values
    # preprocess datasets: keep users with at least 2 positives in train and
    # at most 9 positives in test (per-user filtering for the evaluation).
    unique_user_train, user_counts_train = np.unique(pos_train[:, 0],
                                                     return_counts=True)
    unique_user_train = unique_user_train[user_counts_train >= 2]
    unique_user_test, user_counts_test = np.unique(pos_test[:, 0],
                                                   return_counts=True)
    unique_user_test = unique_user_test[user_counts_test <= 9]
    valid_users = np.intersect1d(unique_user_train, unique_user_test)
    train = train[np.array([u in valid_users for u in train[:, 0]])]
    test = test[np.array([u in valid_users for u in test[:, 0]])]
    # Re-index the surviving users to a contiguous 0-based range.
    train[:, 0] = stats.rankdata(train[:, 0], method="dense") - 1
    test[:, 0] = stats.rankdata(test[:, 0], method="dense") - 1
    num_users, num_items = train[:, 0].max() + 1, train[:, 1].max() + 1
    # Dense (user, item) enumeration of the full matrix.
    all_data = (pd.DataFrame(np.zeros(
        (num_users, num_items))).stack().reset_index().values[:, :2])
    unobs_data = np.array(
        list(set(map(tuple, all_data)) - set(map(tuple, train[:, :2]))))
    # Augment: observed rows get obs_flag=1, unobserved get rate=obs_flag=0,
    # so columns are (user, item, rate, obs_flag).
    train = np.r_[np.c_[train, np.ones(train.shape[0])],
                  np.c_[unobs_data, np.zeros((unobs_data.shape[0], 2))], ]
    train, val = train_test_split(train,
                                  test_size=val_ratio,
                                  random_state=seed)
    # Complete the validation split with the pairs it does not contain.
    unobs_data = np.array(
        list(set(map(tuple, all_data)) - set(map(tuple, val[:, :2]))))
    val = np.r_[val, np.c_[unobs_data, np.zeros((unobs_data.shape[0], 2))]]
    # define the matrix factorization model
    ops.reset_default_graph()
    sess = tf.Session()
    tf.set_random_seed(seed)
    model = MF(
        num_users=num_users,
        num_items=num_items,
        dim=hyperparams["dim"],
        eta=hyperparams["eta"],
        lam=hyperparams["lam"],
    )
    # initialise all the TF variables
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    # Logistic MF on the observation indicator (val[:, 3]) — this learns the
    # propensity of a pair being observed; pscore fed all-ones (unweighted).
    for _ in np.arange(hyperparams["iters"]):
        idx = np.random.choice(np.arange(val.shape[0]),
                               size=hyperparams["batch_size"])
        _ = sess.run(
            model.apply_grads_ce,
            feed_dict={
                model.users: val[idx, 0],
                model.items: val[idx, 1],
                model.labels: np.expand_dims(val[idx, 3], 1),
                model.pscore: np.ones((hyperparams["batch_size"], 1)),
            },
        )
    # obtain dense user-item matrix
    ctr_hat = sess.run(
        model.preds,
        feed_dict={
            model.users: val[:, 0].astype(int),
            model.items: val[:, 1].astype(int),
        },
    )
    # Append the estimated propensity as column 4 of val.
    val = np.c_[val, sigmoid(ctr_hat)]
    # estimate relevance parameter (gamma) by MF.
    ops.reset_default_graph()
    sess = tf.Session()
    tf.set_random_seed(seed)
    model = MF(
        num_users=num_users,
        num_items=num_items,
        dim=hyperparams["dim"],
        eta=hyperparams["eta"],
        lam=hyperparams["lam"],
    )
    # observed data (obs_flag == 1 rows only)
    val_obs = val[val[:, 3] == 1]
    # initialise all the TF variables
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    # Propensity-weighted logistic MF on the binarized rating (col 2),
    # weighting by the propensity estimated above (col 4).
    for _ in np.arange(hyperparams["iters"]):
        idx = np.random.choice(np.arange(val_obs.shape[0]),
                               size=hyperparams["batch_size"])
        _ = sess.run(
            model.apply_grads_ce,
            feed_dict={
                model.users: val_obs[idx, 0],
                model.items: val_obs[idx, 1],
                model.labels: np.expand_dims(val_obs[idx, 2], 1),
                model.pscore: np.expand_dims(val_obs[idx, 4], 1),
            },
        )
    # obtain dense user-item matrix
    gamma_hat = sess.run(
        model.preds,
        feed_dict={
            model.users: val[:, 0].astype(int),
            model.items: val[:, 1].astype(int),
        },
    )
    # Append the estimated relevance (gamma) as the last column of val.
    val = np.c_[val, sigmoid(gamma_hat)]
    # create test data containing all items
    all_data = (pd.DataFrame(np.zeros(
        (num_users, num_items))).stack().reset_index().values[:, :2])
    unobs_data = np.array(
        list(set(map(tuple, all_data)) - set(map(tuple, test[:, :2]))))
    test = np.r_[np.c_[test, np.ones(test.shape[0])],
                 np.c_[unobs_data, np.zeros((unobs_data.shape[0], 2))], ]
    # Mean of the observation flag = average observation rate; used as a
    # constant propensity for the uniformly-collected test split.
    avg_test_pscore = test[:, -1].mean()
    test = np.c_[test, np.ones(test.shape[0]) * avg_test_pscore]
    return train, val, test
def run_mf(DATA_NAME,
           METHOD_NAME,
           dim_set,
           lbda_set,
           lr_set,
           C_set,
           protect_item_group=None,
           protect_user_group=None,
           protect_user_item_group=None,
           pos_thr=4):
    """Grid-search MF hyper-parameters on one dataset.

    For every combination drawn from the four hyper-parameter grids, trains
    an MF model, evaluates rating prediction (MSE/MAE plus a fairness
    F-test) and ranking quality (NDCG/AUC/KL), and writes one CSV per
    metric family under ``OUTPUT_DIR``.

    Args:
        DATA_NAME: dataset identifier understood by ``dataset.Dataset``.
        METHOD_NAME: only ``'MF'`` is handled; anything else is a no-op.
        dim_set / lbda_set / lr_set / C_set: grids for the latent dimension,
            regularization weight, learning rate and fairness constraint C.
        protect_item_group / protect_user_group / protect_user_item_group:
            optional protected-group specifications forwarded to the model
            config (omitted from the output file name when None).
        pos_thr: minimum rating for an interaction to count as positive in
            the ranking evaluation.
    """
    source = dataset.Dataset(DATA_NAME)
    batch_size = 512
    # Cartesian product of the four hyper-parameter grids.
    grid = [(d, l, lr, c)
            for d in dim_set
            for l in lbda_set
            for lr in lr_set
            for c in C_set]
    # Guard clause: this runner only knows how to train plain MF.
    if METHOD_NAME not in ['MF']:
        return
    train_map = source.get_user_item_train_map()
    frame = source.data
    for hidden_dim, lbda, learning_rate, C in grid:
        config = {
            'hidden_dim': hidden_dim,
            'lbda': lbda,
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'C': C,
            'protect_item_group': protect_item_group,
            'protect_user_group': protect_user_group,
            'protect_user_item_group': protect_user_item_group
        }
        # Deterministic file-name suffix: sorted keys, None values skipped.
        cfg_str = "_".join(
            str(config[key]) for key in sorted(config)
            if config[key] is not None)
        out_prefix = os.path.join(
            OUTPUT_DIR, "_".join((DATA_NAME, METHOD_NAME, cfg_str)))
        model = MF(DATA_NAME, config)
        # split == 0 -> train rows, split == 1 -> validation rows.
        id_cols = ['user_id', 'item_id', 'rating']
        model.assign_data(
            source.n_user, source.n_item, source.user_attr,
            source.item_attr, source.user_attr_ids, source.item_attr_ids,
            frame[id_cols].loc[frame['split'] == 0].values.astype(int),
            frame[id_cols].loc[frame['split'] == 1].values.astype(int))
        model.train()
        # Rating evaluation needs the attribute columns as well.
        eval_cols = [
            'user_id', 'item_id', 'rating', 'model_attr', 'user_attr'
        ]
        rating_res = model.evaluate_rating(
            frame[eval_cols].loc[frame['split'] == 1],
            frame[eval_cols].loc[frame['split'] == 2])
        pd.DataFrame(rating_res,
                     index=['validation', 'test'],
                     columns=['MSE', 'MAE', 'F-stat', 'p-value'
                              ]).to_csv(out_prefix + "_rating_results.csv")
        # Ranking evaluation only considers sufficiently positive ratings.
        model.assign_user_item_train_map(train_map)
        pos_val = frame[eval_cols].loc[(frame['split'] == 1)
                                       & (frame['rating'] >= pos_thr)]
        pos_test = frame[eval_cols].loc[(frame['split'] == 2)
                                        & (frame['rating'] >= pos_thr)]
        ranking_res = model.evaluate_ranking(pos_val, pos_test)
        pd.DataFrame(ranking_res,
                     index=['validation', 'test'],
                     columns=['NDCG', 'AUC', 'KL'
                              ]).to_csv(out_prefix + "_ranking_results.csv")