k = 10
learning_rate = 0.001
batch_size = 4096
epochs = 10

# ========================== Create dataset =======================
feature_columns, train, test = create_criteo_dataset(file=file,
                                                     read_part=read_part,
                                                     sample_num=sample_num,
                                                     test_size=test_size)
train_X, train_y = train
test_X, test_y = test

# ============================Build Model==========================
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = FM(feature_columns=feature_columns, k=k)
    model.summary()
    # ============================Compile============================
    model.compile(loss=binary_crossentropy,
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=[AUC()])

# ============================model checkpoint======================
# check_path = '../save/fm_weights.epoch_{epoch:04d}.val_loss_{val_loss:.4f}.ckpt'
# checkpoint = tf.keras.callbacks.ModelCheckpoint(check_path, save_weights_only=True,
#                                                 verbose=1, period=5)

# ==============================Fit==============================
model.fit(
    train_X,
    train_y,
    epochs=epochs,
    callbacks=[
k = 10
learning_rate = 0.001
batch_size = 512
epochs = 5

# ========================== Create dataset =======================
feature_columns, train, test = create_criteo_dataset(file=file,
                                                     read_part=read_part,
                                                     sample_num=sample_num,
                                                     test_size=test_size)
train_X, train_y = train
test_X, test_y = test

# ============================Build Model==========================
model = FM(feature_columns=feature_columns, k=k)

# ============================model checkpoint======================
# check_path = '../save/fm_weights.epoch_{epoch:04d}.val_loss_{val_loss:.4f}.ckpt'
# checkpoint = tf.keras.callbacks.ModelCheckpoint(check_path, save_weights_only=True,
#                                                 verbose=1, period=5)

# ============================Compile============================
model.compile(loss=binary_crossentropy,
              optimizer=Adam(learning_rate=learning_rate),
              metrics=[AUC()])

# ==============================Fit==============================
model.fit(
    train_X,
    train_y,
    epochs=epochs,
    # callbacks=[checkpoint],
    batch_size=batch_size,
    validation_split=0.1
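Both scripts above build the model with FM(feature_columns=feature_columns, k=k), but the class itself lives elsewhere in the repo. The following is only a minimal sketch of what such a Keras FM model typically looks like, assuming the sparse feature columns are dicts carrying a 'feat_num' vocabulary size; the class name, field layout, and initialization here are assumptions, not the repo's actual implementation.

import tensorflow as tf

class FMSketch(tf.keras.Model):
    # Hypothetical stand-in for the FM model used above.
    # sparse_feature_columns: list of dicts with 'feat_num' (vocabulary size); k: latent dim.
    def __init__(self, sparse_feature_columns, k=10):
        super().__init__()
        self.embeddings = [tf.keras.layers.Embedding(feat['feat_num'], k)
                           for feat in sparse_feature_columns]
        self.linear = [tf.keras.layers.Embedding(feat['feat_num'], 1)
                       for feat in sparse_feature_columns]

    def call(self, inputs):
        # inputs: (batch, num_fields) integer feature indices
        embeds = tf.stack([emb(inputs[:, i]) for i, emb in enumerate(self.embeddings)],
                          axis=1)                                   # (batch, fields, k)
        first_order = tf.reduce_sum(
            tf.concat([lin(inputs[:, i]) for i, lin in enumerate(self.linear)], axis=1),
            axis=1, keepdims=True)                                  # (batch, 1)
        # Pairwise interactions via 0.5 * ((sum v)^2 - sum v^2), linear in the number of fields.
        square_of_sum = tf.square(tf.reduce_sum(embeds, axis=1))    # (batch, k)
        sum_of_square = tf.reduce_sum(tf.square(embeds), axis=1)    # (batch, k)
        second_order = 0.5 * tf.reduce_sum(square_of_sum - sum_of_square, axis=1, keepdims=True)
        return tf.sigmoid(first_order + second_order)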
import argparse

parser = argparse.ArgumentParser(description='command-line arguments')
parser.add_argument('-k', type=int, help='v_dim', default=8)
parser.add_argument('-w_reg', type=float, help='w regularization', default=1e-4)
parser.add_argument('-v_reg', type=float, help='v regularization', default=1e-4)
args = parser.parse_args()

if __name__ == '__main__':
    file_path = 'train.txt'
    (X_train, y_train), (X_test, y_test) = create_criteo_dataset(file_path, test_size=0.5)

    k = args.k
    w_reg = args.w_reg
    v_reg = args.v_reg

    model = FM(k, w_reg, v_reg)
    optimizer = optimizers.SGD(0.01)

    # train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    # train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)
    # model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    # model.fit(train_dataset, epochs=200)
    # print(model.evaluate(X_test, y_test))
    # model.summary()

    summary_writer = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
    for i in range(100):
        with tf.GradientTape() as tape:
            y_pre = model(X_train)
            loss = tf.reduce_mean(losses.binary_crossentropy(y_true=y_train, y_pred=y_pre))
            print(loss.numpy())
        with summary_writer.as_default():
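The excerpt cuts off at the summary-writer context. A plausible continuation, given only as a hedged sketch (the original may log different tags or variables), records the scalar loss to TensorBoard and then applies the gradients computed by the tape:

        with summary_writer.as_default():
            tf.summary.scalar('loss', loss, step=i)   # assumed tag name
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))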
train_dataset = train_dataset.shuffle(buffer_size=100)
train_dataset = train_dataset.batch(batch_size)

X_test = tf.data.Dataset.from_tensor_slices(X_test)
y_test = tf.data.Dataset.from_tensor_slices(y_test)
test_dataset = tf.data.Dataset.zip((X_test, y_test))
test_dataset = test_dataset.batch(batch_size)

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{step}")

model = FM(field_dims=field_dims, embedding_dim=embedding_dim)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.losses.BinaryCrossentropy(from_logits=False)

train_loss_results = []
train_accuracy_results = []

# train the model
best_test_acc = 0
for epoch in range(num_epochs):
    epoch_loss = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.BinaryAccuracy()
    for nb, (X, y) in enumerate(train_dataset):
        y_hat, loss = train_step(X, y)
        # update metrics
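train_step is called inside the loop but not defined in this excerpt. A minimal sketch using the model, optimizer, and loss_fn declared above; the real implementation may also clip gradients or update metrics inside the step:

@tf.function
def train_step(X, y):
    # Forward pass, loss, backprop, one optimizer update; returns the prediction
    # and loss so the caller can update its epoch metrics.
    with tf.GradientTape() as tape:
        y_hat = model(X, training=True)
        loss = loss_fn(y, y_hat)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return y_hat, loss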
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='FM')
    parser.add_argument('--test_size', type=float, default=0.2)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--embed_dim', type=int, default=10)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--lr', type=float, default=0.002)
    parser.add_argument('--file', type=str, default='./data/criteo_sampled_data.csv')
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # =============================================Data=================================================================
    data, feat_columns, dense_feats, sparse_feats = create_dataset(file=args.file, embed_dim=args.embed_dim)
    train, valid = train_test_split(data, test_size=args.test_size)

    train_dataset = Data.TensorDataset(torch.LongTensor(train[sparse_feats].values),
                                       torch.FloatTensor(train[dense_feats].values),
                                       torch.FloatTensor(train['label'].values))
    train_loader = Data.DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)

    valid_dataset = Data.TensorDataset(torch.LongTensor(valid[sparse_feats].values),
                                       torch.FloatTensor(valid[dense_feats].values),
                                       torch.FloatTensor(valid['label'].values))
    valid_loader = Data.DataLoader(dataset=valid_dataset, batch_size=args.batch_size, shuffle=False)

    # =============================================Model================================================================
    dense_feat_columns, sparse_feat_columns = feat_columns
    N = len(dense_feat_columns) + sum(feat['feat_num'] for feat in sparse_feat_columns)
    model = FM(N, args.embed_dim)
    model.to(device)

    # =============================================Train================================================================
    training(model, sparse_feat_columns, train_loader, valid_loader, args.batch_size, args.lr, args.epochs, device)
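In this script FM(N, args.embed_dim) takes the total feature count N rather than per-field columns, and the class is again defined elsewhere. A hedged sketch of a plain PyTorch FM with that signature; the class name, parameter names, and initialization are assumptions:

import torch
import torch.nn as nn

class FMSketch(nn.Module):
    # Hypothetical FM(N, k): N = total one-hot/dense feature width, k = latent factor dim.
    def __init__(self, N, k):
        super().__init__()
        self.w0 = nn.Parameter(torch.zeros(1))
        self.w = nn.Parameter(torch.randn(N, 1) * 0.01)    # first-order weights
        self.v = nn.Parameter(torch.randn(N, k) * 0.01)    # latent factors

    def forward(self, x):
        # x: (batch, N) dense/multi-hot representation of the sample
        linear = self.w0 + x @ self.w                       # (batch, 1)
        square_of_sum = (x @ self.v) ** 2                   # (batch, k)
        sum_of_square = (x ** 2) @ (self.v ** 2)            # (batch, k)
        pair = 0.5 * (square_of_sum - sum_of_square).sum(dim=1, keepdim=True)
        return torch.sigmoid(linear + pair)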
sample_num = 100000
test_size = 0.2

k = 32
learning_rate = 0.001
batch_size = 500
epochs = 100

feature_columns, train, test, val = create_criteo_dataset(file=file,
                                                          read_part=read_part,
                                                          sample_num=sample_num,
                                                          test_size=test_size)
train_X, train_y = train
test_X, test_y = test
val_X, val_y = val

model = FM(feature_columns=feature_columns, k=k)
model.summary()

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_auc',
                                                  verbose=1,
                                                  patience=10,
                                                  mode='max',
                                                  restore_best_weights=True)

model.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              metrics=[tf.keras.metrics.AUC()])

model.fit(
    train_X,
    train_y,
logging.info('df_train.shape ' + str(df_train.shape))
logging.info('train_labels.shape ' + str(train_labels.shape))

# feature length
feature_length = df_train.shape[1]
hp.feature_length = feature_length
# number of training samples
train_num = df_train.shape[0]
# batch generator
batch_gen = batch_generator([df_train.values, train_labels], hp.batch_size)

# initialize FM model
logging.info('initialize FM model')
fm_model = FM(hp)
fm_model.build_graph()

# begin session
logging.info('# Session')
saver = tf.train.Saver(max_to_keep=hp.max_to_keep)
with tf.Session() as sess:
    # restore from the latest checkpoint if one exists
    ckpt = tf.train.latest_checkpoint(hp.logdir)
    if ckpt is None:
        logging.info('initialize fresh parameters for the fm model')
        sess.run(tf.global_variables_initializer())
    else:
        saver.restore(sess, ckpt)
    # merge all the summaries and write them out to train_logs
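batch_generator is imported from elsewhere in this repo; the following is only a sketch of what such a helper plausibly does (shuffle once per pass and yield aligned feature/label mini-batches indefinitely), not its actual implementation:

import numpy as np

def batch_generator(data, batch_size):
    # data: [features_array, labels_array]; yields aligned mini-batches forever.
    X, y = data
    n = X.shape[0]
    while True:
        idx = np.random.permutation(n)
        for start in range(0, n, batch_size):
            batch = idx[start:start + batch_size]
            yield X[batch], y[batch]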
from model import FM, DNN
from utils import create_criteo_dataset

import tensorflow as tf
from tensorflow.keras import optimizers, losses, metrics
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    file_path = 'E:\\PycharmProjects\\推荐算法\\data\\criteo_sample.txt'
    (X_train, y_train), (X_test, y_test) = create_criteo_dataset(file_path, test_size=0.5)

    k = 8

    # **************** Statement 1 of Training *****************#
    model = FM(k)
    optimizer = optimizers.SGD(0.01)

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(train_dataset, epochs=200)

    # Evaluation
    fm_pre = model(X_test)
    fm_pre = [1 if x > 0.5 else 0 for x in fm_pre]

    # **************** Statement 2 of Training *****************#
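accuracy_score is imported above but the excerpt stops before the metric is reported; a plausible (assumed, not from the original) use of the thresholded predictions is:

    acc = accuracy_score(y_test, fm_pre)
    print('FM test accuracy: %.4f' % acc)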
mean = data[feat].mean()
std = data[feat].std()
data[feat] = (data[feat] - mean) / (std + 1e-12)
# print(data.shape)
# print(data.head())

train, valid = train_test_split(data, test_size=0.1, random_state=42)
# print(train.shape)   # (540000, 40)
# print(valid.shape)   # (60000, 40)

train_dataset = TensorDataset(torch.LongTensor(train[sparse_features].values),
                              torch.FloatTensor(train[dense_features].values),
                              torch.FloatTensor(train['label'].values))
train_loader = DataLoader(dataset=train_dataset, batch_size=args.train_batch_size, shuffle=True)

valid_dataset = TensorDataset(torch.LongTensor(valid[sparse_features].values),
                              torch.FloatTensor(valid[dense_features].values),
                              torch.FloatTensor(valid['label'].values))
valid_loader = DataLoader(dataset=valid_dataset, batch_size=args.eval_batch_size, shuffle=False)

cat_fea_unique = [data[f].nunique() for f in sparse_features]

model = FM(cat_fea_unique, num_fea_size=len(dense_features))
train_model(model)