def main(args):
    """Convert the configured dynamic-graph model to a static model and save it.

    Reads the YAML file at ``args.config_yaml``, builds the model through the
    model class loaded from ``args.abs_dir``, loads the weights found at
    ``runner.model_init_path`` and exports the result under
    ``runner.model_save_path`` with the ``tostatic`` prefix.

    The exported signature is a list of 26 ``int64`` inputs of shape
    ``[None, 1]`` followed by one ``float32`` input of shape ``[None, 13]``.
    """
    paddle.seed(12345)

    # Configuration and model class.
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # Runner settings (several are only echoed to the log).
    use_gpu = config.get("runner.use_gpu", True)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)

    banner = "**************common.configs**********"
    settings_line = (
        "use_gpu: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
    ).format(use_gpu, train_data_dir, epochs, print_interval, model_save_path)
    for message in (banner, settings_line, banner):
        logger.info(message)

    device_name = 'gpu' if use_gpu else 'cpu'
    paddle.set_device(device_name)

    dy_model = dy_model_class.create_model(config)
    load_model(model_init_path, dy_model)

    # Example DNN model forward signature.
    sparse_specs = [
        paddle.static.InputSpec(shape=[None, 1], dtype='int64')
        for _ in range(26)
    ]
    dense_spec = paddle.static.InputSpec(shape=[None, 13], dtype='float32')
    static_model = paddle.jit.to_static(
        dy_model, input_spec=[sparse_specs, dense_spec])

    save_jit_model(static_model, model_save_path, prefix='tostatic')
def main(args):
    """Convert the configured dynamic-graph model to a static model and save it.

    Reads the YAML file at ``args.config_yaml``, applies any ``key=value``
    overrides from ``args.opt``, builds the model, loads the weights found at
    ``runner.model_init_path`` and exports the model with the ``tostatic``
    prefix.

    Unless ``runner.CE`` is set, the export directory is
    ``<runner.model_save_path>/<runner.infer_end_epoch - 1>``.

    Args:
        args: Parsed command-line namespace providing ``config_yaml``,
            ``abs_dir`` and ``opt`` (an iterable of ``"key=value"`` strings,
            or a falsy value when no overrides were given).
    """
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # modify config from command
    if args.opt:
        for parameter in args.opt:
            # Split on the first "=" only, so a value that itself contains
            # "=" is kept intact instead of raising ValueError on unpacking.
            key, value = parameter.strip().split("=", 1)
            current = config.get(key)
            # Exact type checks on purpose: bool is a subclass of int, so
            # isinstance() would send booleans down the int branch.
            if type(current) is int:
                value = int(value)
            if type(current) is bool:
                value = value.lower() == "true"
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)
    end_epoch = config.get("runner.infer_end_epoch", 0)
    CE = config.get("runner.CE", False)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, train_data_dir, epochs, print_interval,
                model_save_path))
    logger.info("**************common.configs**********")

    paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)

    if not CE:
        # Export next to the checkpoint of the last inference epoch.
        model_save_path = os.path.join(model_save_path, str(end_epoch - 1))

    load_model(model_init_path, dy_model)

    # A single list-valued argument made of four tensors.
    dy_model = paddle.jit.to_static(
        dy_model,
        input_spec=[[
            paddle.static.InputSpec(shape=[None, 15], dtype='int'),
            paddle.static.InputSpec(shape=[None, ], dtype='float32'),
            paddle.static.InputSpec(shape=[None, 10, 10], dtype='int'),
            paddle.static.InputSpec(shape=[None, ], dtype='int'),
        ]])

    save_jit_model(dy_model, model_save_path, prefix='tostatic')
def main(args):
    """Run inference over saved checkpoints and log a running accuracy.

    For every epoch id in ``[runner.infer_start_epoch, runner.infer_end_epoch)``
    the checkpoint ``<runner.infer_load_path>/<epoch_id>`` is loaded into the
    model, the test data loader is iterated, and the fraction of samples
    counted as hits is logged every ``runner.print_interval`` batches and once
    more at the end of the epoch.

    Args:
        args: Parsed command-line namespace providing ``config_yaml`` and
            ``abs_dir``.
    """
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir
    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)
    vocab_size = config.get("hyper_parameters.sparse_feature_number", 10)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, test_data_dir, start_epoch, end_epoch, print_interval,
                model_load_path))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    # The model comes from the file-level create_model(), not from
    # dy_model_class.
    #dy_model = dy_model_class.create_model(config)
    dy_model = create_model(config)

    # to do : add optimizer function
    #optimizer = dy_model_class.create_optimizer(dy_model, config)

    logger.info("read data")
    test_dataloader = create_data_loader(config=config, place=place, mode="test")

    epoch_begin = time.time()
    interval_begin = time.time()

    # NOTE(review): the returned metrics are never read in this function.
    metric_list, metric_list_name = dy_model_class.create_metrics()

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        load_model(model_path, dy_model)
        dy_model.eval()
        accum_num_sum = 0  # samples evaluated so far in this epoch
        accum_num = 0  # samples counted as hits so far in this epoch
        for batch_id, batch in enumerate(test_dataloader()):
            batch_size = len(batch[0])

            inputs, all_label, inputs_word = create_feeds(batch, vocab_size)
            label = inputs[3].numpy()
            val, pred_idx = dy_model.forward(inputs[0], inputs[1], inputs[2],
                                             all_label)
            pre = pred_idx.numpy()
            for ii in range(len(label)):
                # Candidate ids predicted for sample ii.
                top4 = pre[ii][0]
                accum_num_sum += 1
                for idx in top4:
                    # Skip candidates that are among the sample's own inputs.
                    if int(idx) in inputs_word[ii]:
                        continue
                    if int(idx) == int(label[ii][0]):
                        accum_num += 1
                    # NOTE(review): the source's indentation was lost; this
                    # unconditional break means only the first non-input
                    # candidate is compared with the label. Confirm it was
                    # not meant to sit inside the `if` above.
                    break
            if batch_id % print_interval == 0:
                logger.info(
                    "infer epoch: {}, batch_id: {}, acc: {:.6f}, speed: {:.2f} ins/s"
                    .format(
                        epoch_id, batch_id, accum_num * 1.0 / accum_num_sum,
                        print_interval * batch_size / (time.time() -
                                                       interval_begin)))
                interval_begin = time.time()

        logger.info(
            "infer epoch: {} done, acc: {:.6f}, : epoch time{:.2f} s".format(
                epoch_id, accum_num * 1.0 / accum_num_sum,
                time.time() - epoch_begin))
        epoch_begin = time.time()
def main(args):
    """Evaluate saved checkpoints by faiss retrieval and log recall/NDCG/hit-rate.

    For every epoch id in ``[runner.infer_start_epoch, runner.infer_end_epoch)``
    the checkpoint ``<runner.infer_load_path>/<epoch_id>`` is loaded, the item
    embedding table ``dy_model.item_emb.weight`` is added to an inner-product
    faiss index, and the user embeddings returned by
    ``dy_model_class.infer_forward`` are searched for their ``args.top_n``
    nearest items. The last element of each batch holds the target item ids;
    id ``0`` is ignored.

    User embeddings with two dimensions are searched directly. Otherwise the
    second dimension is treated as several embeddings per user, and their
    search results are merged by descending score into at most ``args.top_n``
    distinct non-zero items.

    Args:
        args: Parsed command-line namespace providing ``config_yaml``,
            ``abs_dir`` and ``top_n``.
    """
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    config["config_abs_dir"] = args.abs_dir
    # load static model class
    dy_model_class = load_dy_model_class(config)

    use_gpu = config.get("runner.use_gpu", True)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)
    batch_size = config.get("runner.infer_batch_size", None)
    os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
        format(use_gpu, test_data_dir, start_epoch, end_epoch, print_interval,
               model_load_path))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)
    test_dataloader = create_data_loader(
        config=config, place=place, mode="test")

    logger.info("read data")

    def _hit_stats(targets, retrieved):
        """Return ``(hits, dcg, idcg)`` of ``targets`` against ``retrieved``."""
        hits = 0
        dcg = 0.0
        for rank, iid in enumerate(targets):
            if iid in retrieved:
                hits += 1
                dcg += 1.0 / math.log(rank + 2, 2)
        idcg = 0.0
        for rank in range(hits):
            idcg += 1.0 / math.log(rank + 2, 2)
        return hits, dcg, idcg

    def _metric_text(recall_sum, ndcg_sum, hit_sum, count):
        """Format the running averages exactly as they appear in the log."""
        text = "recall@%d: %.5f, " % (args.top_n, recall_sum / count)
        text += "ndcg@%d: %.5f, " % (args.top_n, ndcg_sum / count)
        text += "hitrate@%d: %.5f, " % (args.top_n, hit_sum * 1.0 / count)
        return text

    # Function-scope import, hoisted out of the epoch loop.
    import faiss

    epoch_begin = time.time()
    interval_begin = time.time()

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        load_model(model_path, dy_model)
        item_embs = dy_model.item_emb.weight.numpy()

        if use_gpu:
            res = faiss.StandardGpuResources()
            flat_config = faiss.GpuIndexFlatConfig()
            flat_config.device = 0
            faiss_index = faiss.GpuIndexFlatIP(res, item_embs.shape[-1],
                                               flat_config)
        else:
            faiss_index = faiss.IndexFlatIP(item_embs.shape[-1])
        faiss_index.add(item_embs)

        # NOTE(review): the denominator starts at 1, not 0, as in the
        # original; this slightly understates every average. Kept unchanged
        # so reported numbers stay comparable -- confirm whether intended.
        total = 1
        total_recall = 0.0
        total_ndcg = 0.0
        total_hitrate = 0

        for batch_id, batch_data in enumerate(test_dataloader()):
            user_embs, _ = dy_model_class.infer_forward(dy_model, None,
                                                        batch_data, config)
            user_embs = user_embs.numpy()
            target_items = np.squeeze(batch_data[-1].numpy(), axis=1)

            single_embedding = len(user_embs.shape) == 2
            if not single_embedding:
                # Flatten (users, ni, dim) into (users * ni, dim) for search.
                ni = user_embs.shape[1]
                user_embs = np.reshape(user_embs, [-1, user_embs.shape[-1]])
            D, I = faiss_index.search(user_embs, args.top_n)

            for i, iid_list in enumerate(target_items):
                if single_embedding:
                    retrieved = set(I[i])
                else:
                    # Merge the results of this user's ni embeddings, best
                    # score first, keeping distinct non-zero items.
                    merged = list(
                        zip(
                            np.reshape(I[i * ni:(i + 1) * ni], -1),
                            np.reshape(D[i * ni:(i + 1) * ni], -1)))
                    merged.sort(key=lambda x: x[1], reverse=True)
                    retrieved = set()
                    for item_id, _score in merged:
                        if item_id not in retrieved and item_id != 0:
                            retrieved.add(item_id)
                            if len(retrieved) >= args.top_n:
                                break

                targets = [x for x in iid_list if x != 0]
                if not targets:
                    # Only zero ids: nothing to recall. Skipping avoids the
                    # ZeroDivisionError the per-user recall would raise.
                    continue
                hits, dcg, idcg = _hit_stats(targets, retrieved)
                total_recall += hits * 1.0 / len(targets)
                if hits > 0:
                    total_ndcg += dcg / idcg
                    total_hitrate += 1

            total += target_items.shape[0]

            if batch_id % print_interval == 0:
                logger.info("epoch: {}, batch_id: {}, ".format(
                    epoch_id, batch_id) + _metric_text(
                        total_recall, total_ndcg, total_hitrate, total) +
                            "speed: {:.2f} ins/s".format(
                                print_interval * batch_size / (
                                    time.time() - interval_begin)))
                # Restart the window so the speed covers this interval only.
                interval_begin = time.time()

        logger.info("epoch: {} done, ".format(epoch_id) + _metric_text(
            total_recall, total_ndcg, total_hitrate, total) +
                    "epoch time: {:.2f} s".format(time.time() - epoch_begin))
        # Restart the clock so the next epoch reports only its own duration.
        epoch_begin = time.time()
def main(args):
    """Run inference over saved checkpoints and log the model's metrics.

    Each epoch id in ``[runner.infer_start_epoch, runner.infer_end_epoch)``
    selects the checkpoint ``<runner.infer_load_path>/<epoch_id>``. The test
    data loader is fed through ``dy_model_class.infer_forward`` and the
    accumulated metrics are logged every ``runner.print_interval`` batches
    and once at the end of the epoch.
    """
    paddle.seed(12345)

    # Configuration and model class.
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # Runner settings.
    use_gpu = config.get("runner.use_gpu", True)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)

    banner = "**************common.configs**********"
    logger.info(banner)
    logger.info(
        "use_gpu: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, test_data_dir, start_epoch, end_epoch, print_interval,
                model_load_path))
    logger.info(banner)

    place = paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)

    logger.info("read data")
    test_dataloader = create_data_loader(
        config=config, place=place, mode="test")

    epoch_begin = time.time()
    interval_begin = time.time()

    # Created once, so metric state carries over from one epoch to the next.
    metric_list, metric_list_name = dy_model_class.create_metrics()

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        load_model(os.path.join(model_load_path, str(epoch_id)), dy_model)
        dy_model.eval()

        for batch_id, batch in enumerate(test_dataloader()):
            batch_size = len(batch[0])
            metric_list, tensor_print_dict = dy_model_class.infer_forward(
                dy_model, metric_list, batch, config)

            if batch_id % print_interval != 0:
                continue

            tensor_print_str = ""
            if tensor_print_dict is not None:
                tensor_print_str = "".join(
                    "{}:".format(var_name) + str(var.numpy()) + ","
                    for var_name, var in tensor_print_dict.items())
            metric_str = "".join(
                name + ": {:.6f},".format(metric_list[pos].accumulate())
                for pos, name in enumerate(metric_list_name))

            prefix = "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id)
            elapsed = time.time() - interval_begin
            logger.info(prefix + metric_str + tensor_print_str +
                        " speed: {:.2f} ins/s".format(
                            print_interval * batch_size / elapsed))
            interval_begin = time.time()

        # End-of-epoch summary.
        metric_str = "".join(
            name + ": {:.6f},".format(metric_list[pos].accumulate())
            for pos, name in enumerate(metric_list_name))
        tensor_print_str = ""
        if tensor_print_dict is not None:
            tensor_print_str = "".join(
                "{}:".format(var_name) + str(var.numpy()) + ","
                for var_name, var in tensor_print_dict.items())

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str + " epoch time: {:.2f} s".format(
                        time.time() - epoch_begin))
        epoch_begin = time.time()
def main(args, lr):
    """Train the configured model, then evaluate it on the test data.

    Args:
        args: Parsed command-line namespace providing ``config_yaml``,
            ``abs_dir`` and ``opt`` (an iterable of ``"key=value"`` config
            overrides, or a falsy value).
        lr: Learning rate. When falsy, the optimizer is created by
            ``dy_model_class.create_optimizer`` and ``lr`` is read from
            ``hyper_parameters.optimizer.learning_rate`` (default 0.001)
            for the return value only. Otherwise the optimizer comes from
            ``_create_optimizer(dy_model, lr)``.

    Returns:
        A tuple ``(infer_auc, lr, train_batch_size, model_save_path)`` where
        ``infer_auc`` is the value returned by ``infer_test``.
    """
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # modify config from command
    if args.opt:
        for parameter in args.opt:
            # Split on the first "=" only so values may contain "=".
            key, value = parameter.strip().split("=", 1)
            current = config.get(key)
            # Coerce to the configured type, as the other --opt entry points
            # in this file do. Otherwise "False" is a truthy string and an
            # overridden epoch count breaks range(). Exact type checks on
            # purpose: bool is a subclass of int.
            if type(current) is int:
                value = int(value)
            if type(current) is bool:
                value = value.lower() == "true"
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    use_visual = config.get("runner.use_visual", False)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    train_batch_size = config.get("runner.train_batch_size", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)
    save_checkpoint_interval = config.get("runner.save_checkpoint_interval",
                                          1)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}, save_checkpoint_interval: {}"
        .format(use_gpu, use_visual, train_batch_size, train_data_dir, epochs,
                print_interval, model_save_path, save_checkpoint_interval))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)

    # Create a log_visual object and store the data in the path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/train")

    if model_init_path is not None:
        load_model(model_init_path, dy_model)

    if not lr:
        lr = config.get("hyper_parameters.optimizer.learning_rate", 0.001)
        optimizer = dy_model_class.create_optimizer(dy_model, config)
    else:
        optimizer = _create_optimizer(dy_model, lr)

    logger.info("read data")
    train_dataloader = create_data_loader(config=config, place=place)
    test_dataloader = create_data_loader(
        config=config, place=place, mode="test")

    last_epoch_id = config.get("last_epoch", -1)
    step_num = 0

    for epoch_id in range(last_epoch_id + 1, epochs):
        # set train mode
        dy_model.train()
        metric_list, metric_list_name = dy_model_class.create_metrics()
        epoch_begin = time.time()
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()

        for batch_id, batch in enumerate(train_dataloader()):
            train_reader_cost += time.time() - reader_start
            optimizer.clear_grad()
            train_start = time.time()
            batch_size = len(batch[0])

            loss, metric_list, tensor_print_dict = dy_model_class.train_forward(
                dy_model, metric_list, batch, config)
            loss.backward()
            optimizer.step()
            train_run_cost += time.time() - train_start
            total_samples += batch_size

            if batch_id % print_interval == 0:
                metric_str = ""
                for metric_id, metric_name in enumerate(metric_list_name):
                    metric_value = metric_list[metric_id].accumulate()
                    metric_str += (
                        metric_name + ":{:.6f}, ".format(metric_value))
                    if use_visual:
                        log_visual.add_scalar(
                            tag="train/" + metric_name,
                            step=step_num,
                            value=metric_value)
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    for var_name, var in tensor_print_dict.items():
                        tensor_print_str += (
                            "{}:".format(var_name) + str(var.numpy()) + ",")
                        if use_visual:
                            log_visual.add_scalar(
                                tag="train/" + var_name,
                                step=step_num,
                                value=var.numpy())
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} ins/s, loss: {:.6f}"
                    .format(
                        train_reader_cost / print_interval,
                        (train_reader_cost + train_run_cost) / print_interval,
                        total_samples / print_interval,
                        total_samples / (train_reader_cost + train_run_cost),
                        loss.numpy()[0]))
                # Start a new measurement window.
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()
            step_num = step_num + 1

        metric_str = ""
        for metric_id, metric_name in enumerate(metric_list_name):
            metric_str += (metric_name + ": {:.6f},".format(
                metric_list[metric_id].accumulate()))

        tensor_print_str = ""
        if tensor_print_dict is not None:
            for var_name, var in tensor_print_dict.items():
                tensor_print_str += (
                    "{}:".format(var_name) + str(var.numpy()) + ",")

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str + " epoch time: {:.2f} s".format(
                        time.time() - epoch_begin))

        # Checkpoint only once the first metric has passed 0.5.
        if epoch_id % save_checkpoint_interval == 0 and metric_list[
                0].accumulate() > 0.5:
            save_model(
                dy_model, optimizer, model_save_path, epoch_id,
                prefix='rec')  # middle epochs
        # Early stop when the first training metric reaches 0.95.
        if metric_list[0].accumulate() >= 0.95:
            print('Already over fitting, stop training!')
            break

    # NOTE(review): the source's indentation was lost; the final evaluation
    # is placed after the epoch loop so it also runs after an early stop.
    # Confirm it was not meant to run once per epoch.
    infer_auc = infer_test(dy_model, test_dataloader, dy_model_class, config,
                           print_interval, epoch_id)
    return infer_auc, lr, train_batch_size, model_save_path
def main(args):
    """Train the configured model and save a checkpoint after every epoch.

    Supports CPU, GPU and XPU devices (``runner.use_gpu``,
    ``runner.use_xpu``), optional VisualDL scalar logging
    (``runner.use_visual``) and optional collective training through
    ``paddle.distributed.fleet`` (``runner.use_fleet``). With fleet enabled
    only the trainer of rank 0 writes checkpoints.

    Args:
        args: Parsed command-line namespace providing ``config_yaml``,
            ``abs_dir`` and ``opt`` (an iterable of ``"key=value"`` config
            overrides, or a falsy value).
    """
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # modify config from command
    if args.opt:
        for parameter in args.opt:
            # Split on the first "=" only, so a value that itself contains
            # "=" is kept intact instead of raising ValueError on unpacking.
            key, value = parameter.strip().split("=", 1)
            current = config.get(key)
            # Exact type checks on purpose: bool is a subclass of int.
            if type(current) is int:
                value = int(value)
            if type(current) is bool:
                value = value.lower() == "true"
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    use_xpu = config.get("runner.use_xpu", False)
    use_visual = config.get("runner.use_visual", False)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    train_batch_size = config.get("runner.train_batch_size", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)
    use_fleet = config.get("runner.use_fleet", False)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, use_xpu, use_visual, train_batch_size, train_data_dir,
                epochs, print_interval, model_save_path))
    logger.info("**************common.configs**********")

    if use_xpu:
        xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
        place = paddle.set_device(xpu_device)
    else:
        place = paddle.set_device('gpu' if use_gpu else 'cpu')

    dy_model = dy_model_class.create_model(config)

    # Create a log_visual object and store the data in the path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/train")

    if model_init_path is not None:
        load_model(model_init_path, dy_model)

    # to do : add optimizer function
    optimizer = dy_model_class.create_optimizer(dy_model, config)

    # use fleet run collective
    if use_fleet:
        from paddle.distributed import fleet
        strategy = fleet.DistributedStrategy()
        fleet.init(is_collective=True, strategy=strategy)
        optimizer = fleet.distributed_optimizer(optimizer)
        dy_model = fleet.distributed_model(dy_model)

    logger.info("read data")
    train_dataloader = create_data_loader(config=config, place=place)

    last_epoch_id = config.get("last_epoch", -1)
    step_num = 0

    for epoch_id in range(last_epoch_id + 1, epochs):
        # set train mode
        dy_model.train()
        metric_list, metric_list_name = dy_model_class.create_metrics()
        epoch_begin = time.time()
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()

        for batch_id, batch in enumerate(train_dataloader()):
            train_reader_cost += time.time() - reader_start
            optimizer.clear_grad()
            train_start = time.time()
            batch_size = len(batch[0])

            loss, metric_list, tensor_print_dict = dy_model_class.train_forward(
                dy_model, metric_list, batch, config)

            loss.backward()
            optimizer.step()
            train_run_cost += time.time() - train_start
            total_samples += batch_size

            if batch_id % print_interval == 0:
                metric_str = ""
                for metric_id, metric_name in enumerate(metric_list_name):
                    metric_value = metric_list[metric_id].accumulate()
                    metric_str += (
                        metric_name + ":{:.6f}, ".format(metric_value))
                    if use_visual:
                        log_visual.add_scalar(
                            tag="train/" + metric_name,
                            step=step_num,
                            value=metric_value)
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    for var_name, var in tensor_print_dict.items():
                        tensor_print_str += (
                            "{}:".format(var_name) + str(var.numpy()) + ",")
                        if use_visual:
                            log_visual.add_scalar(
                                tag="train/" + var_name,
                                step=step_num,
                                value=var.numpy())
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} ins/s"
                    .format(
                        train_reader_cost / print_interval,
                        (train_reader_cost + train_run_cost) / print_interval,
                        total_samples / print_interval,
                        total_samples / (train_reader_cost + train_run_cost)))
                # Start a new measurement window.
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()
            step_num = step_num + 1

        metric_str = ""
        for metric_id, metric_name in enumerate(metric_list_name):
            metric_str += (metric_name + ": {:.6f},".format(
                metric_list[metric_id].accumulate()))

        tensor_print_str = ""
        if tensor_print_dict is not None:
            for var_name, var in tensor_print_dict.items():
                tensor_print_str += (
                    "{}:".format(var_name) + str(var.numpy()) + ",")

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str + " epoch time: {:.2f} s".format(
                        time.time() - epoch_begin))

        if use_fleet:
            # Only the rank-0 trainer writes the checkpoint.
            trainer_id = paddle.distributed.get_rank()
            if trainer_id == 0:
                save_model(
                    dy_model,
                    optimizer,
                    model_save_path,
                    epoch_id,
                    prefix='rec')
        else:
            save_model(
                dy_model, optimizer, model_save_path, epoch_id, prefix='rec')
def main(args):
    """Train the configured model and save a checkpoint after every epoch.

    Training covers epoch ids from ``last_epoch + 1`` (``last_epoch``
    defaults to -1) up to, but excluding, ``runner.epochs``. Reader and
    compute timings plus the accumulated metrics are logged every
    ``runner.print_interval`` batches.
    """
    paddle.seed(12345)

    # Configuration and model class.
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # Runner settings.
    use_gpu = config.get("runner.use_gpu", True)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)

    banner = "**************common.configs**********"
    logger.info(banner)
    logger.info(
        "use_gpu: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, train_data_dir, epochs, print_interval,
                model_save_path))
    logger.info(banner)

    place = paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)

    # Optional warm start.
    if model_init_path is not None:
        load_model(model_init_path, dy_model)

    optimizer = dy_model_class.create_optimizer(dy_model, config)

    logger.info("read data")
    train_dataloader = create_data_loader(config=config, place=place)

    first_epoch = config.get("last_epoch", -1) + 1

    for epoch_id in range(first_epoch, epochs):
        dy_model.train()
        metric_list, metric_list_name = dy_model_class.create_metrics()

        epoch_begin = time.time()
        interval_begin = time.time()
        reader_seconds = 0.0
        compute_seconds = 0.0
        samples_seen = 0
        reader_start = time.time()

        for batch_id, batch in enumerate(train_dataloader()):
            reader_seconds += time.time() - reader_start
            optimizer.clear_grad()
            step_start = time.time()
            batch_size = len(batch[0])

            loss, metric_list, tensor_print_dict = dy_model_class.train_forward(
                dy_model, metric_list, batch, config)
            loss.backward()
            optimizer.step()

            compute_seconds += time.time() - step_start
            samples_seen += batch_size

            if batch_id % print_interval == 0:
                metric_str = "".join(
                    name + ": {:.6f},".format(metric_list[pos].accumulate())
                    for pos, name in enumerate(metric_list_name))
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    tensor_print_str = "".join(
                        "{}:".format(var_name) + str(var.numpy()) + ","
                        for var_name, var in tensor_print_dict.items())

                prefix = "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id)
                logger.info(
                    prefix + metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec"
                    .format(reader_seconds / print_interval,
                            (reader_seconds + compute_seconds) /
                            print_interval, samples_seen / print_interval,
                            samples_seen /
                            (reader_seconds + compute_seconds)))

                # Start a new measurement window.
                reader_seconds = 0.0
                compute_seconds = 0.0
                samples_seen = 0

            reader_start = time.time()

        # End-of-epoch summary (metrics only) followed by the checkpoint.
        metric_str = "".join(
            name + ": {:.6f},".format(metric_list[pos].accumulate())
            for pos, name in enumerate(metric_list_name))
        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    "epoch time: {:.2f} s".format(time.time() - epoch_begin))

        save_model(
            dy_model, optimizer, model_save_path, epoch_id, prefix='rec')
def main(args):
    """Run inference over saved checkpoints, logging metrics and timings.

    Each epoch id in ``[runner.infer_start_epoch, runner.infer_end_epoch)``
    selects the checkpoint ``<runner.infer_load_path>/<epoch_id>``. A
    checkpoint that fails to load is reported and skipped. Metrics and
    returned tensors are logged every ``runner.print_interval`` batches and,
    when ``runner.use_visual`` is set, also written as VisualDL scalars.

    Args:
        args: Parsed command-line namespace providing ``config_yaml``,
            ``abs_dir`` and ``opt`` (an iterable of ``"key=value"`` config
            overrides, or a falsy value).
    """
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir

    # modify config from command
    if args.opt:
        for parameter in args.opt:
            # Split on the first "=" only so values may contain "=".
            key, value = parameter.strip().split("=", 1)
            current = config.get(key)
            # Coerce to the configured type, as the other --opt entry points
            # in this file do. Otherwise "False" is a truthy string and an
            # overridden epoch bound breaks range(). Exact type checks on
            # purpose: bool is a subclass of int.
            if type(current) is int:
                value = int(value)
            if type(current) is bool:
                value = value.lower() == "true"
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    use_visual = config.get("runner.use_visual", False)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    infer_batch_size = config.get("runner.infer_batch_size", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, use_visual, infer_batch_size, test_data_dir,
                start_epoch, end_epoch, print_interval, model_load_path))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)

    # Create a log_visual object and store the data in the path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/infer")

    logger.info("read data")
    test_dataloader = create_data_loader(
        config=config, place=place, mode="test")

    epoch_begin = time.time()
    interval_begin = time.time()

    # Created once, so metric state carries over from one epoch to the next.
    metric_list, metric_list_name = dy_model_class.create_metrics()
    step_num = 0

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        try:
            load_model(model_path, dy_model)
        except Exception as e:
            # Best effort: report the failed checkpoint and move on to the
            # next epoch.
            print(e)
            continue
        dy_model.eval()
        infer_reader_cost = 0.0
        infer_run_cost = 0.0
        reader_start = time.time()

        for batch_id, batch in enumerate(test_dataloader()):
            infer_reader_cost += time.time() - reader_start
            infer_start = time.time()
            batch_size = len(batch[0])

            metric_list, tensor_print_dict = dy_model_class.infer_forward(
                dy_model, metric_list, batch, config)
            infer_run_cost += time.time() - infer_start

            if batch_id % print_interval == 0:
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    for var_name, var in tensor_print_dict.items():
                        tensor_print_str += (
                            "{}:".format(var_name) + str(var.numpy()) + ",")
                        if use_visual:
                            log_visual.add_scalar(
                                tag="infer/" + var_name,
                                step=step_num,
                                value=var.numpy())
                metric_str = ""
                for metric_id, metric_name in enumerate(metric_list_name):
                    metric_value = metric_list[metric_id].accumulate()
                    metric_str += (
                        metric_name + ": {:.6f},".format(metric_value))
                    if use_visual:
                        log_visual.add_scalar(
                            tag="infer/" + metric_name,
                            step=step_num,
                            value=metric_value)
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.2f} ins/s"
                    .format(infer_reader_cost / print_interval, (
                        infer_reader_cost + infer_run_cost) / print_interval,
                            infer_batch_size, print_interval * batch_size / (
                                time.time() - interval_begin)))
                # Start a new measurement window.
                interval_begin = time.time()
                infer_reader_cost = 0.0
                infer_run_cost = 0.0
            step_num = step_num + 1
            reader_start = time.time()

        metric_str = ""
        for metric_id, metric_name in enumerate(metric_list_name):
            metric_str += (metric_name + ": {:.6f},".format(
                metric_list[metric_id].accumulate()))

        tensor_print_str = ""
        if tensor_print_dict is not None:
            for var_name, var in tensor_print_dict.items():
                tensor_print_str += (
                    "{}:".format(var_name) + str(var.numpy()) + ",")

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str + " epoch time: {:.2f} s".format(
                        time.time() - epoch_begin))
        epoch_begin = time.time()
def main(args):
    """Build top-k recommendations per test user and log ranking measures.

    Each epoch id in ``[runner.infer_start_epoch, runner.infer_end_epoch)``
    selects the checkpoint ``<runner.infer_load_path>/<epoch_id>``. For every
    user in ``data.testSet_u`` the items are scored, items the user already
    rated are set to 0.0, and the ``runner.top_k`` best items are kept. The
    result of ``Measure.rankingMeasure`` is logged once per epoch.
    """

    def _score_items(user_index, item_table, user_table):
        """Return the product of ``item_table`` with one user's embedding."""
        # (1, emb_size)
        user_vector = paddle.nn.functional.embedding(
            x=user_index, weight=user_table)
        # (num_item, emb_size) * (emb_size, 1) -> (num_item, 1)
        return paddle.matmul(item_table,
                             paddle.transpose(
                                 user_vector, perm=[1, 0]))

    paddle.seed(163)

    # Configuration and model class.
    config = load_yaml(args.config_yaml)
    config["config_abs_dir"] = args.abs_dir
    dy_model_class = load_dy_model_class(config)

    # Runner settings.
    use_gpu = config.get("runner.use_gpu", True)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)
    batch_size = config.get("runner.infer_batch_size", None)
    top_k = config.get("runner.top_k", 10)
    os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))

    banner = "**************common.configs**********"
    logger.info(banner)
    logger.info(
        "use_gpu: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, test_data_dir, start_epoch, end_epoch, print_interval,
                model_load_path))
    logger.info(banner)

    place = paddle.set_device('gpu' if use_gpu else 'cpu')
    dy_model = dy_model_class.create_model(config)
    test_dataloader = create_data_loader(
        config=config, place=place, mode="test")

    logger.info("read data")

    epoch_begin = time.time()
    interval_begin = time.time()

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        load_model(os.path.join(model_load_path, str(epoch_id)), dy_model)

        recList = {}
        data = dy_model.data
        num_items = dy_model.num_items
        final_user_embeddings, final_item_embeddings = dy_model.infer_embedding(
        )

        for user in data.testSet_u:
            if not data.containsUser(user):
                # No user entry in the data: every item gets the global mean.
                candidates = [data.globalMean] * num_items
            else:
                user_index = paddle.to_tensor([data.getUserId(user)])
                raw_scores = _score_items(user_index, final_item_embeddings,
                                          final_user_embeddings)
                candidates = paddle.squeeze(raw_scores).numpy()

            # Zero the score of items the user has already rated.
            ratedList, rating_list = data.userRated(user)
            for item in ratedList:
                candidates[data.item[item]] = 0.0

            ids, scores = find_k_largest(top_k, candidates)
            item_names = [data.id2item[iid] for iid in ids]
            recList[user] = list(zip(item_names, scores))

        measure = Measure.rankingMeasure(data.testSet_u, recList, [top_k])
        logger.info("\t".join(measure))