def train_async(args):
    """Run the metric-learning training loop (PaddlePaddle static graph).

    Builds the train program and a clone-for-test eval program, restores
    weights from a checkpoint or a pretrained model when provided, then
    iterates over the training reader until ``args.total_iter_num``
    iterations have run, periodically logging progress, evaluating
    recall@1 on the test set, and saving checkpoints.

    Args:
        args: parsed command-line namespace. Fields read here:
            model, checkpoint, pretrained_model, model_save_dir, use_gpu,
            train_batch_size, test_batch_size, total_iter_num,
            display_iter_step, test_iter_step, save_iter_step.
    """
    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    # One program for training, plus a for-test clone for evaluation.
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()
    train_loader, train_cost, global_lr, train_feas, train_label = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_loader, test_feas = build_program(
        is_train=False,
        main_prog=tmp_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_feas.name, train_label.name
    ]
    test_fetch_list = [test_feas.name]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    # A single-trainer GPU run feeds every visible CUDA device; multi-trainer
    # or CPU runs feed only the single place chosen above.
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    if num_trainers <= 1 and args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = place

    exe.run(startup_prog)

    # A checkpoint restores the full training state; a pretrained model only
    # seeds matching parameters via the project's load_params helper.
    if checkpoint is not None:
        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)
    if pretrained_model:
        load_params(exe, train_prog, pretrained_model)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = int(os.environ.get('CPU_NUM', 1))
    assert (args.train_batch_size % devicenum) == 0, \
        "train_batch_size must be divisible by the device count"
    # BUGFIX: use floor division — under Python 3 ``/`` yields a float, and
    # the loader expects an integral per-device batch size.
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_loader.set_sample_generator(
        reader.train(args),
        batch_size=train_batch_size,
        drop_last=True,
        places=places)
    test_loader.set_sample_generator(
        reader.test(args),
        batch_size=test_batch_size,
        drop_last=False,
        places=place)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=args.use_gpu,
        loss_name=train_cost.name)

    totalruntime = 0
    iter_no = 0
    train_info = [0, 0, 0]  # [summed loss, summed recall@1, batch count]
    # NOTE: the while condition is only re-checked when the reader is
    # exhausted, so an epoch in progress runs to completion (original
    # behavior, preserved).
    while iter_no <= args.total_iter_num:
        for train_batch in train_loader():
            t1 = time.time()
            lr, loss, feas, label = train_exe.run(
                feed=train_batch, fetch_list=train_fetch_list)
            t2 = time.time()
            period = t2 - t1
            lr = np.mean(np.array(lr))
            train_info[0] += np.mean(np.array(loss))
            train_info[1] += recall_topk(feas, label, k=1)
            train_info[2] += 1

            if iter_no % args.display_iter_step == 0:
                avgruntime = totalruntime / args.display_iter_step
                avg_loss = train_info[0] / train_info[2]
                avg_recall = train_info[1] / train_info[2]
                print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
                      "recall %.4f, time %2.2f sec" % \
                      (fmt_time(), iter_no, lr, avg_loss, avg_recall,
                       avgruntime))
                sys.stdout.flush()
                totalruntime = 0
            if iter_no % 1000 == 0:
                # Reset the running averages every 1000 iterations.
                train_info = [0, 0, 0]

            totalruntime += period

            if iter_no % args.test_iter_step == 0 and iter_no != 0:
                # Full pass over the test set: collect features and labels,
                # then score recall@1 on the stacked arrays.
                f, l = [], []
                for batch_id, test_batch in enumerate(test_loader()):
                    t1 = time.time()
                    [feas] = exe.run(test_prog,
                                     feed=test_batch,
                                     fetch_list=test_fetch_list)
                    label = np.asarray(test_batch[0]['label'])
                    label = np.squeeze(label)
                    f.append(feas)
                    l.append(label)
                    t2 = time.time()
                    period = t2 - t1
                    if batch_id % 20 == 0:
                        print("[%s] testbatch %d, time %2.2f sec" % \
                              (fmt_time(), batch_id, period))
                f = np.vstack(f)
                l = np.hstack(l)
                recall = recall_topk(f, l, k=1)
                print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" % \
                      (fmt_time(), len(f), iter_no, recall))
                sys.stdout.flush()

            if iter_no % args.save_iter_step == 0 and iter_no != 0:
                model_path = os.path.join(model_save_dir, model_name,
                                          str(iter_no))
                fluid.save(program=train_prog, model_path=model_path)

            iter_no += 1
# NOTE(review): this chunk begins mid-method — the enclosing `def` is outside
# this view, so the leading statements are reproduced at an assumed
# class-method indentation (TODO: confirm against the full file).

        # Persist the predictions/targets accumulated for this upload batch,
        # then advance the batch counter.
        utility_predict.save_preds(
            self.t_params, self.m_params, self.li_predictions,
            self.li_timestamps_chunked[:len(self.li_predictions)],
            self.li_true_values, self.era5_eobs.li_loc,
            self.upload_batch_number)
        self.upload_batch_number = self.upload_batch_number + 1
        # Drop the timestamps that the saved predictions covered, and clear
        # the flushed prediction/target buffers.
        self.li_timestamps_chunked = self.li_timestamps_chunked[
            len(self.li_predictions):]
        self.li_predictions = []
        self.li_true_values = []


if __name__ == "__main__":
    # Script entry point: load test-time parameters and run prediction for
    # each requested location in turn.
    s_dir = utility.get_script_directory(sys.argv[0])
    args_dict = utility.parse_arguments(s_dir)
    t_params, m_params = utility.load_params(args_dict, "test")
    #main(t_params(), m_params)
    test_tru_net = TestTruNet(t_params, m_params)
    mts = m_params['model_type_settings']
    # Prefer an explicit 'location_test' list; otherwise fall back to the
    # training 'location' setting.
    locations = mts.get('location_test', None) if mts.get(
        'location_test', None) != None else mts.get('location')
    for loc in locations:
        test_tru_net.initialize_scheme_era5Eobs(location=[loc])
        test_tru_net.predict(min_prob_for_rain=mts.get('prob_thresh', 0.5))
        print(f"Completed Prediction for {loc}")
"""Driver script: pick an Engine for the configured dataset, then train,
test, or run interactive prediction according to the config flags."""
import sys

import engine
import fnlp_engine
import utility

if __name__ == '__main__':
    args = utility.load_params(jsonFile='config.json')
    # Hoist the repeated nested lookup.
    dataset = args['data']['dataset']
    if dataset == 'zh':
        runner = engine.Engine(args)
    elif dataset == 'en':
        runner = fnlp_engine.Engine(args)
    else:
        # BUGFIX: exit with a nonzero status so callers can detect the
        # misconfiguration (bare exit() returned status 0).
        print('Invalid dataset!')
        sys.exit(1)

    if dataset == 'zh' and args['predict']:
        # Interactive single-sentence prediction (zh model only).
        runner.load_model()
        x = input('Input a sentence: ')
        runner.predict(x)
    elif args['train']:
        if args['continue']:
            # Resume from a previously saved model.
            runner.load_model()
        runner.train()
        runner.test()
    else:
        # Evaluation only: load the saved model and test.
        runner.load_model()
        runner.test()
# NOTE(review): this chunk begins mid-method (the enclosing `def` of what
# appears to be a validation step is outside this view); the leading
# statements are reproduced at an assumed method indentation — TODO confirm
# against the full file.

        # Tail of the validation step: fold the batch MSE into the
        # aggregating validation metric, then signal completion.
        self.mse_agg_val(mse)
        return True

    @tf.function
    def distributed_train_step(self, feature, target, mask, bounds, _init):
        # Run one training step on every replica via the distribution
        # strategy. The per-replica results of `train_step` are returned;
        # the local name suggests gradients — verify against train_step.
        gradients = self.strategy.run(self.train_step,
                                      args=(feature, target, mask, bounds,
                                            _init))
        return gradients

    @tf.function
    def distributed_val_step(self, feature, target, mask, bounds):
        # Run one validation step on every replica; `val_step` returns True
        # on completion (see the method tail above).
        bool_completed = self.strategy.run(self.val_step,
                                           args=(feature, target, mask,
                                                 bounds))
        return bool_completed


if __name__ == "__main__":
    # Script entry point: parse CLI arguments, load parameter dicts, then
    # build and train the model.
    s_dir = utility.get_script_directory(sys.argv[0])
    args_dict = utility.parse_arguments(s_dir)

    # get training and model params
    t_params, m_params = utility.load_params(args_dict)

    # Initialize and train model
    weather_model = WeatherModel(t_params, m_params)
    weather_model.initialize_scheme_era5Eobs()
    weather_model.train_model()