def to_multidevice(batch_iter, num_trainer):
    """Group batches from *batch_iter* into lists of ``num_trainer`` batches.

    Args:
        batch_iter: Zero-argument callable returning an iterable of batches.
        num_trainer: Number of devices; every yielded list contains exactly
            this many consecutive batches, one per device.

    Yields:
        list: ``num_trainer`` consecutive batches.

    Note:
        A trailing partial group (fewer than ``num_trainer`` batches) cannot
        feed every device, so it is discarded with a warning.
    """
    pending = []
    for batch in batch_iter():
        pending.append(batch)
        if len(pending) == num_trainer:
            yield pending
            pending = []
    if pending:
        # Fix: the original adjacent string literals were concatenated
        # without a separating space ("(%s)which"), garbling the message.
        log.warning(
            "The batch (%s) can't fill all device (%s), "
            "which will be discarded." % (len(pending), num_trainer))
def run_epoch(
        py_reader,
        exe,
        program,
        prefix,
        model_dict,
        epoch,
        batch_size,
        log_per_step=100,
        save_per_step=10000, ):
    """Run one epoch of training or evaluation over ``py_reader``.

    Args:
        py_reader: Zero-argument callable yielding feed dicts per batch.
        exe: Executor; ``exe.run`` fetches loss and AUC for each batch.
        program: Program (graph) executed for every batch.
        prefix: Phase tag used in log lines, e.g. ``"train"`` or ``"test"``;
            only ``"train"`` enables the fixed-batch-size filter.
        model_dict: Object exposing ``loss.name`` and ``auc.name`` fetch
            targets.
        epoch: Epoch index (currently unused in the body; kept for
            interface compatibility with callers).
        batch_size: Expected size of dimension 0 of ``src_index``; smaller
            trailing batches are skipped during training.
        log_per_step: Log loss/AUC/speed every this many batches.
        save_per_step: Save a checkpoint every this many batches.
    """
    batch = 0
    for batch_feed_dict in py_reader():
        if prefix == "train":
            # Partial (trailing) batches would break the fixed-shape
            # training program, so skip them.
            if batch_feed_dict["src_index"].shape[0] != batch_size:
                # Fix: message previously hardcoded "1024" although the
                # comparison is against the batch_size parameter.
                log.warning(
                    'batch_feed_dict["src_index"].shape[0] != %s, continue'
                    % batch_size)
                continue
        batch_start = time.time()
        batch += 1
        batch_loss, batch_auc = exe.run(
            program,
            feed=batch_feed_dict,
            fetch_list=[model_dict.loss.name, model_dict.auc.name])
        batch_end = time.time()
        if batch % log_per_step == 0:
            log.info(
                "Batch %s %s-Loss %s \t %s-Auc %s \t Speed(per batch) %.5lf sec"
                % (batch, prefix, np.mean(batch_loss), prefix,
                   np.mean(batch_auc), batch_end - batch_start))
        if batch != 0 and batch % save_per_step == 0:
            fluid.io.save_params(
                exe, dirname='checkpoint', main_program=program)
    # Final save so work done since the last save_per_step boundary (or a
    # whole short epoch) is not lost.
    fluid.io.save_params(exe, dirname='checkpoint', main_program=program)