def print_on_training(self,
                      tb_writer,
                      cur_batch,
                      sess,
                      natoms,
                      feed_dict_test,
                      feed_dict_batch):  # deprecated
    run_data = [
        self.l2_l,
        self.l2_more['l2_ener_loss'],
        self.l2_more['l2_force_loss'],
        self.l2_more['l2_virial_loss'],
        self.l2_more['l2_atom_ener_loss'],
        self.l2_more['l2_pref_force_loss']
    ]

    # first train data
    train_out = run_sess(sess, run_data, feed_dict=feed_dict_batch)
    error_train, error_e_train, error_f_train, error_v_train, \
        error_ae_train, error_pf_train = train_out

    # then test data; if a tensorboard log writer is present, compute the
    # summary and write tensorboard logs
    if tb_writer:
        summary_merged_op = tf.summary.merge([
            self.l2_loss_summary,
            self.l2_loss_ener_summary,
            self.l2_loss_force_summary,
            self.l2_loss_virial_summary
        ])
        run_data.insert(0, summary_merged_op)

    test_out = run_sess(sess, run_data, feed_dict=feed_dict_test)

    if tb_writer:
        summary = test_out.pop(0)
        tb_writer.add_summary(summary, cur_batch)

    error_test, error_e_test, error_f_test, error_v_test, \
        error_ae_test, error_pf_test = test_out

    print_str = ""
    prop_fmt = " %11.2e %11.2e"
    print_str += prop_fmt % (np.sqrt(error_test), np.sqrt(error_train))
    if self.has_e:
        print_str += prop_fmt % (np.sqrt(error_e_test) / natoms[0],
                                 np.sqrt(error_e_train) / natoms[0])
    if self.has_ae:
        print_str += prop_fmt % (np.sqrt(error_ae_test),
                                 np.sqrt(error_ae_train))
    if self.has_f:
        print_str += prop_fmt % (np.sqrt(error_f_test),
                                 np.sqrt(error_f_train))
    if self.has_v:
        print_str += prop_fmt % (np.sqrt(error_v_test) / natoms[0],
                                 np.sqrt(error_v_train) / natoms[0])
    if self.has_pf:
        print_str += prop_fmt % (np.sqrt(error_pf_test),
                                 np.sqrt(error_pf_train))

    return print_str
def print_on_training(self,
                      tb_writer,
                      cur_batch,
                      sess,
                      natoms,
                      feed_dict_test,
                      feed_dict_batch):  # deprecated

    # YHT: added to calculate the number of atoms
    atoms = 0
    if self.type_sel is not None:
        for w in self.type_sel:
            atoms += natoms[2 + w]
    else:
        atoms = natoms[0]

    run_data = [
        self.l2_l,
        self.l2_more['local_loss'],
        self.l2_more['global_loss']
    ]

    summary_list = [self.l2_loss_summary]
    if self.local_weight > 0.0:
        summary_list.append(self.l2_loss_local_summary)
    if self.global_weight > 0.0:
        summary_list.append(self.l2_loss_global_summary)

    # first train data
    error_train = run_sess(sess, run_data, feed_dict=feed_dict_batch)

    # then test data; if a tensorboard log writer is present, compute the
    # summary and write tensorboard logs
    if tb_writer:
        # summary_merged_op = tf.summary.merge([self.l2_loss_summary])
        summary_merged_op = tf.summary.merge(summary_list)
        run_data.insert(0, summary_merged_op)

    test_out = run_sess(sess, run_data, feed_dict=feed_dict_test)

    if tb_writer:
        summary = test_out.pop(0)
        tb_writer.add_summary(summary, cur_batch)

    error_test = test_out

    print_str = ""
    prop_fmt = " %11.2e %11.2e"
    print_str += prop_fmt % (np.sqrt(error_test[0]), np.sqrt(error_train[0]))
    if self.local_weight > 0.0:
        print_str += prop_fmt % (np.sqrt(error_test[1]), np.sqrt(error_train[1]))
    if self.global_weight > 0.0:
        print_str += prop_fmt % (np.sqrt(error_test[2]) / atoms,
                                 np.sqrt(error_train[2]) / atoms)

    return print_str
def _eval_fv(self, coords, cells, atom_types, ext_f):
    # reshape the inputs
    cells = np.reshape(cells, [-1, 9])
    nframes = cells.shape[0]
    coords = np.reshape(coords, [nframes, -1])
    natoms = coords.shape[1] // 3

    # sort inputs
    coords, atom_types, imap, sel_at, sel_imap = self.sort_input(
        coords, atom_types, sel_atoms=self.get_sel_type())

    # make natoms_vec and default_mesh
    natoms_vec = self.make_natoms_vec(atom_types)
    assert (natoms_vec[0] == natoms)
    default_mesh = make_default_mesh(cells)

    # evaluate
    tensor = []
    feed_dict_test = {}
    feed_dict_test[self.t_natoms] = natoms_vec
    feed_dict_test[self.t_type] = np.tile(atom_types, [nframes, 1]).reshape([-1])
    feed_dict_test[self.t_coord] = coords.reshape([-1])
    feed_dict_test[self.t_box] = cells.reshape([-1])
    feed_dict_test[self.t_mesh] = default_mesh.reshape([-1])
    feed_dict_test[self.t_ef] = ext_f.reshape([-1])
    # print(run_sess(self.sess, tf.shape(self.t_tensor), feed_dict = feed_dict_test))
    fout, vout, avout = run_sess(
        self.sess, [self.force, self.virial, self.av],
        feed_dict=feed_dict_test)
    # print('fout: ', fout.shape, fout)
    fout = self.reverse_map(np.reshape(fout, [nframes, -1, 3]), imap)
    fout = np.reshape(fout, [nframes, -1])
    return fout, vout, avout
def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, List[int]]:
    """Get the statistics of the training data, including the nearest
    neighbor distance between atoms and the max neighbor size of atoms.

    Parameters
    ----------
    data
        Class for manipulating many data systems. It is implemented with
        the help of DeepmdData.

    Returns
    -------
    min_nbor_dist
        The nearest distance between neighbor atoms
    max_nbor_size
        A list with ntypes integers, denoting the actually achieved max sel
    """
    self.min_nbor_dist = 100.0
    self.max_nbor_size = [0] * self.ntypes

    # for ii in tqdm(range(len(data.system_dirs)), desc = 'DEEPMD INFO |-> deepmd.utils.neighbor_stat\t\t\tgetting neighbor status'):
    for ii in range(len(data.system_dirs)):
        for jj in data.data_systems[ii].dirs:
            data_set = data.data_systems[ii]._load_set(jj)
            for kk in range(np.array(data_set['type']).shape[0]):
                mn, dt = run_sess(
                    self.sub_sess,
                    [self._max_nbor_size, self._min_nbor_dist],
                    feed_dict={
                        self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]),
                        self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]),
                        self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]),
                        self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]),
                        self.place_holders['default_mesh']: np.array(data.default_mesh[ii]),
                    })
                if dt.size != 0:
                    dt = np.min(dt)
                else:
                    dt = self.rcut
                    log.warning(
                        "Atoms with no neighbors found in %s. "
                        "Please make sure it's what you expected." % jj)
                if dt < self.min_nbor_dist:
                    if math.isclose(dt, 0., rel_tol=1e-6):
                        # it's unexpected that the distance between two atoms is zero
                        # zero distance will cause nan (#874)
                        raise RuntimeError(
                            "Some atoms in %s are overlapping. Please check your"
                            " training data to remove duplicated atoms." % jj)
                    self.min_nbor_dist = dt
                for ww in range(self.ntypes):
                    var = np.max(mn[:, ww])
                    if var > self.max_nbor_size[ww]:
                        self.max_nbor_size[ww] = var

    log.info('training data with min nbor dist: ' + str(self.min_nbor_dist))
    log.info('training data with max nbor size: ' + str(self.max_nbor_size))
    return self.min_nbor_dist, self.max_nbor_size
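# Usage sketch for get_stat (hypothetical): collect neighbor statistics over a
# data set before choosing `sel` in the descriptor. The `NeighborStat` and
# `DeepmdDataSystem` constructor arguments below are illustrative assumptions
# about this codebase's API, not verified calls.
data = DeepmdDataSystem(systems=["system_0"], batch_size=1, test_size=1, rcut=6.0)
nstat = NeighborStat(ntypes=2, rcut=6.0)
min_nbor_dist, max_nbor_size = nstat.get_stat(data)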
def _compute_dstats_sys_se_r(self,
                             data_coord,
                             data_box,
                             data_atype,
                             natoms_vec,
                             mesh):
    dd_all = run_sess(
        self.sub_sess, self.stat_descrpt,
        feed_dict={
            self.place_holders['coord']: data_coord,
            self.place_holders['type']: data_atype,
            self.place_holders['natoms_vec']: natoms_vec,
            self.place_holders['box']: data_box,
            self.place_holders['default_mesh']: mesh,
        })
    natoms = natoms_vec
    dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]])
    start_index = 0
    sysr = []
    sysn = []
    sysr2 = []
    for type_i in range(self.ntypes):
        end_index = start_index + self.ndescrpt * natoms[2 + type_i]
        dd = dd_all[:, start_index:end_index]
        dd = np.reshape(dd, [-1, self.ndescrpt])
        start_index = end_index
        # compute
        dd = np.reshape(dd, [-1, 1])
        ddr = dd[:, :1]
        sumr = np.sum(ddr)
        sumn = dd.shape[0]
        sumr2 = np.sum(np.multiply(ddr, ddr))
        sysr.append(sumr)
        sysn.append(sumn)
        sysr2.append(sumr2)
    return sysr, sysr2, sysn
def get_tensor_by_name_from_graph(graph: tf.Graph, tensor_name: str) -> tf.Tensor:
    """Load the tensor value from the given tf.Graph object.

    Parameters
    ----------
    graph : tf.Graph
        The input TensorFlow graph
    tensor_name : str
        Indicates which tensor will be loaded from the frozen model

    Returns
    -------
    tf.Tensor
        The tensor loaded from the frozen model

    Raises
    ------
    GraphWithoutTensorError
        Raised when the tensor_name is not present in the frozen model
    """
    try:
        tensor = graph.get_tensor_by_name(tensor_name + ':0')
    except KeyError as e:
        raise GraphWithoutTensorError() from e
    with tf.Session(graph=graph) as sess:
        tensor = run_sess(sess, tensor)
    return tensor
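# Usage sketch (hypothetical): load a frozen graph from disk with the standard
# TF1 protobuf API, then read one of its constant tensors. The file name
# "frozen_model.pb" is an assumed path; "descrpt_attr/rcut" is one of the
# tensor names listed elsewhere in this module (the ":0" suffix is appended
# by the function itself).
frozen_graph = tf.Graph()
with frozen_graph.as_default():
    graph_def = tf.GraphDef()
    with tf.gfile.GFile("frozen_model.pb", "rb") as f:
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name="")
rcut = get_tensor_by_name_from_graph(frozen_graph, "descrpt_attr/rcut")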
def _eval_descriptor_inner(self,
                           coords: np.ndarray,
                           cells: np.ndarray,
                           atom_types: List[int],
                           fparam: Optional[np.ndarray] = None,
                           aparam: Optional[np.ndarray] = None,
                           efield: Optional[np.ndarray] = None,
                           ) -> np.ndarray:
    natoms, nframes = self._get_natoms_and_nframes(coords, atom_types)
    feed_dict_test, imap = self._prepare_feed_dict(
        coords, cells, atom_types, fparam, aparam, efield)
    descriptor, = run_sess(
        self.sess, [self.t_descriptor], feed_dict=feed_dict_test)
    return self.reverse_map(
        np.reshape(descriptor, [nframes, natoms, -1]), imap)
def model_type(self) -> str:
    """Get the type of model.

    :type: str
    """
    if not self._model_type:
        t_mt = self._get_tensor("model_attr/model_type:0")
        sess = tf.Session(graph=self.graph, config=default_tf_session_config)
        [mt] = run_sess(sess, [t_mt], feed_dict={})
        self._model_type = mt.decode("utf-8")
    return self._model_type
def valid_on_the_fly(self,
                     fp,
                     train_batches,
                     valid_batches,
                     print_header=False):
    train_results = self.get_evaluation_results(train_batches)
    valid_results = self.get_evaluation_results(valid_batches)

    cur_batch = self.cur_batch
    current_lr = run_sess(self.sess, self.learning_rate)
    if print_header:
        self.print_header(fp, train_results, valid_results)
    self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr)
def eval(self, sess, feed_dict, natoms):
    run_data = [
        self.l2_l,
        self.l2_more['l2_ener_loss'],
        self.l2_more['l2_ener_dipole_loss']
    ]
    error, error_e, error_ed = run_sess(sess, run_data, feed_dict=feed_dict)
    results = {
        'natoms': natoms[0],
        'rmse': np.sqrt(error),
        'rmse_e': np.sqrt(error_e) / natoms[0],
        'rmse_ed': np.sqrt(error_ed)
    }
    return results
def _eval_inner(self,
                coords,
                cells,
                atom_types,
                fparam=None,
                aparam=None,
                atomic=False,
                efield=None):
    natoms, nframes = self._get_natoms_and_nframes(coords, atom_types)
    feed_dict_test, imap = self._prepare_feed_dict(
        coords, cells, atom_types, fparam, aparam, efield)
    t_out = [self.t_energy, self.t_force, self.t_virial]
    if atomic:
        t_out += [self.t_ae, self.t_av]

    v_out = run_sess(self.sess, t_out, feed_dict=feed_dict_test)
    energy = v_out[0]
    force = v_out[1]
    virial = v_out[2]
    if atomic:
        ae = v_out[3]
        av = v_out[4]

    # reverse map of the outputs
    force = self.reverse_map(np.reshape(force, [nframes, -1, 3]), imap)
    if atomic:
        ae = self.reverse_map(np.reshape(ae, [nframes, -1, 1]), imap)
        av = self.reverse_map(np.reshape(av, [nframes, -1, 9]), imap)

    energy = np.reshape(energy, [nframes, 1])
    force = np.reshape(force, [nframes, natoms, 3])
    virial = np.reshape(virial, [nframes, 9])
    if atomic:
        ae = np.reshape(ae, [nframes, natoms, 1])
        av = np.reshape(av, [nframes, natoms, 9])
        return energy, force, virial, ae, av
    else:
        return energy, force, virial
def eval(self,
         coord: np.ndarray,
         charge: np.ndarray,
         box: np.ndarray
         ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Evaluate the energy, force, and virial.

    Parameters
    ----------
    coord
        The coordinates of atoms
    charge
        The atomic charge
    box
        The simulation region. PBC is assumed.

    Returns
    -------
    e
        The energy
    f
        The force
    v
        The virial
    """
    coord = np.array(coord)
    charge = np.array(charge)
    box = np.array(box)
    nframes = charge.shape[0]
    natoms = charge.shape[1]
    coord = np.reshape(coord, [nframes * 3 * natoms])
    charge = np.reshape(charge, [nframes * natoms])
    box = np.reshape(box, [nframes * 9])

    [energy, force, virial] = run_sess(
        self.sess, [self.t_energy, self.t_force, self.t_virial],
        feed_dict={
            self.t_coord: coord,
            self.t_charge: charge,
            self.t_box: box,
            self.t_nloc: [natoms],
        })
    return energy, force, virial
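# Usage sketch (hypothetical values): evaluate the reciprocal-space Ewald
# contribution for one frame of 4 point charges in a 10 A cubic box. The
# positional constructor arguments mirror the EwaldRecp(self.ewald_h,
# self.ewald_beta) call used elsewhere in this module; all numbers below are
# illustrative.
er = EwaldRecp(1.0, 1.0)
coord = np.random.uniform(0., 10., size=(1, 4 * 3))   # (nframes, 3 * natoms)
charge = np.array([[1., -1., 1., -1.]])               # (nframes, natoms)
box = 10. * np.eye(3).reshape(1, 9)                   # (nframes, 9)
e, f, v = er.eval(coord, charge, box)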
def model_version(self) -> str:
    """Get the version of model.

    Returns
    -------
    str
        version of model
    """
    if not self._model_version:
        try:
            t_mt = self._get_tensor("model_attr/model_version:0")
        except KeyError:
            # For deepmd-kit version 0.x - 1.x, set model version to 0.0
            self._model_version = "0.0"
        else:
            sess = tf.Session(graph=self.graph, config=default_tf_session_config)
            [mt] = run_sess(sess, [t_mt], feed_dict={})
            self._model_version = mt.decode("utf-8")
    return self._model_version
def __init__(self,
             model_name: str,
             model_charge_map: List[float],
             sys_charge_map: List[float],
             ewald_h: float = 1,
             ewald_beta: float = 1
             ) -> None:
    """Constructor."""
    # the dipole model is loaded with prefix 'dipole_charge'
    self.modifier_prefix = 'dipole_charge'
    # init dipole model
    DeepDipole.__init__(self,
                        model_name,
                        load_prefix=self.modifier_prefix,
                        default_tf_graph=True)
    self.model_name = model_name
    self.model_charge_map = model_charge_map
    self.sys_charge_map = sys_charge_map
    self.sel_type = list(self.get_sel_type())
    # init ewald recp
    self.ewald_h = ewald_h
    self.ewald_beta = ewald_beta
    self.er = EwaldRecp(self.ewald_h, self.ewald_beta)
    # dimension of the dipole
    self.ext_dim = 3
    self.t_ndesc = self.graph.get_tensor_by_name(
        os.path.join(self.modifier_prefix, 'descrpt_attr/ndescrpt:0'))
    self.t_sela = self.graph.get_tensor_by_name(
        os.path.join(self.modifier_prefix, 'descrpt_attr/sel:0'))
    [self.ndescrpt, self.sel_a] = run_sess(self.sess, [self.t_ndesc, self.t_sela])
    self.sel_r = [0 for ii in range(len(self.sel_a))]
    self.nnei_a = np.cumsum(self.sel_a)[-1]
    self.nnei_r = np.cumsum(self.sel_r)[-1]
    self.nnei = self.nnei_a + self.nnei_r
    self.ndescrpt_a = self.nnei_a * 4
    self.ndescrpt_r = self.nnei_r * 1
    assert (self.ndescrpt == self.ndescrpt_a + self.ndescrpt_r)
    self.force = None
    self.ntypes = len(self.sel_a)
def eval(self, sess, feed_dict, natoms):
    atoms = 0
    if self.type_sel is not None:
        for w in self.type_sel:
            atoms += natoms[2 + w]
    else:
        atoms = natoms[0]

    run_data = [
        self.l2_l,
        self.l2_more['local_loss'],
        self.l2_more['global_loss']
    ]
    error, error_lc, error_gl = run_sess(sess, run_data, feed_dict=feed_dict)

    results = {"natoms": atoms, "rmse": np.sqrt(error)}
    if self.local_weight > 0.0:
        results["rmse_lc"] = np.sqrt(error_lc)
    if self.global_weight > 0.0:
        results["rmse_gl"] = np.sqrt(error_gl) / atoms
    return results
def eval(self, sess, feed_dict, natoms):
    run_data = [
        self.l2_l,
        self.l2_more['l2_ener_loss'],
        self.l2_more['l2_force_loss'],
        self.l2_more['l2_virial_loss'],
        self.l2_more['l2_atom_ener_loss'],
        self.l2_more['l2_pref_force_loss']
    ]
    error, error_e, error_f, error_v, error_ae, error_pf = run_sess(
        sess, run_data, feed_dict=feed_dict)

    results = {"natoms": natoms[0], "rmse": np.sqrt(error)}
    if self.has_e:
        results["rmse_e"] = np.sqrt(error_e) / natoms[0]
    if self.has_ae:
        results["rmse_ae"] = np.sqrt(error_ae)
    if self.has_f:
        results["rmse_f"] = np.sqrt(error_f)
    if self.has_v:
        results["rmse_v"] = np.sqrt(error_v) / natoms[0]
    if self.has_pf:
        results["rmse_pf"] = np.sqrt(error_pf)
    return results
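# Usage sketch (hypothetical objects): evaluate the loss terms on a validation
# batch and report the per-atom energy RMSE. `loss`, `sess`, `feed_dict`, and
# `natoms` stand in for objects created elsewhere in the trainer; only the
# dictionary keys mirror the eval() above.
results = loss.eval(sess, feed_dict, natoms)
print("energy RMSE/atom: %11.2e" % results.get("rmse_e", float("nan")))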
def train(self, train_data=None, valid_data=None):
    # if valid_data is None:  # no validation set specified.
    #     valid_data = train_data  # using training set as validation set.

    stop_batch = self.stop_batch
    self._init_session()

    # Before data shard is enabled, only the chief does evaluation and records it
    # self.print_head()
    fp = None
    if self.run_opt.is_chief:
        fp = open(self.disp_file, "a")

    cur_batch = run_sess(self.sess, self.global_step)
    is_first_step = True
    self.cur_batch = cur_batch
    log.info(
        "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e"
        % (run_sess(self.sess, self.learning_rate),
           self.lr.value(cur_batch),
           self.lr.decay_steps_,
           self.lr.decay_rate_,
           self.lr.value(stop_batch)))

    prf_options = None
    prf_run_metadata = None
    if self.profiling:
        prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        prf_run_metadata = tf.RunMetadata()

    # set tensorboard execution environment
    if self.tensorboard:
        summary_merged_op = tf.summary.merge_all()
        # remove the old tensorboard logging directory from the previous run
        try:
            shutil.rmtree(self.tensorboard_log_dir)
        except FileNotFoundError:
            pass  # directory does not exist, this is OK
        except Exception as e:
            # general error when removing directory, warn user
            log.exception(
                f"Could not remove old tensorboard logging directory: "
                f"{self.tensorboard_log_dir}. Error: {e}")
        else:
            log.debug("Removing old tensorboard log directory.")
        tb_train_writer = tf.summary.FileWriter(
            self.tensorboard_log_dir + '/train', self.sess.graph)
        tb_valid_writer = tf.summary.FileWriter(self.tensorboard_log_dir + '/test')
    else:
        tb_train_writer = None
        tb_valid_writer = None
    if self.enable_profiler:
        # https://www.tensorflow.org/guide/profiler
        tfv2.profiler.experimental.start(self.tensorboard_log_dir)

    train_time = 0
    while cur_batch < stop_batch:

        # first round validation:
        train_batch = train_data.get_batch()
        if self.display_in_training and is_first_step:
            if self.run_opt.is_chief:
                valid_batches = [
                    valid_data.get_batch() for ii in range(self.valid_numb_batch)
                ] if valid_data is not None else None
                self.valid_on_the_fly(fp, [train_batch], valid_batches, print_header=True)
            is_first_step = False

        if self.timing_in_training:
            tic = time.time()
        train_feed_dict = self.get_feed_dict(train_batch, is_training=True)
        # use tensorboard to visualize the training of deepmd-kit;
        # it will take some extra execution time to generate the tensorboard data
        if self.tensorboard and (cur_batch % self.tensorboard_freq == 0):
            summary, _ = run_sess(self.sess, [summary_merged_op, self.train_op],
                                  feed_dict=train_feed_dict,
                                  options=prf_options,
                                  run_metadata=prf_run_metadata)
            tb_train_writer.add_summary(summary, cur_batch)
        else:
            run_sess(self.sess, [self.train_op],
                     feed_dict=train_feed_dict,
                     options=prf_options,
                     run_metadata=prf_run_metadata)
        if self.timing_in_training:
            toc = time.time()
            train_time += toc - tic
        cur_batch = run_sess(self.sess, self.global_step)
        self.cur_batch = cur_batch

        # on-the-fly validation
        if self.display_in_training and (cur_batch % self.disp_freq == 0):
            if self.timing_in_training:
                tic = time.time()
            if self.run_opt.is_chief:
                valid_batches = [
                    valid_data.get_batch() for ii in range(self.valid_numb_batch)
                ] if valid_data is not None else None
                self.valid_on_the_fly(fp, [train_batch], valid_batches)
            if self.timing_in_training:
                toc = time.time()
                test_time = toc - tic
                log.info("batch %7d training time %.2f s, testing time %.2f s"
                         % (cur_batch, train_time, test_time))
                train_time = 0
        if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.saver is not None:
            self.save_checkpoint(cur_batch)

    if (self.save_freq == 0 or cur_batch == 0 or cur_batch % self.save_freq != 0) and self.saver is not None:
        self.save_checkpoint(cur_batch)
    if self.run_opt.is_chief:
        fp.close()
    if self.profiling and self.run_opt.is_chief:
        fetched_timeline = timeline.Timeline(prf_run_metadata.step_stats)
        chrome_trace = fetched_timeline.generate_chrome_trace_format()
        with open(self.profiling_file, 'w') as f:
            f.write(chrome_trace)
    if self.enable_profiler and self.run_opt.is_chief:
        tfv2.profiler.experimental.stop()
def _eval_inner(self,
                coords,
                cells,
                atom_types,
                fparam=None,
                aparam=None,
                atomic=False,
                efield=None):
    # standardize the shape of inputs
    atom_types = np.array(atom_types, dtype=int).reshape([-1])
    natoms = atom_types.size
    coords = np.reshape(np.array(coords), [-1, natoms * 3])
    nframes = coords.shape[0]
    if cells is None:
        pbc = False
        # make cells to work around the requirement of pbc
        cells = np.tile(np.eye(3), [nframes, 1]).reshape([nframes, 9])
    else:
        pbc = True
        cells = np.array(cells).reshape([nframes, 9])

    if self.has_fparam:
        assert fparam is not None
        fparam = np.array(fparam)
    if self.has_aparam:
        assert aparam is not None
        aparam = np.array(aparam)
    if self.has_efield:
        assert efield is not None, \
            "you are using a model with external field, parameter efield should be provided"
        efield = np.array(efield)

    # reshape the inputs
    if self.has_fparam:
        fdim = self.get_dim_fparam()
        if fparam.size == nframes * fdim:
            fparam = np.reshape(fparam, [nframes, fdim])
        elif fparam.size == fdim:
            fparam = np.tile(fparam.reshape([-1]), [nframes, 1])
        else:
            raise RuntimeError(
                'got wrong size of frame param, should be either %d x %d or %d'
                % (nframes, fdim, fdim))
    if self.has_aparam:
        fdim = self.get_dim_aparam()
        if aparam.size == nframes * natoms * fdim:
            aparam = np.reshape(aparam, [nframes, natoms * fdim])
        elif aparam.size == natoms * fdim:
            aparam = np.tile(aparam.reshape([-1]), [nframes, 1])
        elif aparam.size == fdim:
            aparam = np.tile(aparam.reshape([-1]), [nframes, natoms])
        else:
            raise RuntimeError(
                'got wrong size of atom param, should be either %d x %d x %d or %d x %d or %d'
                % (nframes, natoms, fdim, natoms, fdim, fdim))

    # sort inputs
    coords, atom_types, imap = self.sort_input(coords, atom_types)
    if self.has_efield:
        efield = np.reshape(efield, [nframes, natoms, 3])
        efield = efield[:, imap, :]
        efield = np.reshape(efield, [nframes, natoms * 3])

    # make natoms_vec and default_mesh
    natoms_vec = self.make_natoms_vec(atom_types)
    assert natoms_vec[0] == natoms

    # evaluate
    feed_dict_test = {}
    feed_dict_test[self.t_natoms] = natoms_vec
    feed_dict_test[self.t_type] = np.tile(atom_types, [nframes, 1]).reshape([-1])
    t_out = [self.t_energy, self.t_force, self.t_virial]
    if atomic:
        t_out += [self.t_ae, self.t_av]

    feed_dict_test[self.t_coord] = np.reshape(coords, [-1])
    feed_dict_test[self.t_box] = np.reshape(cells, [-1])
    if self.has_efield:
        feed_dict_test[self.t_efield] = np.reshape(efield, [-1])
    if pbc:
        feed_dict_test[self.t_mesh] = make_default_mesh(cells)
    else:
        feed_dict_test[self.t_mesh] = np.array([], dtype=np.int32)
    if self.has_fparam:
        feed_dict_test[self.t_fparam] = np.reshape(fparam, [-1])
    if self.has_aparam:
        feed_dict_test[self.t_aparam] = np.reshape(aparam, [-1])

    v_out = run_sess(self.sess, t_out, feed_dict=feed_dict_test)
    energy = v_out[0]
    force = v_out[1]
    virial = v_out[2]
    if atomic:
        ae = v_out[3]
        av = v_out[4]

    # reverse map of the outputs
    force = self.reverse_map(np.reshape(force, [nframes, -1, 3]), imap)
    if atomic:
        ae = self.reverse_map(np.reshape(ae, [nframes, -1, 1]), imap)
        av = self.reverse_map(np.reshape(av, [nframes, -1, 9]), imap)

    energy = np.reshape(energy, [nframes, 1])
    force = np.reshape(force, [nframes, natoms, 3])
    virial = np.reshape(virial, [nframes, 9])
    if atomic:
        ae = np.reshape(ae, [nframes, natoms, 1])
        av = np.reshape(av, [nframes, natoms, 9])
        return energy, force, virial, ae, av
    else:
        return energy, force, virial
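# Illustration (hypothetical dimensions): for nframes = 2 and a model with
# dim_fparam = 3, _eval_inner accepts either one frame parameter per frame or
# a single frame parameter broadcast over all frames, mirroring the size
# checks above.
fparam_per_frame = np.zeros([2, 3])            # size == nframes * fdim
fparam_shared = np.zeros([3])                  # size == fdim
np.tile(fparam_shared.reshape([-1]), [2, 1])   # tiled to shape (2, 3), as above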
def _init_session(self):
    config = get_tf_session_config()
    device, idx = self.run_opt.my_device.split(":", 1)
    if device == "gpu":
        config.gpu_options.visible_device_list = idx
    self.sess = tf.Session(config=config)

    # Initialize or restore global variables
    init_op = tf.global_variables_initializer()
    if self.run_opt.is_chief:
        self.saver = tf.train.Saver(save_relative_paths=True)
        if self.run_opt.init_mode == 'init_from_scratch':
            log.info("initialize model from scratch")
            run_sess(self.sess, init_op)
            if not self.is_compress:
                fp = open(self.disp_file, "w")
                fp.close()
        elif self.run_opt.init_mode == 'init_from_model':
            log.info("initialize from model %s" % self.run_opt.init_model)
            run_sess(self.sess, init_op)
            self.saver.restore(self.sess, self.run_opt.init_model)
            run_sess(self.sess, self.global_step.assign(0))
            fp = open(self.disp_file, "w")
            fp.close()
        elif self.run_opt.init_mode == 'restart':
            log.info("restart from model %s" % self.run_opt.restart)
            run_sess(self.sess, init_op)
            self.saver.restore(self.sess, self.run_opt.restart)
        elif self.run_opt.init_mode == 'init_from_frz_model':
            log.info("initialize training from the frozen model")
            run_sess(self.sess, init_op)
            fp = open(self.disp_file, "w")
            fp.close()
        else:
            raise RuntimeError("unknown init mode")
    else:
        run_sess(self.sess, init_op)
        self.saver = None

    # Ensure variable consistency among tasks when training starts
    if self.run_opt.is_distrib:
        bcast_op = self.run_opt._HVD.broadcast_global_variables(0)
        if self.run_opt.is_chief:
            log.info('broadcast global variables to other tasks')
        else:
            log.info('receive global variables from task#0')
        run_sess(self.sess, bcast_op)
def freeze(*, checkpoint_folder: str, output: str, node_names: Optional[str] = None, **kwargs):
    """Freeze the graph in the supplied folder.

    Parameters
    ----------
    checkpoint_folder : str
        location of the folder with the model
    output : str
        output file name
    node_names : Optional[str], optional
        names of nodes to output, by default None
    """
    # We retrieve our checkpoint fullpath
    checkpoint = tf.train.get_checkpoint_state(checkpoint_folder)
    input_checkpoint = checkpoint.model_checkpoint_path

    # expand the output file to full path
    output_graph = abspath(output)

    # Before exporting our graph, we need to specify our output nodes.
    # This is how TF decides what part of the graph it has to keep
    # and what part it can dump.
    # NOTE: this variable is plural, because you can have multiple output nodes
    # node_names = "energy_test,force_test,virial_test,t_rcut"

    # We clear devices to allow TensorFlow to control
    # on which device it will load operations
    clear_devices = True

    # We import the meta graph and retrieve a Saver
    try:
        # in case of parallel training
        import horovod.tensorflow as _
    except ImportError:
        pass
    saver = tf.train.import_meta_graph(
        f"{input_checkpoint}.meta", clear_devices=clear_devices)

    # We retrieve the protobuf graph definition
    graph = tf.get_default_graph()
    input_graph_def = graph.as_graph_def()
    nodes = [n.name for n in input_graph_def.node]

    # We start a session and restore the graph weights
    with tf.Session() as sess:
        saver.restore(sess, input_checkpoint)
        model_type = run_sess(sess, "model_attr/model_type:0",
                              feed_dict={}).decode("utf-8")
        if "modifier_attr/type" in nodes:
            modifier_type = run_sess(sess, "modifier_attr/type:0",
                                     feed_dict={}).decode("utf-8")
        else:
            modifier_type = None
        if node_names is None:
            output_node_list = _make_node_names(model_type, modifier_type)
            different_set = set(output_node_list) - set(nodes)
            if different_set:
                log.warning(
                    "The following nodes are not in the graph: %s. "
                    "Skip freezing these nodes. You may be freezing "
                    "a checkpoint generated by an old version." % different_set)
                # use the intersection as the output list
                output_node_list = list(set(output_node_list) & set(nodes))
        else:
            output_node_list = node_names.split(",")
        log.info(f"The following nodes will be frozen: {output_node_list}")

        # We use a built-in TF helper to export variables to constants
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,  # The session is used to retrieve the weights
            input_graph_def,  # The graph_def is used to retrieve the nodes
            output_node_list,  # The output node names are used to select the useful nodes
        )

        # Finally we serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        log.info(f"{len(output_graph_def.node):d} ops in the final graph.")
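# Usage sketch (hypothetical paths): freeze the latest checkpoint found in
# "./model-ckpt" into a single protobuf file, using the keyword-only signature
# above. In deepmd-kit this is normally driven by the `dp freeze` CLI.
freeze(checkpoint_folder="./model-ckpt", output="frozen_model.pb")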
def train(self, train_data=None, valid_data=None):
    # if valid_data is None:  # no validation set specified.
    #     valid_data = train_data  # using training set as validation set.

    stop_batch = self.stop_batch
    self._init_session()

    # Before data shard is enabled, only the chief does evaluation and records it
    # self.print_head()
    fp = None
    if self.run_opt.is_chief:
        fp = open(self.disp_file, "a")

    cur_batch = run_sess(self.sess, self.global_step)
    is_first_step = True
    self.cur_batch = cur_batch
    log.info(
        "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e"
        % (run_sess(self.sess, self.learning_rate),
           self.lr.value(cur_batch),
           self.lr.decay_steps_,
           self.lr.decay_rate_,
           self.lr.value(stop_batch)))

    prf_options = None
    prf_run_metadata = None
    if self.profiling:
        prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        prf_run_metadata = tf.RunMetadata()

    # set tensorboard execution environment
    if self.tensorboard:
        summary_merged_op = tf.summary.merge_all()
        # remove the old tensorboard logging directory from the previous run
        try:
            shutil.rmtree(self.tensorboard_log_dir)
        except FileNotFoundError:
            pass  # directory does not exist, this is OK
        except Exception as e:
            # general error when removing directory, warn user
            log.exception(
                f"Could not remove old tensorboard logging directory: "
                f"{self.tensorboard_log_dir}. Error: {e}")
        else:
            log.debug("Removing old tensorboard log directory.")
        tb_train_writer = tf.summary.FileWriter(
            self.tensorboard_log_dir + '/train', self.sess.graph)
        tb_valid_writer = tf.summary.FileWriter(self.tensorboard_log_dir + '/test')
    else:
        tb_train_writer = None
        tb_valid_writer = None

    train_time = 0
    while cur_batch < stop_batch:

        # first round validation:
        train_batch = train_data.get_batch()
        if self.display_in_training and is_first_step:
            if self.run_opt.is_chief:
                valid_batches = [
                    valid_data.get_batch() for ii in range(self.valid_numb_batch)
                ] if valid_data is not None else None
                self.valid_on_the_fly(fp, [train_batch], valid_batches, print_header=True)
            is_first_step = False

        if self.timing_in_training:
            tic = time.time()
        train_feed_dict = self.get_feed_dict(train_batch, is_training=True)
        # use tensorboard to visualize the training of deepmd-kit;
        # it will take some extra execution time to generate the tensorboard data
        if self.tensorboard and (cur_batch % self.tensorboard_freq == 0):
            summary, _ = run_sess(self.sess, [summary_merged_op, self.train_op],
                                  feed_dict=train_feed_dict,
                                  options=prf_options,
                                  run_metadata=prf_run_metadata)
            tb_train_writer.add_summary(summary, cur_batch)
        else:
            run_sess(self.sess, [self.train_op],
                     feed_dict=train_feed_dict,
                     options=prf_options,
                     run_metadata=prf_run_metadata)
        if self.timing_in_training:
            toc = time.time()
            train_time += toc - tic
        cur_batch = run_sess(self.sess, self.global_step)
        self.cur_batch = cur_batch

        # on-the-fly validation
        if self.display_in_training and (cur_batch % self.disp_freq == 0):
            if self.timing_in_training:
                tic = time.time()
            if self.run_opt.is_chief:
                valid_batches = [
                    valid_data.get_batch() for ii in range(self.valid_numb_batch)
                ] if valid_data is not None else None
                self.valid_on_the_fly(fp, [train_batch], valid_batches)
            if self.timing_in_training:
                toc = time.time()
                test_time = toc - tic
                log.info("batch %7d training time %.2f s, testing time %.2f s"
                         % (cur_batch, train_time, test_time))
                train_time = 0

        if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.saver is not None:
            try:
                ckpt_prefix = self.saver.save(
                    self.sess,
                    os.path.join(os.getcwd(), self.save_ckpt),
                    global_step=cur_batch)
            except google.protobuf.message.DecodeError as e:
                raise GraphTooLargeError(
                    "The graph size exceeds 2 GB, the hard limitation of protobuf."
                    " Then a DecodeError was raised by protobuf. You should "
                    "reduce the size of your model.") from e
            # make symlinks from the prefix with step to that without step, so nothing breaks
            # get all checkpoint files
            original_files = glob.glob(ckpt_prefix + ".*")
            for ori_ff in original_files:
                new_ff = self.save_ckpt + ori_ff[len(ckpt_prefix):]
                try:
                    # remove the old one
                    os.remove(new_ff)
                except OSError:
                    pass
                os.symlink(ori_ff, new_ff)
            log.info("saved checkpoint %s" % self.save_ckpt)

    if self.run_opt.is_chief:
        fp.close()
    if self.profiling and self.run_opt.is_chief:
        fetched_timeline = timeline.Timeline(prf_run_metadata.step_stats)
        chrome_trace = fetched_timeline.generate_chrome_trace_format()
        with open(self.profiling_file, 'w') as f:
            f.write(chrome_trace)
def get_global_step(self):
    return run_sess(self.sess, self.global_step)
def _run_default_sess(self):
    [self.ntypes, self.rcut, self.dfparam, self.daparam, self.tmap] = run_sess(
        self.sess,
        [self.t_ntypes, self.t_rcut, self.t_dfparam, self.t_daparam, self.t_tmap])
def _run_default_sess(self):
    [self.ntypes, self.rcut, self.tmap, self.tselt, self.output_dim] = run_sess(
        self.sess,
        [self.t_ntypes, self.t_rcut, self.t_tmap, self.t_sel_type, self.t_ouput_dim])
def __init__(
    self,
    model_file: "Path",
    load_prefix: str = "load",
    default_tf_graph: bool = False,
    auto_batch_size: Union[bool, int, AutoBatchSize] = True,
) -> None:
    # add these tensors on top of what is defined by the DeepTensor class;
    # use this in favor of dict update to move attributes from class to
    # instance namespace
    self.tensors = dict(
        {
            # descrpt attrs
            "t_ntypes": "descrpt_attr/ntypes:0",
            "t_rcut": "descrpt_attr/rcut:0",
            # fitting attrs
            "t_dfparam": "fitting_attr/dfparam:0",
            "t_daparam": "fitting_attr/daparam:0",
            # model attrs
            "t_tmap": "model_attr/tmap:0",
            # inputs
            "t_coord": "t_coord:0",
            "t_type": "t_type:0",
            "t_natoms": "t_natoms:0",
            "t_box": "t_box:0",
            "t_mesh": "t_mesh:0",
            # add output tensors
            "t_energy": "o_energy:0",
            "t_force": "o_force:0",
            "t_virial": "o_virial:0",
            "t_ae": "o_atom_energy:0",
            "t_av": "o_atom_virial:0",
        },
    )
    DeepEval.__init__(
        self,
        model_file,
        load_prefix=load_prefix,
        default_tf_graph=default_tf_graph,
        auto_batch_size=auto_batch_size,
    )

    # load optional tensors
    operations = [op.name for op in self.graph.get_operations()]
    # check if the graph has these operations:
    # if yes, add them
    if 't_efield' in operations:
        self._get_tensor("t_efield:0", "t_efield")
        self.has_efield = True
    else:
        log.debug("Could not get tensor 't_efield:0'")
        self.t_efield = None
        self.has_efield = False

    if 'load/t_fparam' in operations:
        self.tensors.update({"t_fparam": "t_fparam:0"})
        self.has_fparam = True
    else:
        log.debug("Could not get tensor 't_fparam:0'")
        self.t_fparam = None
        self.has_fparam = False

    if 'load/t_aparam' in operations:
        self.tensors.update({"t_aparam": "t_aparam:0"})
        self.has_aparam = True
    else:
        log.debug("Could not get tensor 't_aparam:0'")
        self.t_aparam = None
        self.has_aparam = False

    # now load tensors to object attributes
    for attr_name, tensor_name in self.tensors.items():
        self._get_tensor(tensor_name, attr_name)

    # start a tf session associated with the graph
    self.sess = tf.Session(graph=self.graph, config=default_tf_session_config)
    self._run_default_sess()
    self.tmap = self.tmap.decode('UTF-8').split()

    # setup modifier
    try:
        t_modifier_type = self._get_tensor("modifier_attr/type:0")
        self.modifier_type = run_sess(self.sess, t_modifier_type).decode("UTF-8")
    except (ValueError, KeyError):
        self.modifier_type = None

    if self.modifier_type == "dipole_charge":
        t_mdl_name = self._get_tensor("modifier_attr/mdl_name:0")
        t_mdl_charge_map = self._get_tensor("modifier_attr/mdl_charge_map:0")
        t_sys_charge_map = self._get_tensor("modifier_attr/sys_charge_map:0")
        t_ewald_h = self._get_tensor("modifier_attr/ewald_h:0")
        t_ewald_beta = self._get_tensor("modifier_attr/ewald_beta:0")
        [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = run_sess(
            self.sess,
            [t_mdl_name, t_mdl_charge_map, t_sys_charge_map, t_ewald_h, t_ewald_beta])
        mdl_name = mdl_name.decode("UTF-8")
        mdl_charge_map = [int(ii) for ii in mdl_charge_map.decode("UTF-8").split()]
        sys_charge_map = [int(ii) for ii in sys_charge_map.decode("UTF-8").split()]
        self.dm = DipoleChargeModifier(
            mdl_name,
            mdl_charge_map,
            sys_charge_map,
            ewald_h=ewald_h,
            ewald_beta=ewald_beta)
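# Usage sketch (hypothetical file name and geometry): load a frozen model and
# evaluate energy, force, and virial for one frame. "frozen_model.pb" is an
# assumed path; the flattened input shapes follow _eval_inner above.
dp = DeepPot("frozen_model.pb")
coords = np.array([[0., 0., 0., 0., 0., 1.5]])  # 1 frame, 2 atoms, (nframes, natoms * 3)
cells = 10. * np.eye(3).reshape(1, 9)           # 1 frame, box flattened to (nframes, 9)
atom_types = [0, 1]
e, f, v = dp.eval(coords, cells, atom_types)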