def run_remote_worker(worker_id, worker_path, id):
    """Run worker on remote machine.

    Restores the pickled worker config, exports the recorded environment,
    registers the zeus backend, launches the training subprocesses for the
    detected device, then cleans up every spawned process tree.

    :param worker_id: id of this worker; also used to name its log file
    :param worker_path: working directory holding the worker's config/logs
    :param id: step/job id passed through to the device launchers
        (NOTE(review): shadows the `id` builtin, but renaming would break
        callers that pass it by keyword — kept as-is)
    :return: 0 on success
    """
    from zeus.common.utils import init_log
    init_log(level="info",
             log_file=".temp_{}.log".format(worker_id),
             log_path=worker_path)
    config = _load_config(worker_id, worker_path, id)
    # Recreate the launcher's runtime environment before changing directory.
    os.environ["LD_LIBRARY_PATH"] = config["env"]["LD_LIBRARY_PATH"]
    os.environ["PWD"] = config["env"]["PWD"]
    os.chdir(os.environ["PWD"])
    zeus.register_zeus(os.environ['BACKEND_TYPE'].lower())
    # Fix: sub_pid_list was previously unbound when the device was neither
    # GPU nor NPU, raising NameError at the cleanup loop below.
    sub_pid_list = []
    if zeus.is_gpu_device():
        sub_pid_list = call_in_gpu(config, id, worker_id, worker_path)
    elif zeus.is_npu_device():
        # NPU launch additionally needs the Ascend toolchain paths.
        os.environ["PYTHONPATH"] = config["env"]["PYTHONPATH"]
        os.environ["PATH"] = config["env"]["PATH"]
        os.environ["ASCEND_OPP_PATH"] = config["env"]["ASCEND_OPP_PATH"]
        sub_pid_list = call_in_npu(config, id, worker_id, worker_path)
    logging.info("DistributedWorker finished!")
    # Kill each subprocess tree so no orphaned children survive the worker.
    for sub_pid in sub_pid_list:
        kill_proc_tree(pid=sub_pid)
    logging.info("DistributedWorker subprocess cleaned!")
    return 0
def load_config(config_file):
    """Load a pickled worker configuration and apply it process-wide.

    Re-exports the recorded environment variables, registers the zeus
    backend, and rehydrates every serialized config section.

    :param config_file: path to the pickled config produced by the launcher
    """
    import os
    import pickle

    # NOTE(review): pickle.load on an attacker-controlled file executes
    # arbitrary code — config_file must come from a trusted launcher.
    with open(config_file, 'rb') as fp:
        cfg = pickle.load(fp)

    # Re-export the recorded environment, skipping empty values.
    for name, val in cfg["env"].items():
        if val:
            os.environ[name] = val

    # The backend must be registered before the config classes are imported.
    from zeus import register_zeus
    register_zeus(os.environ['BACKEND_TYPE'].lower())

    from zeus.common.class_factory import ClassFactory
    from zeus.common.general import General
    from zeus.datasets.conf.dataset import DatasetConfig
    from zeus.networks.model_config import ModelConfig
    from zeus.trainer.conf import TrainerConfig
    from zeus.evaluator.conf import EvaluatorConfig

    ClassFactory.__registry__ = cfg["class_factory"]
    # Rehydrate each config section from its serialized dict, in the same
    # order as the original implementation.
    sections = ((General, "general"),
                (DatasetConfig, "dataset"),
                (ModelConfig, "model"),
                (TrainerConfig, "trainer"),
                (EvaluatorConfig, "evaluator"))
    for conf_cls, key in sections:
        conf_cls.from_dict(cfg[key])
def set_backend(backend='pytorch', device_category='GPU'):
    """Set backend.

    Configures the process-wide backend/device environment variables exactly
    once, registers the zeus backend, and backs up the serializable configs.

    :param backend: backend type, one of 'pytorch', 'tensorflow', 'mindspore'
    :type backend: str
    :param device_category: device category ('GPU' or 'NPU'); overrides the
        category auto-detected from the environment when not None
    :type device_category: str
    :raises ValueError: if backend is not a supported value
    """
    # Already configured for this process — keep the first setting.
    if "BACKEND_TYPE" in os.environ:
        return
    # Mirror the underscore variant to the dashed key checked below.
    if 'NPU_VISIBLE_DEVICES' in os.environ:
        os.environ['NPU-VISIBLE-DEVICES'] = os.environ['NPU_VISIBLE_DEVICES']
    # CUDA visible
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        os.environ['DEVICE_CATEGORY'] = 'GPU'
    elif 'NPU-VISIBLE-DEVICES' in os.environ:
        os.environ['DEVICE_CATEGORY'] = 'NPU'
        # NOTE(review): assumes RANK_TABLE_FILE/RANK_SIZE are always set in
        # NPU environments; raises KeyError otherwise — confirm with callers.
        os.environ['ORIGIN_RANK_TABLE_FILE'] = os.environ['RANK_TABLE_FILE']
        os.environ['ORIGIN_RANK_SIZE'] = os.environ['RANK_SIZE']
    # device — explicit argument wins over the auto-detected category.
    if device_category is not None:
        os.environ['DEVICE_CATEGORY'] = device_category
    # backend
    if backend == 'pytorch':
        os.environ['BACKEND_TYPE'] = 'PYTORCH'
    elif backend == 'tensorflow':
        os.environ['BACKEND_TYPE'] = 'TENSORFLOW'
        import warnings
        warnings.filterwarnings("ignore", category=FutureWarning)
    elif backend == 'mindspore':
        os.environ['BACKEND_TYPE'] = 'MINDSPORE'
    else:
        # Fix: narrowed from bare Exception; ValueError is a subclass, so
        # existing `except Exception` callers still catch it.
        raise ValueError('backend must be pytorch, tensorflow or mindspore')
    set_data_format()
    register_zeus(backend)
    # vega — imported for their registration side effects.
    import vega.core.search_algs.ps_differential
    import vega.algorithms
    from zeus.common.config_serializable import backup_configs
    backup_configs()
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. from xt.model.model_zeus import XTModelZeus from zeus.common.util.common import import_config from zeus.common.util.register import Registers from zeus import set_backend, register_zeus from zeus.trainer.trainer_api import Trainer from zeus.common.class_factory import ClassFactory, ClassType from zeus.trainer.modules.conf.loss import LossConfig from zeus.trainer.modules.conf.optim import OptimConfig set_backend(backend='tensorflow', device_category='GPU') register_zeus('tensorflow') @Registers.model class DqnZeus(XTModelZeus): """Docstring for DqnMlp.""" def __init__(self, model_info): model_config = model_info.get('model_config', None) import_config(globals(), model_config) self.state_dim = model_info['state_dim'] self.action_dim = model_info['action_dim'] super().__init__(model_info) def create_model(self, model_info): """Create Deep-Q network."""