Example 1
    def _run_pytorch_to_caffe(self, model, name, output_dir, input_size,
                              debug):
        self.logger.info("-------- Run pytorch to caffe --------")
        inputs = Variable(torch.ones([1, 3, input_size, input_size]))

        if not debug:
            # silence the verbose output of `trans_net`
            backup_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
        pytorch_to_caffe.trans_net(model, inputs, name)
        if not debug:
            sys.stdout.close()  # close the null sink before restoring
            sys.stdout = backup_stdout

        utils.makedir(output_dir)
        out_proto = "{}/{}.prototxt".format(output_dir, name)
        out_caffemodel = "{}/{}.caffemodel".format(output_dir, name)
        out_torch_to_caffe = "{}/{}_torch2caffe.pkl".format(output_dir, name)
        pytorch_to_caffe.save_prototxt(out_proto)
        pytorch_to_caffe.save_caffemodel(out_caffemodel)
        # rewrite the deprecated `input:`/`input_dim:` header emitted by
        # pytorch_to_caffe into an explicit "Input" layer
        with open(out_proto, "r") as fr:
            proto_str = fr.read()
        proto_str = proto_str.replace(
            'input: "blob1"\ninput_dim: 1\ninput_dim: 3\ninput_dim: %d\ninput_dim: %d'
            % (input_size, input_size),
            'layer {\n\tname: "blob1"\n\ttype: "Input"\n\ttop: "blob1"\n\tinput_param {\n\t\tshape {\n\t\t\tdim: 1\n\t\t\tdim: 3\n\t\t\tdim: %d\n\t\t\tdim: %d\n\t\t}\n\t}\n}'
            % (input_size, input_size),
        )
        with open(out_proto, "w") as fw:
            fw.write(proto_str)
        with open(out_torch_to_caffe, "wb") as fw:
            pickle.dump(pytorch_to_caffe.torch_to_caffe_names, fw,
                        pickle.HIGHEST_PROTOCOL)
        self.logger.info(
            "Finished converting the PyTorch model to Caffe; check {}, {} and {}."
            .format(out_proto, out_caffemodel, out_torch_to_caffe))
        return out_proto, out_caffemodel, pytorch_to_caffe.torch_to_caffe_names
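
All of these examples rely on `utils.makedir` creating a directory and returning its path; Examples 13, 14, 19 and 20 additionally pass a `remove` flag to start from a clean directory. A minimal sketch inferred from these call sites (an assumption, not the actual aw_nas implementation):

# Hypothetical re-implementation of `utils.makedir`, inferred from the call
# sites in these examples; the real aw_nas helper may differ.
import os
import shutil

def makedir(path, remove=False):
    if remove and os.path.exists(path):
        shutil.rmtree(path)  # optionally clear a pre-existing directory
    if not os.path.exists(path):
        os.makedirs(path)
    return path  # callers rely on getting the path back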
Example 2
 def save(self, path, start_index=None):
     """
     Save this population to path.
     """
     path = utils.makedir(path)  # create dir if not exists
     backed_up = 0
     saved = 0
     start_save_index = self.start_save_index if start_index is None else start_index
     for ind, record in six.iteritems(self.model_records):
         if ind < start_save_index:
             continue
         # save this model record
         save_path = os.path.join(path, "{}.yaml".format(ind))
         if os.path.exists(save_path):
             backup_dir = utils.makedir(
                 os.path.join(path, "overwrite_backup"))
             backup_path = os.path.join(backup_dir, "{}.yaml".format(ind))
             self.logger.warning(
                 "%s already exists; overwriting and backing up to %s",
                 save_path, backup_path)
             shutil.copyfile(save_path, backup_path)
             backed_up += 1
         record.save(save_path)
         saved += 1
     self.logger.info(
         "Saving started from index %d. %d/%d records saved "
         "(%d records overwritten and backed up). By default, "
         "the next save will start from index %d.", start_save_index,
         saved, len(self.model_records), backed_up, self._next_index)
     self.start_save_index = self._next_index
     return saved
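
A hypothetical call, assuming a `population` built as in Example 5 below; `start_index=0` forces every record to be (re)saved:

n_saved = population.save("./population_dir", start_index=0)
# a later population.save("./population_dir") only writes records added
# since this call, because start_save_index was advanced to _next_index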
Example 3
    def final_save(self):
        """
        Pickle dump the controller/evaluator directly. Usually, evaluator use dataset,
        it will failed to be pickled if no handling is specified using `__getsate__`,
        in that case, will fallback to call `evaluator.save`.

        The dumped checkpoint can be loaded directly using `model = torch.load(checkpoint)`,
        without instantiate the correct class with correct configuration first.
        This checkpoint is convenient for test/usage.

        Visualization writer is not kept after save/load, so take care when these checkpoints
        are used in the middle of training process that has visualization writer. Better using
        the checkpoints dumped by `maybe_save` when finetuning.
        """
        if self.train_dir:
            # final saving
            dir_ = utils.makedir(os.path.join(self.train_dir, "final"))
            torch.save(self.controller, os.path.join(dir_, "controller.pt"))
            rank = os.environ.get("LOCAL_RANK")
            if rank is None or rank == '0':
                self.controller.save(os.path.join(dir_, "controller"))
            try:
                torch.save(self.evaluator, os.path.join(dir_, "evaluator.pt"))
                if rank is None or rank == '0':
                    self.evaluator.save(os.path.join(dir_, "evaluator"))
            except pickle.PicklingError as e:
                self.logger.warning(
                    "Final saving: torch.save(evaluator) failed; falling back to "
                    "`evaluator.save`: %s", e)
                self.evaluator.save(os.path.join(dir_, "evaluator.pt"))
            self.logger.info(
                "Final saving: dumped controller/evaluator to directory %s", dir_)
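
As the docstring notes, these dumps can be reloaded without instantiating the classes first. A sketch (the `final` subdirectory name comes from the code above; `train_dir` stands in for the actual training directory):

controller = torch.load(os.path.join(train_dir, "final", "controller.pt"),
                        map_location="cpu")
evaluator = torch.load(os.path.join(train_dir, "final", "evaluator.pt"),
                       map_location="cpu")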
Example 4
    def setup(self,
              load=None,
              save_every=None,
              train_dir=None,
              writer=None,
              load_components=None,
              interleave_report_every=None):
        # TODO: handle load components
        assert train_dir is not None, \
            ("Please provide a path via `--train-dir` to save all the checkpoints "
             "when using the async trainer")

        super(AsyncTrainer,
              self).setup(load, save_every, train_dir, writer, load_components,
                          interleave_report_every)
        self.train_dir = train_dir
        ckpt_dir = utils.makedir(os.path.join(train_dir, "checkpoints"))
        self.dispatcher.init(self.evaluator, ckpt_dir)
        self._register_signals()  # register signal handlers for clean up
        current_avail_parallelism = self.dispatcher.parallelism
        self.logger.info(
            "Arch rollout parallelism: %d; Current available dispatcher "
            "parallelism: %d.", self.parallelism, current_avail_parallelism)
        if self.parallelism > current_avail_parallelism:
            self.logger.warning(
                "Arch rollout parallelism (%d) is configured larger "
                "than the available dispatcher/evaluation parallelism (%d).",
                self.parallelism, current_avail_parallelism)

        self.save_every = save_every
Example 5
def init_population_dir(tmp_path, request):
    import torch
    from aw_nas.common import get_search_space
    from aw_nas import utils
    from aw_nas.main import _init_component

    cfg = getattr(request, "param", {})
    scfg = cfg.pop("search_space_cfg", {})
    search_space = get_search_space(cls="cnn", **scfg)
    path = utils.makedir(os.path.join(tmp_path, "init_population_dir"))
    ckpt_dir = utils.makedir(os.path.join(tmp_path, "init_ckpt_path"))

    # dump config template
    with open(os.path.join(path, "template.yaml"), "w") as wf:
        wf.write(sample_config)

    # generate mock records, ckpts
    num_records = cfg.get("num_records", 3)
    cfg_template = ConfigTemplate(yaml.safe_load(StringIO(sample_config)))
    model_records = collections.OrderedDict()
    for ind in range(num_records):
        rollout = search_space.random_sample()
        cfg = cfg_template.create_cfg(rollout.genotype)
        ckpt_path = os.path.join(ckpt_dir, str(ind))
        cnn_model = _init_component(cfg,
                                    "final_model",
                                    search_space=search_space,
                                    device=torch.device("cpu"))
        torch.save(cnn_model, ckpt_path)
        model_records[ind] = ModelRecord(rollout.genotype,
                                         cfg,
                                         search_space,
                                         checkpoint_path=ckpt_path,
                                         finished=True,
                                         confidence=1,
                                         perfs={
                                             "acc": np.random.rand(),
                                             "loss": np.random.uniform(0, 10)
                                         })
    # initialize population
    population = Population(search_space, model_records, cfg_template)
    # save population
    population.save(path, 0)

    # NOTE (a bit ugly): also return the search space so callers can reference it
    return (path, search_space)
Example 6
 def save(self, path):
     path = utils.makedir(path)
     torch.save(self.model, os.path.join(path, "model.pt"))
     torch.save({
         "epoch": self.epoch,
         "optimizer": self.optimizer.state_dict()
     }, os.path.join(path, "optimizer.pt"))
     if self.scheduler is not None:
         torch.save(self.scheduler.state_dict(), os.path.join(path, "scheduler.pt"))
     self.logger.info("Saved checkpoint to %s", path)
Example 7
    def save(self, path):
        # write new model records to disk
        _ = self.population.save(self.result_population_dir)

        # write the indexes of the model records in the current population to checkpoint
        path = utils.makedir(path)
        with open(os.path.join(path, "indexes.yaml"), "w") as w_f:
            yaml.safe_dump([int(ind) for ind in self.indexes], stream=w_f)

        # save mutation sampler state
        self.mutation_sampler.save(os.path.join(path, "mutation_sampler"))
Example 8
    def setup(self,
              load=None,
              save_every=None,
              save_controller_every=None,
              train_dir=None,
              writer=None,
              load_components=None,
              interleave_report_every=None):
        """
        Setup the scaffold: saving/loading/visualization settings.
        """
        if load is not None:
            all_components = ("controller", "evaluator", "trainer")
            load_components = all_components\
                              if load_components is None else load_components
            expect(
                set(load_components).issubset(all_components),
                "Invalid `load_components`")

            if "controller" in load_components:
                path = os.path.join(load, "controller")
                self.logger.info("Load controller from %s", path)
                try:
                    self.controller.load(path)
                except Exception as e:
                    self.logger.error("Controller not loaded: %s", e)
            if "evaluator" in load_components:
                path = os.path.join(load, "evaluator")
                self.logger.info("Load evaluator from %s", path)
                try:
                    self.evaluator.load(path)
                except Exception as e:
                    self.logger.error("Evaluator not loaded: %s", e)
            if "trainer" in load_components:
                path = os.path.join(load, "trainer")
                self.logger.info("Load trainer from %s", path)
                try:
                    self.load(path)
                except Exception as e:
                    self.logger.error("Trainer not loaded: %s", e)

        self.save_every = save_every
        self.save_controller_every = save_controller_every
        self.train_dir = utils.makedir(
            train_dir) if train_dir is not None else train_dir
        if writer is not None:
            self.setup_writer(writer.get_sub_writer("trainer"))
            self.controller.setup_writer(writer.get_sub_writer("controller"))
            self.evaluator.setup_writer(writer.get_sub_writer("evaluator"))
        self.interleave_report_every = interleave_report_every
        self.is_setup = True
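
A hypothetical invocation of this `setup`; all argument values below are illustrative placeholders:

trainer.setup(load="./results/ckpt-100",  # illustrative checkpoint directory
              save_every=20,
              save_controller_every=10,
              train_dir="./results",
              load_components=["controller", "evaluator"])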
Example 9
    def evaluate_rollouts(self,
                          rollouts,
                          is_training,
                          portion=None,
                          eval_batches=None,
                          return_candidate_net=False,
                          callback=None):
        for rollout in rollouts:
            cand_net = self.weights_manager.assemble_candidate(rollout)
            ckpt_path = rollout.model_record.checkpoint_path
            train_dir = utils.makedir(ckpt_path + "-train-dir")

            # dump candidate net checkpoint to "`train_dir`/init.pt"
            init_ckpt_fname = os.path.join(train_dir, "init.pt")
            torch.save(cand_net.state_dict(), init_ckpt_fname)
            save_every = rollout.model_record.config.get("save_every", 5)
            seed = rollout.model_record.config.get("seed", 123)

            # dump config to "`train_dir`/train.yaml"
            c_fname = os.path.join(train_dir, "train.yaml")
            rollout.model_record.save_config(c_fname)

            actual_train_dir = os.path.join(train_dir, "train")
            subprocess.check_call(
                ("awnas train {config} --save-every {save_every} --seed {seed} "
                 "--gpus {gpus} --load-state-dict {load} "
                 "--train-dir {train_dir} >/dev/null 2>&1").format(
                     config=c_fname,
                     save_every=save_every,
                     seed=seed,
                     gpus=self.device,
                     load=init_ckpt_fname,
                     train_dir=actual_train_dir),
                shell=True)
            # parse log to get final performance
            perfs = self._parse_log(os.path.join(actual_train_dir,
                                                 "train.log"))
            rollout.set_perfs(perfs)

            # copy final model to `ckpt_path`
            final_ckpt_fname = os.path.join(actual_train_dir, "final",
                                            "model.pt")
            if not os.path.exists(final_ckpt_fname):
                final_ckpt_fname = os.path.join(actual_train_dir, "final",
                                                "model_state.pt")
            shutil.copy(final_ckpt_fname, ckpt_path)

            # TODO: access model record through API is better
            rollout.model_record.finished = True
            rollout.model_record.confidence = 1.
        return rollouts
Example 10
 def save(self, path):
     rank = os.environ.get("LOCAL_RANK")
     if rank is not None and rank != '0':
         return
     path = utils.makedir(path)
     if self.save_as_state_dict:
         torch.save(self.model.state_dict(), os.path.join(path, "model_state.pt"))
     else:
         # save the model object directly instead of the state_dict,
         # so that it can be loaded and run directly without specifying the configuration first
         torch.save(self.model, os.path.join(path, "model.pt"))
     torch.save({
         "epoch": self.epoch,
         "optimizer": self.optimizer.state_dict()
     }, os.path.join(path, "optimizer.pt"))
     if self.scheduler is not None:
         torch.save(self.scheduler.state_dict(), os.path.join(path, "scheduler.pt"))
     self.logger.info("Saved checkpoint to %s", path)
Example 11
def derive(cfg_file, load, out_file, n, save_plot, test, steps, gpu, seed,
           dump_mode, runtime_save):
    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-derive config: {}; load: {}; cwd: {}"\
                              .format(cfg_file, load, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    search_space, controller = _init_components_from_cfg(cfg,
                                                         device,
                                                         controller_only=True)

    # create the directory for saving plots
    if save_plot is not None:
        save_plot = utils.makedir(save_plot)

    if not test:
        controller_path = os.path.join(load, "controller")
        controller.load(controller_path)
        rollouts = controller.sample(n)
        with open(out_file, "w") as of:
            for i, r in enumerate(rollouts):
                if save_plot is not None:
                    r.plot_arch(filename=os.path.join(save_plot, str(i)),
                                label="Derive {}".format(i))
                of.write("# ---- Arch {} ----\n".format(i))
                _dump(r, dump_mode, of)
                of.write("\n")
    else:
        trainer = _init_components_from_cfg(cfg, device)[-1]

        LOGGER.info("Loading from disk...")
        trainer.setup(load=load)
        LOGGER.info("Deriving and testing...")
        if runtime_save:
            rollouts = trainer.derive(n, steps, out_file=out_file)
        else:
            rollouts = trainer.derive(n, steps)
        accs = [r.get_perf() for r in rollouts]
        idxes = np.argsort(accs)[::-1]
        with open(out_file, "w") as of:
            for i, idx in enumerate(idxes):
                rollout = rollouts[idx]
                if save_plot is not None:
                    rollout.plot_arch(filename=os.path.join(save_plot, str(i)),
                                      label="Derive {}; Reward {:.3f}".format(
                                          i, rollout.get_perf()))
                of.write("# ---- Arch {} (Reward {}) ----\n".format(
                    i, rollout.get_perf()))
                _dump(rollout, dump_mode, of)
                of.write("\n")
Example 12
    def compile(self, compile_name, net_cfg, result_dir):
        # construct aw_nas final model

        if pytorch_to_caffe is None:
            self.logger.warning(
                "The submodule pytorch_to_caffe does not exist.")
            return

        search_space = _init_component(net_cfg, "search_space")
        assert isinstance(search_space, GeneralSearchSpace)
        model = _init_component(
            net_cfg,
            "final_model",
            search_space=search_space,
            device="cuda:{}".format(self.gpu),
        )

        rollout = search_space.rollout_from_genotype(
            net_cfg["final_model_cfg"]["genotypes"])

        # pytorch to caffe
        input_size = self.input_size
        ptc_out_dir = utils.makedir(
            os.path.join(result_dir, "pytorch_to_caffe"))
        proto, caffemodel, torch_to_caffe = self._run_pytorch_to_caffe(
            model,
            compile_name,
            ptc_out_dir,
            input_size=input_size,
            debug=self._debug_output,
        )

        # map prims to torch layer names, then combine with the torch-to-caffe layer-name mapping
        prims = rollout.genotype_list()
        prims_to_torch_layers = {}
        for idx, prim in enumerate(prims):
            torch_layer_names = list(model.layer_idx_to_named_modules(idx))
            prims_to_torch_layers[Prim(**prim)] = torch_layer_names

        prims_to_caffe_name = {}
        for prim, torch_layers in prims_to_torch_layers.items():
            prims_to_caffe_name[prim] = [
                torch_to_caffe[t] for t in torch_layers if t in torch_to_caffe
            ]
        with open("{}/{}_prim2names.pkl".format(ptc_out_dir, compile_name),
                  "wb") as fw:
            pickle.dump(prims_to_caffe_name, fw, pickle.HIGHEST_PROTOCOL)

        try:
            # caffe fix
            fix_out_dir = os.path.join(result_dir, "fix")
            proto, caffemodel = self._caffe_fix(
                proto,
                caffemodel,
                fix_out_dir,
                self.gpu,
                self.calib_iter,
                input_size,
                debug=self._debug_output,
            )

            # dnnc
            dnnc_out_dir = os.path.join(result_dir,
                                        "dnnc_{}".format(self.mode))
            self._run_dnnc(
                compile_name,
                proto,
                caffemodel,
                dnnc_out_dir,
                self.dcf,
                self.mode,
                debug=self._debug_output,
            )
        except Exception as e:
            self.logger.error(str(e))

        return proto, caffemodel, prims_to_caffe_name
Example 13
def train(gpus, seed, cfg_file, load, load_state_dict, save_every, train_dir):
    if train_dir:
        # backup the config file
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "train_config.yaml"))

        # add log file handler
        log_file = os.path.join(train_dir, "train.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    # set gpu
    gpu_list = [int(g) for g in gpus.split(",") if g]  # tolerate an empty --gpus string
    if not gpu_list:
        _set_gpu(None)
        device = "cpu"
    else:
        _set_gpu(gpu_list[0])
        device = torch.device("cuda:{}".format(gpu_list[0])
                              if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device,
                                num_tokens=num_tokens)
    else:
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device)
    # check model support for data type
    expect(_data_type in model.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg,
                              "final_trainer",
                              dataset=whole_dataset,
                              model=model,
                              device=device,
                              gpus=gpu_list,
                              objective=objective)
    # check trainer support for data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()
Example 14
def search(cfg_file, gpu, seed, load, save_every, interleave_report_every,
           train_dir, vis_dir, develop):
    # check dependency and initialize visualization writer
    if vis_dir:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the `--vis-dir` option! "
                "Try installing the dependency manually, or `pip install aw_nas[vis]`"
            )
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        # backup config file, and if in `develop` mode, also backup the aw_nas source code
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))

        if develop:
            import pkg_resources
            src_path = pkg_resources.resource_filename("aw_nas", "")
            backup_code_path = os.path.join(train_dir, "aw_nas")
            if os.path.exists(backup_code_path):
                shutil.rmtree(backup_code_path)
            LOGGER.info("Copy `aw_nas` source code to %s", backup_code_path)
            shutil.copytree(src_path, backup_code_path, ignore=_onlycopy_py)

        # add log file handler
        log_file = os.path.join(train_dir, "search.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    trainer = _init_components_from_cfg(cfg, device)[-1]

    # setup trainer and train
    trainer.setup(load,
                  save_every,
                  train_dir,
                  writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
Example 15
 def _save_path(self, name=""):
     if self.train_dir is None:
         return None
     dir_ = utils.makedir(os.path.join(self.train_dir, str(self.epoch)))
     return os.path.join(dir_, name)
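
For instance (illustrative values): with `self.train_dir == "results"` and `self.epoch == 3`, `self._save_path("controller")` creates `results/3/` if needed and returns `"results/3/controller"`; with `train_dir` unset it returns `None`, presumably so callers skip saving.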
Example 16
def eval_arch(cfg_file, arch_file, load, gpu, seed, save_plot, save_state_dict,
              steps):
    setproctitle.setproctitle("awnas-eval-arch config: {}; arch_file: {}; load: {}; cwd: {}"\
                              .format(cfg_file, arch_file, load, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # load genotypes
    LOGGER.info("Loading archs from file: %s", arch_file)
    with open(arch_file, "r") as f:
        genotypes = yaml.safe_load(f)
    assert isinstance(genotypes, (list, tuple))

    # initialize and load evaluator
    res = _init_components_from_cfg(cfg, device, evaluator_only=True)
    search_space = res[0]  #pylint: disable=unused-variable
    evaluator = res[-1]
    path = os.path.join(load, "evaluator")
    LOGGER.info("Loading evalutor from %s", path)
    evaluator.load(path)

    # create the directory for saving plots
    if save_plot is not None:
        save_plot = utils.makedir(save_plot)

    # evaluate these rollouts using evaluator
    LOGGER.info("Eval...")
    rollouts = [
        rollout_from_genotype_str(geno, search_space) for geno in genotypes
    ]
    num_r = len(rollouts)

    for i, r in enumerate(rollouts):
        evaluator.evaluate_rollouts([r],
                                    is_training=False,
                                    eval_batches=steps,
                                    return_candidate_net=save_state_dict)[0]
        if save_state_dict is not None:
            # save state dict of the candidate network (active members only)
            # corresponding to each rollout to `save_state_dict` path
            torch.save(r.candidate_net.state_dict(),
                       os.path.join(save_state_dict, str(i)))
        if save_plot is not None:
            r.plot_arch(filename=os.path.join(save_plot, str(i)),
                        label="Derive {}; Reward {:.3f}".format(
                            i, r.get_perf(name="reward")))
        print("Finish test {}/{}\r".format(i + 1, num_r), end="")
    for i, r in enumerate(rollouts):
        LOGGER.info(
            "Arch %3d: %s", i,
            "; ".join(["{}: {:.3f}".format(n, v) for n, v in r.perf.items()]))
Example 17
def sample(load, out_file, n, save_plot, gpu, seed, dump_mode, prob_thresh,
           unique):
    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-sample load: {}; cwd: {}".format(
        load, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # create the directory for saving plots
    if save_plot is not None:
        save_plot = utils.makedir(save_plot)

    controller_path = os.path.join(load)
    # load the model on cpu
    controller = torch.load(controller_path, map_location=torch.device("cpu"))
    # then set the device
    controller.set_device(device)

    if prob_thresh or unique:
        sampled = 0
        ignored = 0
        rollouts = []
        genotypes = []
        while sampled < n:
            rollout_cands = controller.sample(n - sampled)
            for r in rollout_cands:
                assert "log_probs" in r.info
                log_prob = np.array([
                    utils.get_numpy(cg_lp) for cg_lp in r.info["log_probs"]
                ]).sum()
                # `prob_thresh` may be unset when only `unique` is requested
                if prob_thresh and np.exp(log_prob) < prob_thresh:
                    ignored += 1
                    LOGGER.info("(ignored %d) Ignore arch prob %.3e (< %.3e)",
                                ignored, np.exp(log_prob), prob_thresh)
                elif unique and r.genotype in genotypes:
                    ignored += 1
                    LOGGER.info("(ignored %d) Ignore duplicated arch", ignored)
                else:
                    sampled += 1
                    LOGGER.info("(chosen %d) Choose arch prob %.3e",
                                sampled, np.exp(log_prob))
                    rollouts.append(r)
                    genotypes.append(r.genotype)
    else:
        rollouts = controller.sample(n)

    with open(out_file, "w") as of:
        for i, r in enumerate(rollouts):
            if save_plot is not None:
                r.plot_arch(filename=os.path.join(save_plot, str(i)),
                            label="Derive {}".format(i))
            if "log_probs" in r.info:
                log_prob = np.array([
                    utils.get_numpy(cg_lp) for cg_lp in r.info["log_probs"]
                ]).sum()
                of.write("# ---- Arch {} log_prob: {:.3f} prob: {:.3e} ----\n".
                         format(i, log_prob, np.exp(log_prob)))
            else:
                of.write("# ---- Arch {} ----\n".format(i))
            _dump(r, dump_mode, of)
            of.write("\n")
Example 18
def mpsearch(cfg_file, seed, load, save_every, interleave_report_every,
             train_dir, vis_dir, develop):
    # check dependency and initialize visualization writer
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl",
                                         rank=int(os.environ["RANK"]),
                                         world_size=int(
                                             os.environ["WORLD_SIZE"]))

    if vis_dir and local_rank == 0:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the `--vis-dir` option! "
                "Try installing the dependency manually, or `pip install aw_nas[vis]`"
            )
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        if local_rank == 0:
            # backup config file, and if in `develop` mode, also backup the aw_nas source code
            train_dir = utils.makedir(train_dir, remove=True)
            shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))

            if develop:
                import pkg_resources
                src_path = pkg_resources.resource_filename("aw_nas", "")
                backup_code_path = os.path.join(train_dir, "aw_nas")
                if os.path.exists(backup_code_path):
                    shutil.rmtree(backup_code_path)
                LOGGER.info("Copy `aw_nas` source code to %s",
                            backup_code_path)
                shutil.copytree(src_path,
                                backup_code_path,
                                ignore=_onlycopy_py)

    torch.distributed.barrier()

    if train_dir:
        # add log file handler
        log_file = os.path.join(
            train_dir,
            "search{}.log".format("" if local_rank == 0 else "_{}".format(local_rank)))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    LOGGER.info(
        ("Start distributed parallel searching: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(int(os.environ["WORLD_SIZE"]),
                                                 os.environ["MASTER_ADDR"],
                                                 os.environ["MASTER_PORT"],
                                                 os.environ["RANK"],
                                                 local_rank, os.getpid()))

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    cfg["weights_manager_cfg"]["multiprocess"] = True
    cfg["evaluator_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    whole_dataset = _init_component(cfg, "dataset")
    rollout_type = cfg["rollout_type"]

    search_space = _init_component(cfg, "search_space")
    controller = _init_component(cfg,
                                 "controller",
                                 search_space=search_space,
                                 device=device,
                                 rollout_type=rollout_type)

    _data_type = whole_dataset.data_type()

    if _data_type == "sequence":
        # get the num_tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        weights_manager = _init_component(cfg,
                                          "weights_manager",
                                          search_space=search_space,
                                          device=device,
                                          gpus=[device],
                                          rollout_type=rollout_type,
                                          num_tokens=num_tokens)
    else:
        weights_manager = _init_component(cfg,
                                          "weights_manager",
                                          search_space=search_space,
                                          device=device,
                                          gpus=[device],
                                          rollout_type=rollout_type)
    # check model support for data type
    expect(_data_type in weights_manager.supported_data_types())

    objective = _init_component(cfg, "objective", search_space=search_space)
    # evaluator
    evaluator = _init_component(cfg,
                                "evaluator",
                                dataset=whole_dataset,
                                weights_manager=weights_manager,
                                objective=objective,
                                rollout_type=rollout_type)
    expect(_data_type in evaluator.supported_data_types())

    trainer = _init_component(cfg,
                              "trainer",
                              evaluator=evaluator,
                              controller=controller,
                              rollout_type=rollout_type)

    # setup trainer and train
    if local_rank != 0:
        save_every = None
    trainer.setup(load,
                  save_every,
                  train_dir,
                  writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
Example 19
def genprof(cfg_file, hwobj_cfg_file, result_dir, compile_hardware,
            num_sample):
    with open(cfg_file, "r") as ss_cfg_f:
        ss_cfg = yaml.load(ss_cfg_f)
    with open(hwobj_cfg_file, "r") as hw_cfg_f:
        hw_cfg = yaml.load(hw_cfg_f)

    ss = get_search_space(hw_cfg["mixin_search_space_type"],
                          **ss_cfg["search_space_cfg"],
                          **hw_cfg["mixin_search_space_cfg"])
    expect(isinstance(ss, MixinProfilingSearchSpace),
           "search space must be a subclass of MixinProfilingSearchSpace")

    result_dir = utils.makedir(result_dir)
    # copy cfg files
    shutil.copyfile(cfg_file, os.path.join(result_dir, "config.yaml"))
    shutil.copyfile(hwobj_cfg_file,
                    os.path.join(result_dir, "hwobj_config.yaml"))

    # generate profiling primitive list
    assert "prof_prims_cfg" in hw_cfg, \
        "key prof_prims_cfg must be specified in the hardware configuration file."
    hw_obj_cfg = hw_cfg["prof_prims_cfg"]
    prof_prims = list(ss.generate_profiling_primitives(**hw_obj_cfg))
    prof_prim_fname = os.path.join(result_dir, "prof_prims.yaml")
    with open(prof_prim_fname, "w") as prof_prim_f:
        yaml.dump(prof_prims, prof_prim_f)
    LOGGER.info("Save the list of profiling primitives to %s", prof_prim_fname)

    if num_sample:
        prof_net_cfgs = sample_networks(
            ss,
            base_cfg_template=hw_cfg["profiling_net_cfg"]
            ["base_cfg_template"],
            num_sample=num_sample,
            **hw_obj_cfg)
    else:
        # assemble profiling nets
        # the primitives can actually be mapped to layers in model during the assembling process
        prof_net_cfgs = assemble_profiling_nets(prof_prims,
                                                **hw_cfg["profiling_net_cfg"])
    prof_net_cfgs = list(prof_net_cfgs)
    prof_net_dir = utils.makedir(os.path.join(result_dir, "prof_nets"),
                                 remove=True)
    prof_fnames = []
    for i_net, prof_net_cfg in enumerate(prof_net_cfgs):
        prof_fname = os.path.join(prof_net_dir, "{}.yaml".format(i_net))
        prof_fnames.append(prof_fname)
        with open(prof_fname, "w") as prof_net_f:
            yaml.dump(prof_net_cfg, prof_net_f)
    LOGGER.info("Save the profiling net configs to directory %s", prof_net_dir)

    # optional (hardware specific): call hardware-specific compiling process
    hw_cfgs = hw_cfg.get("hardware_compilers", [])
    if compile_hardware:
        hw_cfgs.extend([{
            "hardware_compiler_type": hw_name,
            "hardware_compiler_cfg": {}
        } for hw_name in compile_hardware])
    if hw_cfgs:
        hw_compile_dir = utils.makedir(os.path.join(result_dir, "hardwares"),
                                       remove=True)
        LOGGER.info("Call hardware compilers: total %d", len(hw_cfgs))
        for i_hw, hw_cfg in enumerate(hw_cfgs):
            hw_name = hw_cfg["hardware_compiler_type"]
            hw_kwargs = hw_cfg.get("hardware_compiler_cfg", {})
            hw_compiler = BaseHardwareCompiler.get_class_(hw_name)(**hw_kwargs)
            LOGGER.info("{}: Constructed hardware compiler {}{}".format(
                i_hw, hw_name, ":{}".format(hw_kwargs) if hw_kwargs else ""))
            hw_res_dir = utils.makedir(
                os.path.join(hw_compile_dir, "{}-{}".format(i_hw, hw_name)))
            for i_net, prof_cfg in enumerate(prof_net_cfgs):
                res_dir = utils.makedir(os.path.join(hw_res_dir, str(i_net)))
                hw_compiler.compile("{}-{}-{}".format(i_hw, hw_name, i_net),
                                    prof_cfg, res_dir)
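
The keys read from `hwobj_cfg_file` above imply a configuration shaped roughly as follows; only the key names come from the code, the values are illustrative placeholders:

# hwobj_config.yaml (illustrative sketch)
mixin_search_space_type: cnn        # passed to get_search_space
mixin_search_space_cfg: {}
prof_prims_cfg: {}                  # forwarded to generate_profiling_primitives
profiling_net_cfg:
  base_cfg_template: template.yaml  # only used when sampling networks
hardware_compilers: []              # optional list of compiler specs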
Example 20
def mptrain(seed, cfg_file, load, load_state_dict, save_every, train_dir):
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl")

    if train_dir:
        # backup the config file (only on the rank-0 process)
        if local_rank == 0:
            train_dir = utils.makedir(train_dir, remove=False)
            shutil.copyfile(cfg_file,
                            os.path.join(train_dir, "train_config.yaml"))

    torch.distributed.barrier()

    if train_dir:
        # add log file handler
        log_file = os.path.join(
            train_dir,
            "train{}.log".format("" if local_rank == 0 else "_{}".format(local_rank)))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    LOGGER.info(
        ("Start distributed parallel training: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(int(os.environ["WORLD_SIZE"]),
                                                 os.environ["MASTER_ADDR"],
                                                 os.environ["MASTER_PORT"],
                                                 os.environ["RANK"],
                                                 local_rank, os.getpid()))

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    cfg["final_trainer_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device,
                                num_tokens=num_tokens)
    else:
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device)
    # check model support for data type
    expect(_data_type in model.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg,
                              "final_trainer",
                              dataset=whole_dataset,
                              model=model,
                              device=device,
                              gpus=[device],
                              objective=objective)
    # check trainer support for data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    if local_rank != 0:
        save_every = None
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()