Example 1
    def __init__(self, args):
        self.args = args

        ######### Initialize the multiagent team ########
        if self.args.ps == 'full' or self.args.ps == 'trunk':
            self.agents = [
                Agent(self.args, 0)
            ]  # single agent (id 0): parameters shared across the team ('full' or 'trunk' PS)
        elif self.args.ps == 'none':
            self.agents = [
                Agent(self.args, id)
                for id in range(self.args.config.num_agents)
            ]  # neural network for each agent
        else:
            sys.exit('Incorrect PS choice')
        self.test_agent = TestAgent(self.args, 991)

        ###### Buffer and Model Bucket as references to the corresponding agent's attributes ####
        if args.ps == "trunk":
            self.buffer_bucket = [
                buffer.tuples for buffer in self.agents[0].buffer
            ]
        else:
            self.buffer_bucket = [ag.buffer.tuples for ag in self.agents]

        # Specifying 3 different networks for evo, PG and test rollouts
        self.popn_bucket = [ag.popn for ag in self.agents]
        self.rollout_bucket = [ag.rollout_actor for ag in self.agents]
        self.test_bucket = self.test_agent.rollout_actor

        ######### EVOLUTIONARY WORKERS ############
        if self.args.popn_size > 0:
            self.evo_task_pipes = [
                Pipe() for _ in range(args.popn_size * args.num_evals)
            ]  # evals for computing the fitness
            self.evo_result_pipes = [
                Pipe() for _ in range(args.popn_size * args.num_evals)
            ]
            self.evo_workers = [
                Process(target=rollout_worker,
                        args=(self.args, i, 'evo', self.evo_task_pipes[i][1],
                              self.evo_result_pipes[i][0], self.buffer_bucket,
                              self.popn_bucket, True, RANDOM_BASELINE))
                for i in range(args.popn_size * args.num_evals)
            ]  # one rollout worker per (population member, evaluation); popn_bucket holds the evolutionary policies
            for worker in self.evo_workers:
                worker.start()

        ######### POLICY GRADIENT WORKERS ############
        if self.args.rollout_size > 0:
            self.pg_task_pipes = Pipe()
            self.pg_result_pipes = Pipe()
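            # Unlike the evolutionary workers (one pipe pair per rollout), all PG traffic shares a single task/result pipe pair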
            self.pg_workers = [
                Process(target=rollout_worker,
                        args=(self.args, 0, 'pg', self.pg_task_pipes[1],
                              self.pg_result_pipes[0], self.buffer_bucket,
                              self.rollout_bucket, self.args.rollout_size > 0,
                              RANDOM_BASELINE))
            ]  # rollout_bucket holds the policy-gradient networks
            for worker in self.pg_workers:
                worker.start()

        ######### TEST WORKERS ############
        self.test_task_pipes = Pipe()
        self.test_result_pipes = Pipe()
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(self.args, 0, 'test', self.test_task_pipes[1],
                          self.test_result_pipes[0], None, self.test_bucket,
                          False, RANDOM_BASELINE))
        ]  # test_bucket holds the network used for test rollouts
        for worker in self.test_workers:
            worker.start()

        #### STATS AND TRACKING WHICH ROLLOUT IS DONE ######
        self.best_score = -999
        self.total_frames = 0
        self.gen_frames = 0
        self.test_trace = []
Example 2
    def meta_fit(self, meta_dataset_generator):

        with tf.device('/cpu:0'):
            LOGGER.debug('My PID: %s' % os.getpid())

            self.timer.begin('main training')
            mp.set_start_method('spawn', force=True)
            
            self.timer.begin('build data pipeline')

            # these reservoirs are used to send data to the sub-processes
            train_data_process_reservoir = [queue.Queue(self.train_cache_size) for i in range(len(self.devices))]
            valid_data_process_reservoir = [queue.Queue(self.valid_cache_size) for i in range(len(self.devices))]
            
            meta_valid_reservoir = [queue.Queue(self.eval_tasks) for i in range(self.total_exp)]

            # these reservoirs only store the extracted data
            train_data_extract_reservoir = [queue.Queue(self.train_cache_size) for i in range(len(self.devices))]
            valid_data_extract_reservoir = [queue.Queue(self.valid_cache_size) for i in range(len(self.devices))]

            if self.fix_valid:
                valid_data_cache = [[] for _ in range(len(self.devices))]
                valid_data_pointer = [0 for _ in range(len(self.devices))]
            
            train_recv, valid_recv = [], []
            train_send, valid_send = [], []
            for i in range(len(self.devices)):
                recv, send = Pipe(True)
                # activate the first handshake
                recv.send(True)
                train_recv.append(recv)
                train_send.append(send)
                recv, send = Pipe(True)
                # activate the first handshake
                recv.send(True)
                valid_recv.append(recv)
                valid_send.append(send)

            def apply_device_to_hp(hp, device):
                hp['device'] = 'cuda:{}'.format(device)
                return hp
            
            self.timer.end('build data pipeline')

            self.timer.begin('build main proc pipeline')
            clsnum = get_base_class_number(meta_dataset_generator)
            LOGGER.info('base class number detected', clsnum)
            procs = [mp.Process(
                target=run_exp,
                args=(
                    self.modules[i].MyMetaLearner,
                    apply_device_to_hp(self.hp[i], dev),
                    train_recv[i], valid_recv[i],
                    clsnum, 
                    self.modules[i].process_data if self.process_protocol != 'process-in-main' else None
                )
            ) for i, dev in enumerate(self.devices)]

            for p in procs: p.daemon = True; p.start()

            self.timer.end('build main proc pipeline')
            LOGGER.info('build data', self.timer.query_time_by_name('build data pipeline'), 'build proc', self.timer.query_time_by_name('build main proc pipeline'))
            label_meta_valid = []

            data_generation = True

            self.timer.begin('prepare dataset')
            meta_train_dataset = meta_dataset_generator.meta_train_pipeline.batch(1)
            meta_train_generator = iter(meta_train_dataset)
            meta_valid_dataset = meta_dataset_generator.meta_valid_pipeline.batch(1)
            meta_valid_generator = iter(meta_valid_dataset)
            self.timer.end('prepare dataset')
            LOGGER.info('prepare dataset', self.timer.query_time_by_name('prepare dataset'))

            global valid_ens_data_load_number
            valid_ens_data_load_number = 0

            def train_pipe_fill():
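                # Producer thread: pull meta-train tasks and fan them out to every device's extract reservoir (non-blocking; a sample is dropped when a reservoir is full)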
                while data_generation:
                    data_train = process_task_batch(next(meta_train_generator), device=torch.device('cpu'), with_origin_label=True)
                    for dr in train_data_extract_reservoir:
                        try: dr.put_nowait(data_train)
                        except queue.Full: pass
                    time.sleep(0.001)
            
            def valid_pipe_fill():
                global valid_ens_data_load_number
                while data_generation:
                    data_valid = process_task_batch(next(meta_valid_generator), device=torch.device('cpu'), with_origin_label=False)
                    for dr in valid_data_extract_reservoir:
                        try: dr.put_nowait(data_valid)
                        except queue.Full: pass
                        if random.random() < 0.1 and valid_ens_data_load_number < self.eval_tasks:
                            # fill the meta-valid
                            valid_ens_data_load_number += 1
                            label_meta_valid.extend(data_valid[1][1].tolist())
                            for dr in meta_valid_reservoir:
                                dr.put([data_valid[0][0], data_valid[0][1], data_valid[1][0]])
                    time.sleep(0.001)

            def put_data_train_passive(i):
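                # Passive sender: block until sub-process i requests data over its pipe, then answer with the next processed training batch; a False handshake ends the thread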
                while data_generation:
                    try:
                        if train_send[i].recv(): train_send[i].send(train_data_process_reservoir[i].get())
                        else: return
                    except: pass

            def put_data_valid_passive(i):
                while data_generation:
                    try:
                        if valid_send[i].recv():
                            if self.fix_valid:
                                if len(valid_data_cache[i]) == self.hp[i]['eval_tasks']:
                                    # retrieve the ith element
                                    data = valid_data_cache[i][valid_data_pointer[i]]
                                    valid_data_pointer[i] = (valid_data_pointer[i] + 1) % self.hp[i]['eval_tasks']
                                    valid_send[i].send(data)
                                else:
                                    # fill the cache
                                    data = valid_data_process_reservoir[i].get()
                                    valid_data_cache[i].append(data)
                                    valid_send[i].send(data)
                            else:
                                valid_send[i].send(valid_data_process_reservoir[i].get())

                        else: return
                    except: pass
            
            def process_data(i, train=True):
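                # Move data from the extract reservoir to the process reservoir, optionally applying the module's process_data hook in the main process; a False sentinel stops the thread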
                while data_generation:
                    extract_ = train_data_extract_reservoir[i] if train else valid_data_extract_reservoir[i]
                    process_ = train_data_process_reservoir[i] if train else valid_data_process_reservoir[i]
                    data = extract_.get()
                    if data is False: break
                    if self.process_protocol == 'process-in-main':
                        data = self.modules[i].process_data(data[0], data[1], train, apply_device_to_hp(self.hp[i], self.devices[i]))
                    process_.put(data)
            
            thread_pool = [threading.Thread(target=train_pipe_fill), threading.Thread(target=valid_pipe_fill)] + \
                [threading.Thread(target=put_data_train_passive, args=(i,)) for i in range(self.total_exp)] + \
                [threading.Thread(target=put_data_valid_passive, args=(i,)) for i in range(self.total_exp)] + \
                [threading.Thread(target=process_data, args=(i, train)) for i, train in itertools.product(range(self.total_exp), [True, False])]
            
            for th in thread_pool: th.daemon = True; th.start()

            try:
                # leave roughly 20 minutes for test-time decoding
                for p in procs: p.join(max(self.timer.time_left() - 60 * 20, 0.1))
            
                self.timer.begin('clear env')
                # terminate proc that is out-of-time
                LOGGER.info('Main meta-train is done', '' if self.timer.time_left() > 60 else 'time out exit')
                LOGGER.info('time left', self.timer.time_left(), 's')
                for p in procs:
                    if p.is_alive():
                        p.terminate()
                
                LOGGER.info('all process terminated')

                data_generation = False
                
                LOGGER.info('send necessary messages in case of block')
                # solve the pipe block
                try:
                    for s in train_recv + valid_recv: s.send(False)
                    for s in train_send + train_recv + valid_send + valid_recv: s.close()
                except:
                    LOGGER.error('weird, it should not fire any errors, but it just did')
                
                # solve the block of extract reservoir
                for q in train_data_extract_reservoir + valid_data_extract_reservoir:
                    if q.empty():
                        q.put(False)

                for q in train_data_process_reservoir + valid_data_process_reservoir:
                    if q.full():
                        q.get()
                    elif q.empty():
                        q.put(False)

                LOGGER.info('wait for all data thread')
                for p in thread_pool: p.join()
                LOGGER.info('wait for sub process to exit')
                for p in procs: p.join()
                self.timer.end('clear env')
                LOGGER.info('clear env', self.timer.query_time_by_name('clear env'))
                
                self.timer.end('main training')
            except Exception:
                LOGGER.info('error occurred in main process')
                traceback.print_exc()

            LOGGER.info('spawn total {} meta valid tasks. main training time {}'.format(valid_ens_data_load_number, self.timer.query_time_by_name('main training')))
            
            self.timer.begin('load learner')

            self.meta_learners = [None] * self.total_exp

            def load_model(args):
                module, hp, i = args
                self.meta_learners[i] = module.load_model(hp)

            pool = [threading.Thread(target=load_model, args=((self.modules[i], self.hp[i], i), )) for i in range(self.total_exp)]
            for p in pool: p.daemon=True; p.start()
            for p in pool: p.join()

            self.timer.end('load learner')
            LOGGER.info('load learner done, time spent', self.timer.query_time_by_name('load learner'))
            
            if not isinstance(self.ensemble, int):
                # auto-ensemble by exhaustive search
                procs = []
                reses = [None] * len(self.meta_learners)
                
                self.timer.begin('validation')
                
                recv_list, sent_list = [], []
                for i in range(self.total_exp):
                    r, s = Pipe(True)
                    r.send(True)
                    recv_list.append(r)
                    sent_list.append(s)

                processes = [mp.Process(target=predict, args=(
                    self.meta_learners[i],
                    recv_list[i],
                    self.eval_tasks,
                    self.hp[i]['device'],
                    {
                        'time_fired': time.time(),
                        'taskid': i
                    }
                )) for i in range(self.total_exp)]

                for p in processes: p.daemon = True; p.start()
                
                # start sub thread to pass data
                def pass_meta_data(i):
                    for _ in range(self.eval_tasks):
                        if sent_list[i].recv():
                            sent_list[i].send(meta_valid_reservoir[i].get())
                
                threads = [threading.Thread(target=pass_meta_data, args=(i, )) for i in range(self.total_exp)]
                for t in threads: t.daemon = True; t.start()
                
                for _ in range(self.eval_tasks - valid_ens_data_load_number):
                    data_valid = next(meta_valid_generator)
                    data_valid = process_task_batch(data_valid, device=torch.device('cpu'), with_origin_label=False)
                    label_meta_valid.extend(data_valid[1][1].tolist())
                    for dr in meta_valid_reservoir:
                        dr.put([data_valid[0][0], data_valid[0][1], data_valid[1][0]])
                    # LOGGER.info('put data!')
                LOGGER.info('all data done!')
                LOGGER.info(len(label_meta_valid))
                
                # now we can receive data
                for t in threads: t.join()
                reses = [sent_list[i].recv()['res'] for i in range(self.total_exp)]
                for send in sent_list:
                    send.send(True)
                # for p in processes: p.join()
                # every res in reses is a np.array of shape (eval_task * WAY * QUERY) * WAY
                ENS_VALID_TASK = 100
                ENS_VALID_ELEMENT = ENS_VALID_TASK * 5 * 19
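                # ENS_VALID_TASK tasks of (presumably) 5-way x 19-query predictions each, matching the WAY * QUERY layout noted above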
                reses_test_list = [deepcopy(res[-ENS_VALID_ELEMENT:]) for res in reses]

                self.timer.end('validation')
                LOGGER.info('valid data predict done', self.timer.query_time_by_name('validation'))
                
                weight = [1.] * len(self.meta_learners)
                labels = np.array(label_meta_valid, dtype=np.int64)                         # 19000
                acc_o = ((np.array(weight)[:,None, None] / sum(weight) * np.array(reses)).sum(axis=0).argmax(axis=1) == labels).astype(np.float64).mean()
                reses = np.array(reses, dtype=np.float64).transpose((1, 0, 2))
                reses_test = reses[-ENS_VALID_ELEMENT:].reshape(ENS_VALID_ELEMENT, -1)
                reses = reses[:-ENS_VALID_ELEMENT]
                reses = reses.reshape(len(reses), -1)
                labels_test = labels[-ENS_VALID_ELEMENT:]
                labels = labels[:-ENS_VALID_ELEMENT]
                LOGGER.info('voting result', acc_o)

                self.timer.begin('ensemble')

                # mp.set_start_method('fork', True)
                pool = mp.Pool(3)
                result = pool.map(ensemble_on_data, [
                    # (GBMEnsembler(), reses, labels, 'gbm'), # currently, gbm has some problems when save/load
                    (GLMEnsembler(), reses, labels, 'glm'),
                    (NBEnsembler(), reses, labels, 'nb'),
                    (RFEnsembler(), reses, labels, 'rf') # too over-fit on simple dataset
                ])

                # test the ensemble model
                def acc(logit, label):
                    return (logit.argmax(axis=1) == label).mean()
                res_test = [x[0]._predict(reses_test) for x in result]
                acc_test = [acc(r, labels_test) for r in res_test]
                acc_single_test = [acc(np.array(r), labels_test) for r in reses_test_list]
                LOGGER.info('ensemble test', 'glm', 'nb', 'rf', acc_test)
                LOGGER.info('single test', acc_single_test)

                if max(acc_test) > max(acc_single_test):
                    LOGGER.info("will use ensemble model")
                    #idx_acc_max = np.argmax([x[1] for x in result])
                    idx_acc_max = np.argmax(acc_test)
                    self.timer.end('ensemble')
                    print('best ensembler', ['glm', 'nb', 'rf'][idx_acc_max], 'acc', acc_test[idx_acc_max])
                    print('ensemble done, time cost', self.timer.query_time_by_name('ensemble'))

                    return MyLearner(self.meta_learners, result[idx_acc_max][0], timers=self.timer)
                else:
                    LOGGER.info("will use single model")
                    idx_acc_max = np.argmax(acc_single_test)
                    self.timer.end('ensemble')
                    print('best single model id', idx_acc_max)
                    print('ensemble done, time cost', self.timer.query_time_by_name('ensemble'))

                    # return only the best meta learners
                    return MyLearner([self.meta_learners[idx_acc_max]], 0, self.timer)
            return MyLearner([self.meta_learners[self.ensemble]], 0, timers=self.timer)
Example 3
                                   lr=0.0007,
                                   alpha=0.99,
                                   eps=0.1,
                                   momentum=0.0)
    #Optimizer.share_memory()
    CriticOptimizer.share_memory()
    ActorOptimizer.share_memory()

    lock = Lock()

    num_cpu = 4
    agents = []
    for cpu in range(num_cpu):
        agents.append(Agent(cpu))

    receiver, sender = Pipe()

    agent_threads = []
    for agent in agents:
        thread = Process(target=agent.letsgo,
                         args=(
                             GlobalModel,
                             CriticOptimizer,
                             ActorOptimizer,
                             lock,
                             sender,
                             MAX_EPISODES,
                             MAX_ACTIONS,
                             DISCOUNT_FACTOR,
                             STEPS,
                             Optimizer,
Example 4
class OnlineVaeAlgorithm(TorchBatchRLAlgorithm):
    def __init__(self,
                 vae,
                 vae_trainer,
                 *base_args,
                 vae_save_period=1,
                 vae_training_schedule=vae_schedules.never_train,
                 oracle_data=False,
                 parallel_vae_train=True,
                 vae_min_num_steps_before_training=0,
                 uniform_dataset=None,
                 **base_kwargs):
        super().__init__(*base_args, **base_kwargs)
        assert isinstance(self.replay_buffer, ReplayBuffer)
        self.vae = vae
        self.vae_trainer = vae_trainer
        self.vae_trainer.model = self.vae
        self.vae_save_period = vae_save_period
        self.vae_training_schedule = vae_training_schedule
        self.oracle_data = oracle_data

        self.parallel_vae_train = parallel_vae_train
        self.vae_min_num_steps_before_training = vae_min_num_steps_before_training
        self.uniform_dataset = uniform_dataset

        self._vae_training_process = None
        self._update_subprocess_vae_thread = None
        self._vae_conn_pipe = None

    def _train(self):
        super()._train()
        self._cleanup()

    def _end_epoch(self, epoch):
        self._train_vae(epoch)
        gt.stamp('vae training')
        super()._end_epoch(epoch)

    def _log_stats(self, epoch):
        self._log_vae_stats()
        super()._log_stats(epoch)

    def to(self, device):
        self.vae.to(device)
        super().to(device)

    def _get_snapshot(self):
        snapshot = super()._get_snapshot()
        assert 'vae' not in snapshot
        snapshot['vae'] = self.vae

        snapshot['replay_buffer'] = dict(
            _obs=self.replay_buffer._obs,
            _actions=self.replay_buffer._actions,
            _next_obs=self.replay_buffer._next_obs,
            _terminals=self.replay_buffer._terminals,
            _size=self.replay_buffer._size,
            _top=self.replay_buffer._top,
            _idx_to_future_obs_idx=self.replay_buffer._idx_to_future_obs_idx)

        return snapshot

    """
    VAE-specific Code
    """

    def _train_vae(self, epoch):
        if self.parallel_vae_train and self._vae_training_process is None:
            self.init_vae_training_subprocess()
        should_train, amount_to_train = self.vae_training_schedule(epoch)
        rl_start_epoch = int(self.min_num_steps_before_training /
                             (self.num_expl_steps_per_train_loop *
                              self.num_train_loops_per_epoch))
        if should_train or epoch <= (rl_start_epoch - 1):
            if self.parallel_vae_train:
                assert self._vae_training_process.is_alive()
                # Make sure the last vae update has finished before starting
                # another one
                if self._update_subprocess_vae_thread is not None:
                    self._update_subprocess_vae_thread.join()
                self._update_subprocess_vae_thread = Thread(
                    target=OnlineVaeAlgorithm.
                    update_vae_in_training_subprocess,
                    args=(self, epoch, ptu.device))
                self._update_subprocess_vae_thread.start()
                self._vae_conn_pipe.send((amount_to_train, epoch))
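                # the training subprocess consumes this request; update_vae_in_training_subprocess later receives the refreshed VAE state over the same pipe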
            else:
                _train_vae(self.vae_trainer, self.replay_buffer, epoch,
                           amount_to_train)
                self.replay_buffer.refresh_latents(epoch)
                _test_vae(
                    self.vae_trainer,
                    epoch,
                    self.replay_buffer,
                    vae_save_period=self.vae_save_period,
                    uniform_dataset=self.uniform_dataset,
                )

    def _log_vae_stats(self):
        logger.record_dict(
            self.vae_trainer.get_diagnostics(),
            prefix='vae_trainer/',
        )

    def _cleanup(self):
        if self.parallel_vae_train:
            self._vae_conn_pipe.close()
            self._vae_training_process.terminate()

    def init_vae_training_subprocess(self):
        assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer)

        self._vae_conn_pipe, process_pipe = Pipe()
        self._vae_training_process = Process(
            target=subprocess_train_vae_loop,
            args=(
                process_pipe,
                self.vae,
                self.vae.state_dict(),
                self.replay_buffer,
                self.replay_buffer.get_mp_info(),
                ptu.device,
            ))
        self._vae_training_process.start()
        self._vae_conn_pipe.send(self.vae_trainer)

    def update_vae_in_training_subprocess(self, epoch, device):
        self.vae.__setstate__(self._vae_conn_pipe.recv())
        self.vae.to(device)
        _test_vae(
            self.vae_trainer,
            epoch,
            self.replay_buffer,
            vae_save_period=self.vae_save_period,
            uniform_dataset=self.uniform_dataset,
        )
Example 5
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = gym.make(args.env_name)

    input_size = env.observation_space.shape  # observation shape
    output_size = env.action_space.n  # number of discrete actions

    if 'Breakout' in args.env_name:
        output_size -= 1

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    predictor_path = os.path.join(args.save_dir, args.env_name + '.pred')
    target_path = os.path.join(args.save_dir, args.env_name + '.target')

    writer = SummaryWriter(log_dir=args.log_dir)

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(args.ext_gamma)
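    # running discounted return of intrinsic rewards per env; its statistics (via reward_rms) normalize the intrinsic rewards below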

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) +
                           list(rnd.predictor.parameters()),
                           lr=args.lr)

    if args.load_model:
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(args.env_name,
                                is_render,
                                idx,
                                child_conn,
                                sticky_action=args.sticky_action,
                                p=args.sticky_action_prob,
                                max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize observation
    print('Initializing observation normalization...')
    next_obs = []
    for step in range(args.num_step * args.pre_obs_norm_steps):
        actions = np.random.randint(0, output_size, size=(args.num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            next_state, reward, done, realdone, log_reward = parent_conn.recv()
            next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (args.num_step * args.num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(
                model, device,
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv(
                )
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = intrinsic reward + extrinsic reward
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device,
                                                np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / args.num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / args.num_worker,
                          global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob',
                          total_logging_action_probs.max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward: compute target and advantage
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, args.ext_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # intrinsic reward: compute target and advantage
        # non-episodic (done mask is all zeros)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, args.int_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target,
                    total_action, total_adv,
                    ((total_next_obs - obs_rms.mean) /
                     np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs)

        if global_step % (args.num_worker * args.num_step *
                          args.save_interval) == 0:
            print('Global Step: {}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)
Example 6
class OnlineVaeOffpolicyAlgorithm(TorchBatchRLAlgorithm):
    def __init__(self,
                 vae,
                 vae_trainer,
                 *base_args,
                 vae_save_period=1,
                 vae_training_schedule=vae_schedules.never_train,
                 oracle_data=False,
                 parallel_vae_train=True,
                 vae_min_num_steps_before_training=0,
                 uniform_dataset=None,
                 dataset_path=None,
                 rl_offpolicy_num_training_steps=0,
                 **base_kwargs):
        super().__init__(*base_args, **base_kwargs)
        assert isinstance(self.replay_buffer, OnlineVaeRelabelingBuffer)
        self.vae = vae
        self.vae_trainer = vae_trainer
        self.vae_trainer.model = self.vae
        self.vae_save_period = vae_save_period
        self.vae_training_schedule = vae_training_schedule
        self.oracle_data = oracle_data

        self.parallel_vae_train = parallel_vae_train
        self.vae_min_num_steps_before_training = vae_min_num_steps_before_training
        self.uniform_dataset = uniform_dataset

        self._vae_training_process = None
        self._update_subprocess_vae_thread = None
        self._vae_conn_pipe = None

        self.dataset_path = dataset_path
        if self.dataset_path:
            self.load_dataset(dataset_path)

        # train Q and policy rl_offpolicy_num_training_steps times
        self.rl_offpolicy_num_training_steps = rl_offpolicy_num_training_steps

    def pretrain(self):
        for _ in range(self.rl_offpolicy_num_training_steps):
            train_data = self.replay_buffer.random_batch(self.batch_size)
            self.trainer.train(train_data)

    def load_dataset(self, dataset_path):
        dataset = load_local_or_remote_file(dataset_path)
        dataset = dataset.item()

        observations = dataset['observations']
        actions = dataset['actions']

        # dataset['observations'].shape # (2000, 50, 6912)
        # dataset['actions'].shape # (2000, 50, 2)
        # dataset['env'].shape # (2000, 6912)
        N, H, imlength = observations.shape

        self.vae.eval()
        for n in range(N):
            x0 = ptu.from_numpy(dataset['env'][n:n + 1, :] / 255.0)
            x = ptu.from_numpy(observations[n, :, :] / 255.0)
            latents = self.vae.encode(x, x0, distrib=False)

            r1, r2 = self.vae.latent_sizes
            conditioning = latents[0, r1:]
            goal = torch.cat(
                [ptu.randn(self.vae.latent_sizes[0]), conditioning])
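            # goal latent: a random sample for the unconditioned block, concatenated with this trajectory's conditioning latent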
            goal = ptu.get_numpy(goal)  # latents[-1, :]

            latents = ptu.get_numpy(latents)
            latent_delta = latents - goal
            distances = np.zeros((H - 1, 1))
            for i in range(H - 1):
                distances[i, 0] = np.linalg.norm(latent_delta[i + 1, :])

            terminals = np.zeros((H - 1, 1))
            # terminals[-1, 0] = 1
            path = dict(
                observations=[],
                actions=actions[n, :H - 1, :],
                next_observations=[],
                rewards=-distances,
                terminals=terminals,
            )

            for t in range(H - 1):
                # reward = -np.linalg.norm(latent_delta[i, :])

                obs = dict(
                    latent_observation=latents[t, :],
                    latent_achieved_goal=latents[t, :],
                    latent_desired_goal=goal,
                )
                next_obs = dict(
                    latent_observation=latents[t + 1, :],
                    latent_achieved_goal=latents[t + 1, :],
                    latent_desired_goal=goal,
                )

                path['observations'].append(obs)
                path['next_observations'].append(next_obs)

            # import ipdb; ipdb.set_trace()
            self.replay_buffer.add_path(path)

    def _end_epoch(self):
        self._train_vae(self.epoch)
        timer.stamp('vae training')
        super()._end_epoch()

    def _get_diagnostics(self):
        vae_log = self._get_vae_diagnostics().copy()
        vae_log.update(super()._get_diagnostics())
        return vae_log

    def to(self, device):
        self.vae.to(device)
        super().to(device)

    """
    VAE-specific Code
    """

    def _train_vae(self, epoch):
        if self.parallel_vae_train and self._vae_training_process is None:
            self.init_vae_training_subprocess()
        should_train, amount_to_train = self.vae_training_schedule(epoch)
        rl_start_epoch = int(self.min_num_steps_before_training /
                             (self.num_expl_steps_per_train_loop *
                              self.num_train_loops_per_epoch))
        if should_train:  # or epoch <= (rl_start_epoch - 1):
            if self.parallel_vae_train:
                assert self._vae_training_process.is_alive()
                # Make sure the last vae update has finished before starting
                # another one
                if self._update_subprocess_vae_thread is not None:
                    self._update_subprocess_vae_thread.join()
                self._update_subprocess_vae_thread = Thread(
                    target=OnlineVaeAlgorithm.
                    update_vae_in_training_subprocess,
                    args=(self, epoch, ptu.device))
                self._update_subprocess_vae_thread.start()
                self._vae_conn_pipe.send((amount_to_train, epoch))
            else:
                _train_vae(self.vae_trainer, epoch, self.replay_buffer,
                           amount_to_train)
                self.replay_buffer.refresh_latents(epoch)
                _test_vae(
                    self.vae_trainer,
                    epoch,
                    self.replay_buffer,
                    vae_save_period=self.vae_save_period,
                    uniform_dataset=self.uniform_dataset,
                )

    def _get_vae_diagnostics(self):
        return add_prefix(
            self.vae_trainer.get_diagnostics(),
            prefix='vae_trainer/',
        )

    def _cleanup(self):
        if self.parallel_vae_train:
            self._vae_conn_pipe.close()
            self._vae_training_process.terminate()

    def init_vae_training_subprocess(self):
        assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer)

        self._vae_conn_pipe, process_pipe = Pipe()
        self._vae_training_process = Process(
            target=subprocess_train_vae_loop,
            args=(
                process_pipe,
                self.vae,
                self.vae.state_dict(),
                self.replay_buffer,
                self.replay_buffer.get_mp_info(),
                ptu.device,
            ))
        self._vae_training_process.start()
        self._vae_conn_pipe.send(self.vae_trainer)

    def update_vae_in_training_subprocess(self, epoch, device):
        self.vae.__setstate__(self._vae_conn_pipe.recv())
        self.vae.to(device)
        _test_vae(
            self.vae_trainer,
            epoch,
            self.replay_buffer,
            vae_save_period=self.vae_save_period,
            uniform_dataset=self.uniform_dataset,
        )
Example 7
    def __init__(self, args, model_constructor, env_constructor):
        self.args = args
        self.policy_string = self.compute_policy_type()

        #Evolution
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Genealogy tool
        self.genealogy = Genealogy()

        #Initialize population
        self.population = self.manager.list()
        seed = True
        for _ in range(args.pop_size):
            self.population.append(
                model_constructor.make_model(self.policy_string, seed=seed))
            seed = False

        #SEED
        #self.population[0].load_state_dict(torch.load('Results/Auxiliary/_bestcerl_td3_s2019_roll10_pop10_portfolio10'))

        #Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        #Turn off gradients and put in eval mode
        for actor in self.population:
            actor = actor.cpu()
            actor.eval()

        #Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        #Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy,
                                              args.portfolio_id,
                                              model_constructor)

        #Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            self.rollout_bucket.append(
                model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################

        #Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
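        # one task/result pipe pair per population member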
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, 'evo', self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], self.data_bucket,
                          self.population, env_constructor))
            for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 'pg', self.task_pipes[id][1],
                          self.result_pipes[id][0], self.data_bucket,
                          self.rollout_bucket, env_constructor))
            for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(
            model_constructor.make_model(self.policy_string))

        #Test workers (one per test episode)
        self.test_task_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_result_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 'test', self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], None,
                          self.test_bucket, env_constructor))
            for id in range(env_constructor.dummy_env.test_size)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)

        #Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.best_r1_score = 0.0
        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0
        self.test_trace = []
Example 8
    if is_load_model:
        if use_cuda:
            agent.model.load_state_dict(torch.load(load_model_path))
        else:
            agent.model.load_state_dict(
                torch.load(load_model_path, map_location='cpu'))

    if not is_training:
        agent.model.eval()

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = MarioEnvironment(env_id, is_render, idx, child_conn)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    global_step = 0
    recent_prob = deque(maxlen=10)
Example 9
def ars(env_name,
        policy,
        n_epochs,
        env_config={},
        n_workers=8,
        step_size=.02,
        n_delta=32,
        n_top=16,
        exp_noise=0.03,
        zero_policy=True,
        learn_means=True,
        postprocess=postprocess_default):
    """
    Augmented Random Search
    https://arxiv.org/pdf/1803.07055

    Args:

    Returns:

    Example:
    """
    torch.autograd.set_grad_enabled(False)

    proc_list = []
    master_pipe_list = []

    for i in range(n_workers):
        master_con, worker_con = Pipe()
        proc = Process(target=worker_fn,
                       args=(worker_con, env_name, env_config, policy,
                             postprocess))
        proc.start()
        proc_list.append(proc)
        master_pipe_list.append(master_con)

    W = torch.nn.utils.parameters_to_vector(policy.parameters())
    n_param = W.shape[0]

    if zero_policy:
        W = torch.zeros_like(W)

    env = gym.make(env_name, **env_config)
    s_mean = policy.state_means
    s_std = policy.state_std
    total_steps = 0
    env.close()

    r_hist = []
    lr_hist = []

    exp_dist = torch.distributions.Normal(torch.zeros(n_delta, n_param),
                                          torch.ones(n_delta, n_param))

    for epoch in range(n_epochs):

        deltas = exp_dist.sample()
        pm_W = torch.cat((W + (deltas * exp_noise), W - (deltas * exp_noise)))
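        # antithetic sampling: evaluate both the +noise and -noise perturbations of the current parameters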

        for i, Ws in enumerate(pm_W):
            master_pipe_list[i % n_workers].send((Ws, s_mean, s_std))

        results = []
        for i, _ in enumerate(pm_W):
            results.append(master_pipe_list[i % n_workers].recv())

        states = torch.empty(0)
        p_returns = []
        m_returns = []
        l_returns = []
        top_returns = []

        for p_result, m_result in zip(results[:n_delta], results[n_delta:]):
            ps, pr, plr = p_result
            ms, mr, mlr = m_result

            states = torch.cat((states, ms, ps), dim=0)
            p_returns.append(pr)
            m_returns.append(mr)
            l_returns.append(plr)
            l_returns.append(mlr)
            top_returns.append(max(pr, mr))

        top_idx = sorted(range(len(top_returns)),
                         key=lambda k: top_returns[k],
                         reverse=True)[:n_top]
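        # keep the n_top perturbation pairs, ranked by the better of their +/- returns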
        p_returns = torch.stack(p_returns)[top_idx]
        m_returns = torch.stack(m_returns)[top_idx]
        l_returns = torch.stack(l_returns)[top_idx]

        lr_hist.append(l_returns.mean())
        r_hist.append((p_returns.mean() + m_returns.mean()) / 2)

        ep_steps = states.shape[0]
        s_mean = update_mean(states, s_mean, total_steps)
        s_std = update_std(states, s_std, total_steps)
        total_steps += ep_steps

        if epoch % 5 == 0:
            print(
                f"epoch: {epoch}, reward: {lr_hist[-1].item()}, processed reward: {r_hist[-1].item()} "
            )

        W = W + (step_size / (n_delta * torch.cat(
            (p_returns, m_returns)).std() + 1e-6)) * torch.sum(
                (p_returns - m_returns) * deltas[top_idx].T, dim=1)
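        # ^ ARS update: top deltas weighted by their return differences (R+ - R-), scaled by step_size / (n_delta * std of the top returns)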

    for pipe in master_pipe_list:
        pipe.send("STOP")
    policy.state_means = s_mean
    policy.state_std = s_std
    torch.nn.utils.vector_to_parameters(W, policy.parameters())
    return policy, r_hist, lr_hist
Example 10
def main():
    print({section: dict(config[section]) for section in config.sections()})
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id),
                                            COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # observation shape
    output_size = env.action_space.n  # number of discrete actions

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    is_render = True
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    use_cuda = False
    use_gae = default_config.getboolean('UseGAE')
    #use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = 1

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
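    # each PPO update splits the rollout of num_step * num_worker transitions into mini_batch minibatches of this size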
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])

    sticky_action = False
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    agent = RNDAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae)

    print('Loading Pre-trained model....')
    if use_cuda:
        agent.model.load_state_dict(torch.load(model_path))
        agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
        agent.rnd.target.load_state_dict(torch.load(target_path))
    else:
        agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
        agent.rnd.predictor.load_state_dict(
            torch.load(predictor_path, map_location='cpu'))
        agent.rnd.target.load_state_dict(
            torch.load(target_path, map_location='cpu'))
    print('End load...')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id,
                        is_render,
                        idx,
                        child_conn,
                        sticky_action=sticky_action,
                        p=action_prob,
                        life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    steps = 0
    rall = 0
    rd = False
    intrinsic_reward_list = []
    while not rd:
        steps += 1
        actions, value_ext, value_int, policy = agent.get_action(
            np.float32(states) / 255.)

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            rall += r
            next_states = s.reshape([1, 4, 84, 84])
            next_obs = s[3, :, :].reshape([1, 1, 84, 84])

        # total reward = intrinsic reward + extrinsic reward
        intrinsic_reward = agent.compute_intrinsic_reward(next_obs)
        intrinsic_reward_list.append(intrinsic_reward)
        states = next_states[:, :, :, :]

        if rd:
            intrinsic_reward_list = (
                intrinsic_reward_list -
                np.mean(intrinsic_reward_list)) / np.std(intrinsic_reward_list)
            with open('int_reward', 'wb') as f:
                pickle.dump(intrinsic_reward_list, f)
            steps = 0
            rall = 0
Example 11
	def __init__(self, args):
		self.args = args
		self.evolver = SSNE(self.args)

		#MP TOOLS
		self.manager = Manager()

		#Genealogy tool
		self.genealogy = Genealogy()

		#Initialize population
		self.pop = self.manager.list()
		for _ in range(args.pop_size):
			wwid = self.genealogy.new_id('evo')
			if ALGO == 'SAC': self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
			else: self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

		if ALGO == "SAC": self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
		else:
			self.best_policy = Actor(args.state_dim, args.action_dim, -1)


		#Turn off gradients and put in eval mode
		for actor in self.pop:
			actor = actor.cpu()
			actor.eval()

		#Init BUFFER
		self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

		#Initialize portfolio of learners
		self.portfolio = []
		self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
		self.rollout_bucket = self.manager.list()
		for _ in range(len(self.portfolio)):
			if ALGO == 'SAC': self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
			else: self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1))



		# Initialize shared data bucket
		self.data_bucket = self.replay_buffer.tuples

		############## MULTIPROCESSING TOOLS ###################


		#Evolutionary population Rollout workers
		self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
		self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
		self.evo_workers = [Process(target=rollout_worker, args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO)) for id in range(args.pop_size)]
		for worker in self.evo_workers: worker.start()
		self.evo_flag = [True for _ in range(args.pop_size)]

		#Learner rollout workers
		self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
		self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
		self.workers = [Process(target=rollout_worker, args=(id, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO)) for id in range(args.rollout_size)]
		for worker in self.workers: worker.start()
		self.roll_flag = [True for _ in range(args.rollout_size)]

		#Test bucket
		self.test_bucket = self.manager.list()
		if ALGO == 'SAC':
			self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
		else:
			self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

		#5 Test workers
		self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
		self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
		self.test_workers = [Process(target=rollout_worker, args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO)) for id in range(TEST_SIZE)]
		for worker in self.test_workers: worker.start()
		self.test_flag = False

		#Meta-learning controller (Resource Distribution)
		self.allocation = [] #Allocation controls the resource allocation across learners
		for i in range(args.rollout_size): self.allocation.append(i % len(self.portfolio)) #Start uniformly (equal resources)
		#self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistics (each node is a learner), used to compute UCB scores

		#Trackers
		self.best_score = 0.0; self.gen_frames = 0; self.total_frames = 0; self.best_shaped_score = None; self.test_score = None; self.test_std = None
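
A minimal sketch of the Manager-backed bucket pattern used above to share policies with rollout workers, assuming a plain dict as a stand-in for a network; indexing the managed list goes through a proxy, so workers see whatever the learner last published.

from multiprocessing import Manager, Process


def rollout_worker(wid, bucket):
    # Indexing the managed list returns a copy of the shared object via the proxy.
    params = bucket[0]
    print('worker', wid, 'sees params', params)


if __name__ == '__main__':
    manager = Manager()
    rollout_bucket = manager.list()
    rollout_bucket.append({'w': [0.0, 0.0]})   # stand-in for a policy's weights

    workers = [Process(target=rollout_worker, args=(wid, rollout_bucket))
               for wid in range(2)]
    for w in workers:
        w.start()

    rollout_bucket[0] = {'w': [1.0, 1.0]}      # learner publishes updated weights
    for w in workers:
        w.join()
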
Esempio n. 12
0
    def learn(self, n_epochs):
        torch.autograd.set_grad_enabled(False)

        proc_list = []
        master_pipe_list = []
        learn_start_idx = copy.copy(self.total_epochs)

        for i in range(self.n_workers):
            master_con, worker_con = Pipe()
            proc = Process(target=worker_fn,
                           args=(worker_con, self.env_name, self.env_config,
                                 self.policy, self.postprocessor, self.seed))
            proc.start()
            proc_list.append(proc)
            master_pipe_list.append(master_con)

        W = torch.nn.utils.parameters_to_vector(self.policy.parameters())
        n_param = W.shape[0]

        torch.manual_seed(self.seed)
        exp_dist = torch.distributions.Normal(
            torch.zeros(self.n_delta, n_param),
            torch.ones(self.n_delta, n_param))

        for _ in range(n_epochs):

            deltas = exp_dist.sample()
            pm_W = torch.cat(
                (W + (deltas * self.exp_noise), W - (deltas * self.exp_noise)))

            for i, Ws in enumerate(pm_W):
                master_pipe_list[i % self.n_workers].send(
                    (Ws, self.policy.state_means, self.policy.state_std))

            results = []
            for i, _ in enumerate(pm_W):
                results.append(master_pipe_list[i % self.n_workers].recv())

            states = torch.empty(0)
            p_returns = []
            m_returns = []
            l_returns = []
            top_returns = []

            for p_result, m_result in zip(results[:self.n_delta],
                                          results[self.n_delta:]):
                ps, pr, plr = p_result
                ms, mr, mlr = m_result

                states = torch.cat((states, ms, ps), dim=0)
                p_returns.append(pr)
                m_returns.append(mr)
                l_returns.append(plr)
                l_returns.append(mlr)
                top_returns.append(max(pr, mr))

            top_idx = sorted(range(len(top_returns)),
                             key=lambda k: top_returns[k],
                             reverse=True)[:self.n_top]
            p_returns = torch.stack(p_returns)[top_idx]
            m_returns = torch.stack(m_returns)[top_idx]
            l_returns = torch.stack(l_returns)[top_idx]

            self.lr_hist.append(l_returns.mean())
            self.r_hist.append((p_returns.mean() + m_returns.mean()) / 2)

            ep_steps = states.shape[0]
            self.policy.state_means = update_mean(states,
                                                  self.policy.state_means,
                                                  self.total_steps)
            self.policy.state_std = update_std(states, self.policy.state_std,
                                               self.total_steps)

            self.total_steps += ep_steps
            self.total_epochs += 1

            W = W + (self.step_size / (self.n_delta * torch.cat(
                (p_returns, m_returns)).std() + 1e-6)) * torch.sum(
                    (p_returns - m_returns) * deltas[top_idx].T, dim=1)

        for pipe in master_pipe_list:
            pipe.send("STOP")
        for proc in proc_list:
            proc.join()

        torch.nn.utils.vector_to_parameters(W, self.policy.parameters())
        return self.lr_hist[learn_start_idx:]
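
A minimal sketch of the round trip performed in the loop above: flatten the policy with parameters_to_vector, ship perturbed copies to a worker over a Pipe, and collect a scalar return for each. The squared-norm score below is a toy stand-in for an environment rollout.

import torch
import torch.nn as nn
from multiprocessing import Pipe, Process


def eval_worker(conn):
    # Load each received flat weight vector into a local policy and send back a score.
    policy = nn.Linear(4, 2)
    while True:
        msg = conn.recv()
        if msg is None:        # shutdown sentinel
            break
        torch.nn.utils.vector_to_parameters(msg, policy.parameters())
        score = float(sum(p.pow(2).sum() for p in policy.parameters()))
        conn.send(score)


if __name__ == '__main__':
    master_con, worker_con = Pipe()
    proc = Process(target=eval_worker, args=(worker_con,))
    proc.start()

    policy = nn.Linear(4, 2)
    W = torch.nn.utils.parameters_to_vector(policy.parameters()).detach()
    delta = torch.randn_like(W) * 0.02
    returns = []
    for Ws in (W + delta, W - delta):   # the +/- perturbation pair
        master_con.send(Ws)
        returns.append(master_con.recv())
    master_con.send(None)
    proc.join()
    print('plus/minus returns:', returns)
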
Esempio n. 13
0
    def train(self,
              training_steps,
              no_actors,
              learning_rate,
              epsilons,
              n_step,
              beta,
              alpha,
              batch_size,
              policy_update,
              discount_factor,
              max_actions_per_episode,
              size_local_memory_buffer,
              eval_freq,
              replay_size_before_sample=None,
              no_envs=1):

        if batch_size > self.replay_mem_size:
            raise ValueError(
                "Please make sure replay memory size is larger than batch size."
            )
        if n_step < 1:
            raise ValueError("Please have n_step >= 1.")
        if size_local_memory_buffer <= 1:
            raise ValueError("Please let size_local_memory_buffer > 1.")
        if not isinstance(epsilons, list):
            raise ValueError("Please provide epsilons as a list.")
        if len(epsilons) != no_envs * no_actors:
            raise ValueError(
                "Mismatch in epsilons and no_envs*no_actors. Please let len(epsilons) == no_envs*no_actors."
            )

        world_size = no_actors + 2  # (+ learner process and memory process)
        actor_processes = []

        # Communication channels between processes
        transition_queue_to_memory = Queue()
        transition_queue_from_memory = Queue()
        update_priorities_queue_to_memory = Queue()

        # Communication pipes between the learner and the actors, one per actor,
        # used to send new network weights to the actors. The pipes are created
        # two-way (duplex=True), so messages can also flow back to the learner.
        con_learner_actor = []
        con_actor_learner = []
        for a in range(no_actors):
            con_1, con_2 = Pipe(duplex=True)
            con_learner_actor.append(con_1)
            con_actor_learner.append(con_2)

        con_learner_memory, con_memory_learner = Pipe(duplex=True)
        """
            Learner Process
        """
        learner_args = {
            "no_actors": no_actors,
            "train_steps": training_steps,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "policy_update": policy_update,
            "discount_factor": discount_factor,
            "optimizer": self.optimizer,
            "policy_net": self.policy_net,
            "policy_config": self.policy_config,
            "device": self.device,
            "transition_queue_from_memory": transition_queue_from_memory,
            "update_priorities_queue_to_memory":
            update_priorities_queue_to_memory,
            "con_actors": con_learner_actor,
            "con_replay_memory": con_learner_memory,
            "eval_freq": eval_freq,
            "env": self.env,
            "env_config": self.env_config,
            "tb_log_dir": self.tb_log_dir,
            "update_tb": self.update_tb
        }

        learner_process = Process(target=self._init_process,
                                  args=(0, world_size, learner, learner_args))
        learner_process.start()
        """
            Memory Process
        """
        mem_args = {
            "capacity": self.replay_mem_size,
            "alpha": alpha,
            "beta": beta,
            "batch_size": batch_size,
            "transition_queue_to_memory": transition_queue_to_memory,
            "transition_queue_from_memory": transition_queue_from_memory,
            "update_priorities_queue_to_memory": update_priorities_queue_to_memory,
            "con_learner": con_memory_learner,
            "replay_size_before_sampling":
                replay_size_before_sample if replay_size_before_sample is not None
                else min(batch_size, int(self.replay_mem_size * 0.25)),
            "tb_log_dir": self.tb_log_dir,
            "update_tb": self.update_tb
        }

        print("Memory Process")
        memory_process = Process(target=self._init_process,
                                 args=(1, world_size, experienceReplayBuffer,
                                       mem_args))

        memory_process.start()
        """
            Actor Processes
        """
        actor_args = {
            "train_steps": training_steps,
            "max_actions_per_episode": max_actions_per_episode,
            "update_policy": policy_update,
            "size_local_memory_buffer": size_local_memory_buffer,
            "model": self.policy_net,
            "model_config": self.policy_config,
            "env": self.env,
            "env_config": self.env_config,
            "no_envs": no_envs,
            "device": self.device,
            "discount_factor": discount_factor,
            "transition_queue_to_memory": transition_queue_to_memory,
            "n_step": n_step
        }

        split = 0
        for rank in range(no_actors):
            next_split = split + no_envs
            actor_args["epsilon"] = epsilons[split:next_split]
            actor_args["con_learner"] = con_actor_learner[rank]

            split = next_split
            actor_process = Process(target=self._init_process,
                                    args=(rank + 2, world_size, actor,
                                          actor_args))

            actor_process.start()
            print("starting actor ", (rank + 2))
            actor_processes.append(actor_process)

        for a in actor_processes:
            a.join()
            print(a, "joined")

        memory_process.join()
        learner_process.join()
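
A minimal sketch contrasting one-way and duplex pipes, assuming only the standard library; the learner/actor connections above use the default duplex form so the same connection can carry weights one way and acknowledgements the other.

from multiprocessing import Pipe, Process


def actor(conn):
    weights = conn.recv()             # new network weights pushed by the learner
    conn.send(('ack', len(weights)))  # duplex pipe: reply on the same connection


if __name__ == '__main__':
    # One-way pipe: the first end can only receive, the second can only send.
    read_end, write_end = Pipe(duplex=False)
    write_end.send('one-way message')
    print(read_end.recv())

    # Two-way (default) pipe, as in the learner<->actor connections above.
    learner_end, actor_end = Pipe(duplex=True)
    p = Process(target=actor, args=(actor_end,))
    p.start()
    learner_end.send([0.1, 0.2, 0.3])   # stand-in for a state_dict
    print(learner_end.recv())
    p.join()
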
Esempio n. 14
0
def main():
    print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id),
                                            COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    is_load_model = False
    is_render = False
    model_path = 'models/{}.model'.format(env_id)
    icm_path = 'models/{}.icm'.format(env_id)

    writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    eta = float(default_config['ETA'])

    clip_grad_norm = float(default_config['ClipGradNorm'])

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 4, 84, 84))

    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(gamma)

    agent = ICMAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  eta=eta,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    if is_load_model:
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id, is_render, idx, child_conn)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    print('Initializing observation normalization parameters...')
    next_obs = []
    steps = 0
    while steps < pre_obs_norm_step:
        steps += num_worker
        actions = np.random.randint(0, output_size, size=(num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[:])

    next_obs = np.stack(next_obs)
    obs_rms.update(next_obs)
    print('Observation normalization initialized.')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_values, total_policy = \
            [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value, policy = agent.get_action(
                (states - obs_rms.mean) / np.sqrt(obs_rms.var))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            # total reward = int reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                (states - obs_rms.mean) / np.sqrt(obs_rms.var),
                (next_states - obs_rms.mean) / np.sqrt(obs_rms.var), actions)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_next_state.append(next_states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_values.append(value)
            total_policy.append(policy)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value, _ = agent.get_action(
            (states - obs_rms.mean) / np.sqrt(obs_rms.var))
        total_values.append(value)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_next_state = np.stack(total_next_state).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_values = np.stack(total_values).transpose()
        total_logging_policy = torch.stack(total_policy).view(
            -1, output_size).cpu().numpy()

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        target, adv = make_train_data(total_int_reward,
                                      np.zeros_like(total_int_reward),
                                      total_values, gamma, num_step,
                                      num_worker)

        adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
        # -----------------------------------------------

        # Step 5. Training!
        agent.train_model(
            (total_state - obs_rms.mean) / np.sqrt(obs_rms.var),
            (total_next_state - obs_rms.mean) / np.sqrt(obs_rms.var), target,
            total_action, adv, total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.icm.state_dict(), icm_path)
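
A minimal sketch of the moments-combining update that a RunningMeanStd-style tracker performs when obs_rms.update(...) or reward_rms.update_from_moments(...) is called above, assuming the standard parallel-variance formula; the class used in the example may differ in details.

import numpy as np


def update_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    # Merge the running moments with the moments of a new batch
    # (parallel-variance combine, Chan et al.).
    delta = batch_mean - mean
    tot_count = count + batch_count
    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    m2 = m_a + m_b + delta ** 2 * count * batch_count / tot_count
    new_var = m2 / tot_count
    return new_mean, new_var, tot_count


if __name__ == '__main__':
    mean, var, count = 0.0, 1.0, 1e-4
    for _ in range(5):
        batch = np.random.randn(128) * 2.0 + 3.0
        mean, var, count = update_from_moments(mean, var, count,
                                               batch.mean(), batch.var(), len(batch))
    print('running mean/std:', mean, np.sqrt(var))
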
Esempio n. 15
0
    def __init__(self, args, model_constructor, env_constructor):
        self.args = args

        #MP TOOLS
        self.manager = Manager()

        #Algo
        self.algo = TD3(model_constructor,
                        actor_lr=args.actor_lr,
                        critic_lr=args.critic_lr,
                        gamma=args.gamma,
                        tau=args.tau,
                        polciy_noise=0.1,
                        policy_noise_clip=0.2,
                        policy_ups_freq=2)

        #Save best policy
        self.best_policy = model_constructor.make_model('Gaussian_FF')
        self.best_policy.stochastic = False

        #Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        #Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        self.rollout_bucket.append(model_constructor.make_model('Gaussian_FF'))
        for actor in self.rollout_bucket:
            actor.stochastic = False
            actor.eval()

        ############## MULTIPROCESSING TOOLS ###################
        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 'pg', self.task_pipes[id][1],
                          self.result_pipes[id][0], self.data_bucket,
                          self.rollout_bucket, env_constructor))
            for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model('Gaussian_FF'))
        for actor in self.test_bucket:
            actor.stochastic = False
            actor.eval()

        #5 Test workers
        self.test_task_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_result_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 'test', self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], None,
                          self.test_bucket, env_constructor))
            for id in range(env_constructor.dummy_env.test_size)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.test_trace = []
        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0
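
A minimal sketch of the task-pipe/result-pipe pairing set up above, one pair per rollout worker, with a toy worker that reports a random fitness; the names are illustrative rather than the repo's actual rollout_worker.

from multiprocessing import Pipe, Process
import random


def toy_rollout_worker(wid, task_conn, result_conn):
    while True:
        task = task_conn.recv()
        if task == 'TERMINATE':
            break
        # Pretend to run one episode for the requested policy id and report a fitness.
        result_conn.send((wid, task, random.random()))


if __name__ == '__main__':
    n_workers = 3
    task_pipes = [Pipe() for _ in range(n_workers)]
    result_pipes = [Pipe() for _ in range(n_workers)]
    workers = [Process(target=toy_rollout_worker,
                       args=(wid, task_pipes[wid][1], result_pipes[wid][0]))
               for wid in range(n_workers)]
    for w in workers:
        w.start()

    for wid in range(n_workers):
        task_pipes[wid][0].send(wid)            # dispatch one rollout per worker
    for wid in range(n_workers):
        print(result_pipes[wid][1].recv())      # collect (worker id, task, fitness)

    for wid in range(n_workers):
        task_pipes[wid][0].send('TERMINATE')
    for w in workers:
        w.join()
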
Esempio n. 16
0
def main():
    total_steps = int(sys.argv[1])
    env_id = str(sys.argv[2])
    int_coef = float(sys.argv[3])
    print("steps: ", total_steps)
    print(env_id)
    print(int_coef)
    print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']
    # env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id),
                                            COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    is_load_model = False
    is_render = False
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    # int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
            agent.rnd.target.load_state_dict(torch.load(target_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))
            agent.rnd.predictor.load_state_dict(
                torch.load(predictor_path, map_location='cpu'))
            agent.rnd.target.load_state_dict(
                torch.load(target_path, map_location='cpu'))
        print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id,
                        is_render,
                        idx,
                        child_conn,
                        sticky_action=sticky_action,
                        p=action_prob,
                        life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    print('Initializing observation normalization parameters...')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('Observation normalization initialized.')

    for i in range(total_steps):
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy, total_policy_np = \
            [], [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(
            np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, gamma,
                                              num_step, num_worker)

        # intrinsic reward calculate
        # Non-episodic: dones are zeroed so the intrinsic return is not cut at episode boundaries
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, int_gamma,
                                              num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        agent.train_model(
            np.float32(total_state) / 255., ext_target, int_target,
            total_action, total_adv,
            ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(
                -5, 5), total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)
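
A minimal sketch of how an RND-style compute_intrinsic_reward can be realized: a frozen, randomly initialized target network and a trained predictor, with the per-sample prediction error serving as the intrinsic reward. The tiny linear networks below are stand-ins for the CNNs the example's agent actually uses.

import torch
import torch.nn as nn


class SmallRND(nn.Module):
    # Toy stand-in for the predictor/target pair behind agent.rnd above.
    def __init__(self, obs_dim=16, feat_dim=8):
        super().__init__()
        self.target = nn.Sequential(nn.Linear(obs_dim, feat_dim))
        self.predictor = nn.Sequential(nn.Linear(obs_dim, feat_dim))
        for p in self.target.parameters():   # the target network stays frozen
            p.requires_grad_(False)

    def intrinsic_reward(self, obs):
        with torch.no_grad():
            target_feat = self.target(obs)
        pred_feat = self.predictor(obs)
        # Per-sample prediction error: novel observations are predicted poorly,
        # so they receive a larger intrinsic reward.
        return (pred_feat - target_feat).pow(2).mean(dim=1)


if __name__ == '__main__':
    rnd = SmallRND()
    obs = torch.randn(4, 16)    # stand-in for normalized, clipped next_obs
    print(rnd.intrinsic_reward(obs))
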
Esempio n. 17
0
    def meta_fit(self, meta_dataset_generator):

        catchable_sigs = set(signal.Signals) - {signal.SIGKILL, signal.SIGSTOP}
        for sig in catchable_sigs:
            signal.signal(
                sig,
                receive_signal)  # Substitute handler of choice for `print`
        LOGGER.debug('My PID: %s' % os.getpid())

        self.timer.begin('main training')
        mp.set_start_method('spawn', force=True)

        # >>> BUG: OS Error: Too many opened files
        # >>> SOLVED: by `ulimit -HSn 4096`
        # Now, we change all the queues to pipe
        self.timer.begin('build data pipeline')
        # every 10 epoch will produce one valid
        train_data_reservoir = [
            queue.Queue(32 * 10) for i in range(len(self.devices))
        ]
        valid_data_reservoir = [
            queue.Queue(200) for i in range(len(self.devices))
        ]
        meta_valid_reservoir = [
            queue.Queue(self.eval_tasks) for i in range(self.total_exp)
        ]
        train_recv, valid_recv = [], []
        train_send, valid_send = [], []
        for i in range(len(self.devices)):
            recv, send = Pipe(True)
            # activate the first handshake
            recv.send(True)
            train_recv.append(recv)
            train_send.append(send)
            recv, send = Pipe(True)
            # activate the first handshake
            recv.send(True)
            valid_recv.append(recv)
            valid_send.append(send)

        def apply_device_to_hp(hp, device):
            hp['device'] = 'cuda:{}'.format(device)
            return hp

        self.timer.end('build data pipeline')

        self.timer.begin('build main proc pipeline')
        clsnum = get_base_class_number(meta_dataset_generator)
        LOGGER.info('base class number detected', clsnum)
        procs = [
            mp.Process(target=run_exp,
                       args=(self.modules[i].MyMetaLearner,
                             apply_device_to_hp(self.hp[i], dev),
                             train_recv[i], valid_recv[i], clsnum))
            for i, dev in enumerate(self.devices)
        ]
        for p in procs:
            p.daemon = True
            p.start()

        self.timer.end('build main proc pipeline')
        LOGGER.info('build data',
                    self.timer.query_time_by_name('build data pipeline'),
                    'build proc',
                    self.timer.query_time_by_name('build main proc pipeline'))
        label_meta_valid = []

        data_generation = True

        self.timer.begin('prepare dataset')
        meta_train_dataset = meta_dataset_generator.meta_train_pipeline.batch(
            1)
        meta_train_generator = cycle(iter(meta_train_dataset))
        meta_valid_dataset = meta_dataset_generator.meta_valid_pipeline.batch(
            1)
        meta_valid_generator = cycle(iter(meta_valid_dataset))
        self.timer.end('prepare dataset')
        LOGGER.info('prepare dataset',
                    self.timer.query_time_by_name('prepare dataset'))

        valid_ens_data_load_number = 0

        def generate_data():
            # manage data globally; nonlocal is needed because the counter is rebound here
            nonlocal valid_ens_data_load_number
            while data_generation:
                for i in range(32 * 10):
                    # load train
                    if not data_generation:
                        break
                    data_train = process_task_batch(next(meta_train_generator),
                                                    device=torch.device('cpu'),
                                                    with_origin_label=True)
                    for dr in train_data_reservoir:
                        try:
                            dr.put_nowait(data_train)
                        except:
                            pass
                    time.sleep(0.0001)

                for i in range(200):
                    # load valid
                    if not data_generation:
                        break
                    data_valid = process_task_batch(next(meta_valid_generator),
                                                    device=torch.device('cpu'),
                                                    with_origin_label=False)
                    for dr in valid_data_reservoir:
                        try:
                            dr.put_nowait(data_valid)
                        except:
                            pass
                    if random.random() < 0.1:
                        for dr in meta_valid_reservoir:
                            try:
                                if dr.qsize() < self.eval_tasks:
                                    valid_ens_data_load_number += 1
                                    dr.put_nowait([
                                        data_valid[0][0], data_valid[0][1],
                                        data_valid[1][0]
                                    ])
                                    label_meta_valid.extend(
                                        data_valid[1][1].tolist())
                            except:
                                pass
                    time.sleep(0.0001)

        def put_data_train_passive(i):
            while data_generation:
                try:
                    if train_send[i].recv():
                        supp, quer = train_data_reservoir[i].get()
                        data = self.modules[i].process_data(
                            supp, quer, True, self.hp[i])
                        train_send[i].send(data)
                    else:
                        return
                except:
                    pass

        def put_data_valid_passive(i):
            while data_generation:
                try:
                    if valid_send[i].recv():
                        supp, quer = valid_data_reservoir[i].get()
                        data = self.modules[i].process_data(
                            supp, quer, False, self.hp[i])
                        valid_send[i].send(data)
                    else:
                        return
                except:
                    pass

        thread_pool = [threading.Thread(target=generate_data)] + \
            [threading.Thread(target=put_data_train_passive, args=(i,)) for i in range(self.total_exp)] + \
            [threading.Thread(target=put_data_valid_passive, args=(i,)) for i in range(self.total_exp)]

        for th in thread_pool:
            th.daemon = True
            th.start()

        try:
            # we leave about 20 min for decoding of test
            for p in procs:
                p.join(max(self.timer.time_left() - 60 * 10, 0.1))

            self.timer.begin('clear env')
            # terminate proc that is out-of-time
            LOGGER.info('Main meta-train is done',
                        '' if self.timer.time_left() > 60 else 'time out exit')
            LOGGER.info('time left', self.timer.time_left(), 's')
            for p in procs:
                if p.is_alive():
                    p.terminate()

            data_generation = False
            # in case there are blocking
            for q in train_data_reservoir + valid_data_reservoir:
                if q.empty():
                    q.put(False)
            for s in train_recv + valid_recv:
                s.send(False)
            for s in train_send + train_recv + valid_send + valid_recv:
                s.close()
            for p in thread_pool:
                p.join()
            self.timer.end('clear env')
            LOGGER.info('clear env',
                        self.timer.query_time_by_name('clear env'))

            self.timer.end('main training')
        except Exception:
            LOGGER.info('error occurred in main process')
            traceback.print_exc()

        LOGGER.info(
            'spawn total {} meta valid tasks. main training time {}'.format(
                valid_ens_data_load_number,
                self.timer.query_time_by_name('main training')))

        self.timer.begin('load learner')

        self.meta_learners = [None] * self.total_exp

        def load_model(args):
            module, hp, i = args
            self.meta_learners[i] = module.load_model(hp)

        pool = [
            threading.Thread(target=load_model,
                             args=((self.modules[i], self.hp[i], i), ))
            for i in range(self.total_exp)
        ]
        for p in pool:
            p.daemon = True
            p.start()
        for p in pool:
            p.join()

        self.timer.end('load learner')
        LOGGER.info('load learner done, time spent',
                    self.timer.query_time_by_name('load learner'))

        if not isinstance(self.ensemble, int):
            # instead of just weighted sum, we plan to use stacking
            procs = []
            reses = [None] * len(self.meta_learners)

            self.timer.begin('validation')

            recv_list, sent_list = [], []
            for i in range(self.total_exp):
                r, s = Pipe(True)
                r.send(True)
                recv_list.append(r)
                sent_list.append(s)

            pool = mp.Pool(self.total_exp)
            procs = pool.starmap_async(
                predict, [(self.meta_learners[i], recv_list[i],
                           self.eval_tasks, self.hp[i]['device'], {
                               'time_fired': time.time(),
                               'taskid': i
                           }) for i in range(self.total_exp)])

            # start sub thread to pass data
            def pass_meta_data(i):
                for _ in range(self.eval_tasks):
                    if sent_list[i].recv():
                        # LOGGER.info(i, 'fire data signal get')
                        sent_list[i].send(meta_valid_reservoir[i].get())
                        # LOGGER.info(i, 'data is sent')

            threads = [
                threading.Thread(target=pass_meta_data, args=(i, ))
                for i in range(self.total_exp)
            ]
            for t in threads:
                t.daemon = True
                t.start()

            for _ in range(self.eval_tasks - valid_ens_data_load_number):
                data_valid = next(meta_valid_generator)
                data_valid = process_task_batch(data_valid,
                                                device=torch.device('cpu'),
                                                with_origin_label=False)
                label_meta_valid.extend(data_valid[1][1].tolist())
                for dr in meta_valid_reservoir:
                    dr.put(
                        [data_valid[0][0], data_valid[0][1], data_valid[1][0]])
                # LOGGER.info('put data!')
            # LOGGER.info('all data done!')

            # now we can receive data
            for t in threads:
                t.join()
            reses = [sent_list[i].recv()['res'] for i in range(self.total_exp)]
            # every res in reses is a np.array of shape (eval_task * WAY * QUERY) * WAY
            ENS_VALID_TASK = 50
            ENS_VALID_ELEMENT = ENS_VALID_TASK * 5 * 19
            reses_test_list = [
                deepcopy(res[-ENS_VALID_ELEMENT:]) for res in reses
            ]

            self.timer.end('validation')
            LOGGER.info('valid data predict done',
                        self.timer.query_time_by_name('validation'))

            weight = [1.] * len(self.meta_learners)
            labels = np.array(label_meta_valid, dtype=np.int64)  # 19000
            acc_o = ((np.array(weight)[:, None, None] / sum(weight) *
                      np.array(reses)).sum(axis=0).argmax(
                          axis=1) == labels).mean()
            reses = np.array(reses, dtype=np.float64).transpose((1, 0, 2))
            reses_test = reses[-ENS_VALID_ELEMENT:].reshape(
                ENS_VALID_ELEMENT, -1)
            reses = reses[:-ENS_VALID_ELEMENT]
            reses = reses.reshape(len(reses), -1)
            labels_test = labels[-ENS_VALID_ELEMENT:]
            labels = labels[:-ENS_VALID_ELEMENT]
            LOGGER.info('voting result', acc_o)

            self.timer.begin('ensemble')

            # mp.set_start_method('fork', True)
            result = pool.map(
                ensemble_on_data,
                [
                    (GBMEnsembler(), reses, labels, 'gbm'),
                    (GLMEnsembler(), reses, labels, 'glm'),
                    (NBEnsembler(), reses, labels, 'nb'),
                    (RFEnsembler(), reses, labels, 'rf'
                     )  # too over-fit on simple dataset
                ])

            # test the ensemble model
            def acc(logit, label):
                return (logit.argmax(axis=1) == label).mean()

            res_test = [x[0]._predict(reses_test) for x in result]
            acc_test = [acc(r, labels_test) for r in res_test]
            acc_single_test = [
                acc(np.array(r), labels_test) for r in reses_test_list
            ]
            LOGGER.info('ensemble test', 'gbm', 'glm', 'nb', 'rf', acc_test)
            LOGGER.info('single test', acc_single_test)

            if max(acc_test) > max(acc_single_test):
                LOGGER.info("will use ensemble model")
                #idx_acc_max = np.argmax([x[1] for x in result])
                idx_acc_max = np.argmax(acc_test)
                self.timer.end('ensemble')
                print('best ensembler', ['gbm', 'glm', 'nb',
                                         'rf'][idx_acc_max], 'acc',
                      acc_test[idx_acc_max])
                print('ensemble done, time cost',
                      self.timer.query_time_by_name('ensemble'))

                # currently we use mean of output as ensemble
                return MyLearner(self.meta_learners,
                                 result[idx_acc_max][0],
                                 timers=self.timer)
            else:
                LOGGER.info("will use single model")
                idx_acc_max = np.argmax(acc_single_test)
                self.timer.end('ensemble')
                print('best single model id', idx_acc_max)
                print('ensemble done, time cost',
                      self.timer.query_time_by_name('ensemble'))

                # return only the best meta learners
                return MyLearner([self.meta_learners[idx_acc_max]], 0,
                                 self.timer)
        return MyLearner([self.meta_learners[self.ensemble]],
                         0,
                         timers=self.timer)
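
A minimal sketch of the ready-signal handshake used above, where one side primes the pipe with send(True) and the feeder only ships data after receiving that signal; a False closes the exchange. Roles are simplified relative to the example.

from multiprocessing import Pipe, Process


def consumer(conn, n_items):
    for _ in range(n_items):
        conn.send(True)       # signal "ready for the next item" (the handshake)
        item = conn.recv()    # block until the feeder answers with data
        print('consumer got', item)
    conn.send(False)          # tell the feeder to stop


def feeder(conn, items):
    while True:
        if not conn.recv():   # wait for the ready signal; False means shut down
            break
        conn.send(items.pop(0))


if __name__ == '__main__':
    consumer_end, feeder_end = Pipe(duplex=True)
    p = Process(target=consumer, args=(consumer_end, 3))
    p.start()
    feeder(feeder_end, ['task-0', 'task-1', 'task-2'])
    p.join()
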
Esempio n. 18
0
def main_read():
    log_start(level=logging.INFO)
    read_conn, write_conn = Pipe()
    daemon_process_run(read_conn=read_conn, write_conn=write_conn,
                       testcase=1)  # 0 for gaussian, 1 for PCIE
    canvas.show()
    app.run()


if __name__ == '__main__':
    import sys
    if sys.flags.interactive != 1:
        # main_read()
        log_start(level=logging.INFO)
        # multiprocessing.freeze_support()
        read_conn, write_conn = Pipe()
        daemon_process_run(read_conn=read_conn,
                           write_conn=write_conn,
                           lock=lock,
                           testcase=0)  # 0 for gaussian, 1 for PCIE
        wview.show()
        app.run()

# To see the save file: #####################
# from Binload import Binload
# bf = Binload()
# bf.load('PCIE.bin',file_format='float32')
# bf.plot(n=(0,10000),chNo=4)
#############################################