コード例 #1
0
ファイル: NNController.py プロジェクト: chameleon6/robotics
    def __init__(self, conf):
        self.profiler = Profiler()
        self.profiler.tic('total controller run time')

        self.matlab_state_file = os.getcwd() + '/matlab_state_file.out'
        self.python_action_file = os.getcwd() + '/python_action_file.out'
        self.last_no_matlab_net_path = 'last_no_matlab_net.out'

        conf = read_conf(conf)

        self.conf = conf
        self.max_torque = conf['max_torque']
        self.model_name = conf['name']
        self.bang_action = conf['bang_action']
        #self.old_net_update_time = conf['old_net_update_time']
        self.old_net_update_delay = conf['old_net_update_delay']
        if self.old_net_update_delay > 0:
            self.use_old_net = True
        else:
            self.use_old_net = False

        self.min_train_gap = conf['min_train_gap']
        self.train_per_iter = conf['train_per_iter']
        self.min_action_gap = conf['min_action_gap']
        self.final_epsilon = conf['final_epsilon']
        self.epsilon_anneal_time = conf['epsilon_anneal_time']
        self.no_op_time = conf['no_op_time']
        self.n_minibatch = conf['minibatch_size']
        self.n_batches = conf['num_batches']
        self.n_megabatch = self.n_minibatch * self.n_batches
        self.gamma = conf['gamma']
        self.sim_dt = conf['sim_dt']
        self.start_epsilon = conf['start_epsilon']
        self.mse_freq = conf['mse_freq']
        self.error_type = conf['error_type']
        self.RL_train = conf['RL_train']

        assert self.start_epsilon >= self.final_epsilon

        self.epsilon = self.start_epsilon
        #last_self.old_net_update_time = 0.0
        self.last_old_net_update_count = 0
        self.net_update_count = 0
        self.last_train_time = 0.0
        self.last_action_time = 0.0
        self.time_step = 1
        self.times_trained = 0
        self.last_state = None
        self.last_action = None
        self.succeeded = False
        self.train_t = -self.no_op_time
        #self.sim_start_time = self.train_t - 10
        self.sim_num = 0
        self.sim_t = 0

        self.sa_queue = None
        self.ys_queue = None
        self.minibatch_index = self.n_batches

        self.a_hists = []
        self.current_a_hist= []
        self.s_hists = []
        self.current_s_hist= []
        self.t_hists = []
        self.current_t_hist= []
        self.mse_hist = []
        self.mse_hist2 = []
        self.rmse_rel_hist = []
        self.rmse_rel_hist2 = []
        self.learn_rate_hist = []
        self.certainty_hist = []
        self.correct_rate_hist = []
        self.q_mean_hist = []
        self.global_correct_rates = []
        self.global_mse_hist = []

        self.action_counts = np.array([1,1,1,1.])

        self.transitions = TransitionContainer(conf['max_history_len'])

        if 'ref_transitions_file' in conf:
            with open(conf['ref_transitions_file'], 'rb') as f:
                self.all_ref_transitions = pickle.load(f)
            np.random.shuffle(self.all_ref_transitions)
            self.ref_transitions = self.all_ref_transitions[:400]

        if 'transitions_seed' in conf:
            self.transitions.container = pickle.load(open(conf['transitions_seed'], 'rb'))

        action_conf = conf.copy()
        action_conf['n_a'] = 0
        action_conf['n_q'] = 6
        action_conf['learn_rate'] = 0.01
        if 'action_net_file' in conf:
            print 'loading action net'
            self.action_net = ControlNN(action_conf, load_path=conf['action_net_file'])
        else:
            self.action_net = ControlNN(action_conf, load_path=None)

        '''
        sas_conf = conf.copy()
        sas_conf['n_s'] = conf['n_s'] + conf['n_a']
        sas_conf['n_q'] = conf['n_s']
        sas_conf['n_a'] = 0
        #sas_conf['n_hidden'] = 200
        sas_conf['one_layer_only'] = 0
        sas_conf['initial_learn_rate'] = 0.0001
        if 'sas_net_file' in conf:
            print 'loading sas net'
            self.sas_net = ControlNN(sas_conf, load_path=conf['sas_net_file'])
        else:
            self.sas_net = ControlNN(sas_conf, load_path=None)

        certainty_conf = conf.copy()
        certainty_conf['n_q'] = 1
        certainty_conf['n_a'] = 0
        if 'certainty_net_file' in conf:
            print 'loading certainty net'
            self.certainty_net = ControlNN(certainty_conf, load_path=conf['certainty_net_file'])
        else:
            self.certainty_net = ControlNN(certainty_conf, load_path=None)

        '''

        if 'q_net_file' in conf:
            self.load_path = conf['q_net_file']
        else:
            self.load_path = None

        self.initialize_nets()
        self.transfer_path = 'tmp/model_transfer'

        rand_int = int(time.time() * 10000000) % 100000000
        self.save_path = 'models/model_%s.out' % rand_int
        self.t_path = 'transitions/transitions_%s.out' % rand_int
        self.log_path = self.model_name + '.log'
        logging.basicConfig(filename=self.log_path,level=logging.INFO)
        logging.info('log for %s', self.model_name)
        print 'saving to', self.save_path, 'logging to', self.log_path

        self.b_dim = b_dim = 2
        self.d_dim = d_dim = 1
        self.b_params = []
        self.d_params = []
        self.r_by_params = []
        self.cur_b = np.ones(self.b_dim)
        self.cur_d = np.ones(self.d_dim)
        self.cur_r = 0
コード例 #2
0
ファイル: NNController.py プロジェクト: chameleon6/robotics
class NNController:

    def __init__(self, conf):
        self.profiler = Profiler()
        self.profiler.tic('total controller run time')

        self.matlab_state_file = os.getcwd() + '/matlab_state_file.out'
        self.python_action_file = os.getcwd() + '/python_action_file.out'
        self.last_no_matlab_net_path = 'last_no_matlab_net.out'

        conf = read_conf(conf)

        self.conf = conf
        self.max_torque = conf['max_torque']
        self.model_name = conf['name']
        self.bang_action = conf['bang_action']
        #self.old_net_update_time = conf['old_net_update_time']
        self.old_net_update_delay = conf['old_net_update_delay']
        if self.old_net_update_delay > 0:
            self.use_old_net = True
        else:
            self.use_old_net = False

        self.min_train_gap = conf['min_train_gap']
        self.train_per_iter = conf['train_per_iter']
        self.min_action_gap = conf['min_action_gap']
        self.final_epsilon = conf['final_epsilon']
        self.epsilon_anneal_time = conf['epsilon_anneal_time']
        self.no_op_time = conf['no_op_time']
        self.n_minibatch = conf['minibatch_size']
        self.n_batches = conf['num_batches']
        self.n_megabatch = self.n_minibatch * self.n_batches
        self.gamma = conf['gamma']
        self.sim_dt = conf['sim_dt']
        self.start_epsilon = conf['start_epsilon']
        self.mse_freq = conf['mse_freq']
        self.error_type = conf['error_type']
        self.RL_train = conf['RL_train']

        assert self.start_epsilon >= self.final_epsilon

        self.epsilon = self.start_epsilon
        #last_self.old_net_update_time = 0.0
        self.last_old_net_update_count = 0
        self.net_update_count = 0
        self.last_train_time = 0.0
        self.last_action_time = 0.0
        self.time_step = 1
        self.times_trained = 0
        self.last_state = None
        self.last_action = None
        self.succeeded = False
        self.train_t = -self.no_op_time
        #self.sim_start_time = self.train_t - 10
        self.sim_num = 0
        self.sim_t = 0

        self.sa_queue = None
        self.ys_queue = None
        self.minibatch_index = self.n_batches

        self.a_hists = []
        self.current_a_hist= []
        self.s_hists = []
        self.current_s_hist= []
        self.t_hists = []
        self.current_t_hist= []
        self.mse_hist = []
        self.mse_hist2 = []
        self.rmse_rel_hist = []
        self.rmse_rel_hist2 = []
        self.learn_rate_hist = []
        self.certainty_hist = []
        self.correct_rate_hist = []
        self.q_mean_hist = []
        self.global_correct_rates = []
        self.global_mse_hist = []

        self.action_counts = np.array([1,1,1,1.])

        self.transitions = TransitionContainer(conf['max_history_len'])

        if 'ref_transitions_file' in conf:
            with open(conf['ref_transitions_file'], 'rb') as f:
                self.all_ref_transitions = pickle.load(f)
            np.random.shuffle(self.all_ref_transitions)
            self.ref_transitions = self.all_ref_transitions[:400]

        if 'transitions_seed' in conf:
            self.transitions.container = pickle.load(open(conf['transitions_seed'], 'rb'))

        action_conf = conf.copy()
        action_conf['n_a'] = 0
        action_conf['n_q'] = 6
        action_conf['learn_rate'] = 0.01
        if 'action_net_file' in conf:
            print 'loading action net'
            self.action_net = ControlNN(action_conf, load_path=conf['action_net_file'])
        else:
            self.action_net = ControlNN(action_conf, load_path=None)

        '''
        sas_conf = conf.copy()
        sas_conf['n_s'] = conf['n_s'] + conf['n_a']
        sas_conf['n_q'] = conf['n_s']
        sas_conf['n_a'] = 0
        #sas_conf['n_hidden'] = 200
        sas_conf['one_layer_only'] = 0
        sas_conf['initial_learn_rate'] = 0.0001
        if 'sas_net_file' in conf:
            print 'loading sas net'
            self.sas_net = ControlNN(sas_conf, load_path=conf['sas_net_file'])
        else:
            self.sas_net = ControlNN(sas_conf, load_path=None)

        certainty_conf = conf.copy()
        certainty_conf['n_q'] = 1
        certainty_conf['n_a'] = 0
        if 'certainty_net_file' in conf:
            print 'loading certainty net'
            self.certainty_net = ControlNN(certainty_conf, load_path=conf['certainty_net_file'])
        else:
            self.certainty_net = ControlNN(certainty_conf, load_path=None)

        '''

        if 'q_net_file' in conf:
            self.load_path = conf['q_net_file']
        else:
            self.load_path = None

        self.initialize_nets()
        self.transfer_path = 'tmp/model_transfer'

        rand_int = int(time.time() * 10000000) % 100000000
        self.save_path = 'models/model_%s.out' % rand_int
        self.t_path = 'transitions/transitions_%s.out' % rand_int
        self.log_path = self.model_name + '.log'
        logging.basicConfig(filename=self.log_path,level=logging.INFO)
        logging.info('log for %s', self.model_name)
        print 'saving to', self.save_path, 'logging to', self.log_path

        self.b_dim = b_dim = 2
        self.d_dim = d_dim = 1
        self.b_params = []
        self.d_params = []
        self.r_by_params = []
        self.cur_b = np.ones(self.b_dim)
        self.cur_d = np.ones(self.d_dim)
        self.cur_r = 0

    def do_gp(self):
        X = np.concatenate((np.array(self.b_params), np.array(self.d_params)),1)
        gp = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
        gp.fit(X, self.r_by_params)
        sl = slice(-5, 5, 0.1)
        X_test = np.concatenate([x.flatten()[:,np.newaxis] for x in np.mgrid[sl, sl, sl]], 1)
        prof.tic('pred')
        ys, sigmas = gp.predict(X_test, eval_MSE=True)
        prof.toc('pred')
        y_max = max(self.r_by_params)
        us = (ys - y_max) / sigmas
        eis = sigmas * (us * norm.cdf(us) + norm.pdf(us))
        best_params = X_test[np.argmax(eis)]
        print 'best params', best_params
        return best_params[:2], best_params[2:]

    def b_transform(self, s):
        assert s.shape == (2,)
        return self.cur_b * s

    def d_transform(self, a):
        return np.maximum(-2., np.minimum(2., self.cur_d * a))

    def state_noise(self, s):
        return s * 2

    def initialize_nets(self):
        self.current_net = ControlNN(conf=self.conf, load_path=self.load_path)
        if self.use_old_net:
            self.old_net = ControlNN(conf=self.conf, load_path=self.load_path)
        else:
            self.old_net = self.current_net

    def should_update_old_net(self):
        return self.times_trained - self.last_old_net_update_count >= self.old_net_update_delay

    def will_update_old_net_next(self):
        return self.times_trained - self.last_old_net_update_count == self.old_net_update_delay-1

    def maybe_update_old_net(self):
        if not self.use_old_net:
            return

        if self.should_update_old_net():
            #self.profiler.tic('net transfer')
            self.current_net.save_model(self.transfer_path)
            self.old_net.load_model(self.transfer_path)
            #last_self.old_net_update_time = self.train_t
            self.last_old_net_update_count = self.times_trained
            self.net_update_count += 1
            #self.profiler.toc('net transfer')

    def test_mse(self, ts):
        #test_sa, test_ys = self.parse_transitions(self.ref_transitions)
        s, a, ys = self.parse_transitions(ts)
        #mse = self.current_net.mse_q(s, a, ys)
        qs = self.current_net.q_from_sa_discrete(s, a)
        #print qs
        #print a
        #print self.current_net.q_from_s_discrete(s)

        if self.error_type == 0:
            mse = np.sqrt(np.mean((ys - qs)**2))
        elif self.error_type == 1:
            mse = np.mean(np.abs(ys-qs))
        return mse, qs, ys

    def train_once(self, compute_mse=True):
        #self.profiler.tic('sa_and_ys_time')
        #sa, ys = self.get_sa_and_ys()
        ts = self.transitions.random_sample(self.n_minibatch)
        s, a, ys = self.parse_transitions(ts)
        #self.profiler.toc('sa_and_ys_time')

        if compute_mse:
            #self.profiler.tic('test_mse_time1')
            mse1, qs, test_ys = self.test_mse(self.ref_transitions)
            test_y_norm = np.linalg.norm(test_ys)
            rmse_rel1 = np.sqrt(mse1) / test_y_norm
            #print qs
            #print test_ys
            #print qs-test_ys
            print 'mse:', mse1 #, mse2
            print 'test_y_mean', np.mean(test_ys)
            #print 'rmse_rel:', rmse_rel1
            self.mse_hist.append(mse1)
            self.rmse_rel_hist.append(rmse_rel1)
            #self.profiler.toc('test_mse_time1')

        #self.profiler.tic('train_time')
        #self.current_net.train(sa, ys)
        self.current_net.train_discrete(s, a, ys)
        #self.profiler.toc('train_time')


        # self.profiler.tic('test_mse_time2')
        # mse2, qs, test_ys = test_mse()
        # self.profiler.toc('test_mse_time2')
        # rmse_rel2 = np.sqrt(mse2) / test_y_norm
        # self.mse_hist2.append(mse2)
        # self.rmse_rel_hist2.append(rmse_rel1)

        self.times_trained += 1
        self.epsilon = max(self.final_epsilon, self.start_epsilon - (self.start_epsilon-self.final_epsilon) / self.epsilon_anneal_time * self.train_t)
        self.last_train_time = self.train_t

    def get_sa_and_ys(self):
        self.profiler.tic('max_time_for_sa_and_ys')
        if self.minibatch_index == self.n_batches:
            ts = self.transitions.random_sample(self.n_megabatch)
            self.sa_queue, self.ys_queue = self.parse_transitions(ts)
            self.minibatch_index = 0
        self.profiler.toc('max_time_for_sa_and_ys')

        start_ind = self.minibatch_index * self.n_minibatch
        inds = slice(start_ind, start_ind + self.n_minibatch, 1)
        print 'using slice', inds
        self.minibatch_index += 1
        return self.sa_queue[inds], self.ys_queue[inds]

    def parse_transitions(self, ts):
        rs = np.array([t[2] for t in ts])[:, np.newaxis]
        terms = np.array([t[3] for t in ts])
        new_states = np.array([t[4] for t in ts])
        #best_q = self.old_net.get_best_a_p(new_states, is_p=True, num_tries=1)[1]
        all_q = self.old_net.q_from_s_discrete(new_states)[:,np.newaxis]
        best_q = self.old_net.get_best_q_discrete(new_states)[:,np.newaxis]
        #print 'rs', rs.shape, rs
        #print 'best_q', best_q.shape, best_q
        # TODO: fix for non-binary rs
        ys = rs.copy()
        for i in range(len(rs)):
            if not terms[i]:
                ys[i] += self.gamma * best_q[i]
            #else:
            #    print 'rs', rs
            #    print 'best_q', best_q
            #    print 'ys', ys
        # print 'rs', rs
        # print 'best_q', best_q
        # print 'all_q', all_q

        #sa = np.array([np.append(t[0], t[1]) for t in ts])
        s = np.array([t[0] for t in ts])
        a = [t[1] for t in ts]
        return s, a, ys

    def random_action(self):
        a = np.random.uniform(-self.max_torque,self.max_torque)
        print 'random_action', a
        return a

    def random_action_discrete(self):
        #p = 1.0 / np.sqrt(self.action_counts)
        #return np.random.choice(range(self.current_net.n_q), p=p/np.sum(p))
        return np.random.choice(range(self.current_net.n_q))

    def plot_sim(self, ind):
        aa = np.array(self.a_hists[ind])
        ss = np.array(self.s_hists[ind])
        plt.plot(ss)
        plt.plot(aa/self.max_torque)
        plt.show()

    def dump_transitions(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.transitions.container, f)

    def run_sas_train(self):
        self.run_net_train(self.sas_net, 'sas_train_data.p')

    def run_certainty_train(self):
        self.run_net_train(self.certainty_net, 'certainty_train_data.p')

    def run_action_train(self, xs, us):
        #self.run_net_train(self.action_net, 'action_train_data.p')
        xs = self.standardize_state(xs)
        us = self.standardize_action(us)
        self.run_net_train(self.action_net, xs, us)

    #def run_net_train(self, net, train_data_file):
    def run_net_train(self, net, xs, us):
        #self.profiler.tic('data load')
        #xs, us = pickle.load(open(train_data_file, 'rb'))
        #self.profiler.toc('data load')

        n_test = len(xs) / 10

        inds = np.array(range(len(xs)))
        np.random.shuffle(inds)
        xs = xs[inds]
        us = us[inds]
        #test_inds = np.random.choice(range(n_train), n_test, False)
        x_test, u_test = xs[:n_test], us[:n_test]
        xs, us = xs[n_test:], us[n_test:]
        assert len(xs) == len(us)

        # ms_x = mu_std(xs)
        # ms_u = mu_std(us)

        # net.set_standardizer(ms_x, ms_u)

        # xs = standardize(xs, ms_x)
        # x_test = standardize(x_test, ms_x)
        # us = standardize(us, ms_u)
        # u_test = standardize(u_test, ms_u)

        n_train = len(xs)
        n_batch = self.n_minibatch
        # print n_train

        # print ms_x, ms_u
        for ep in range(1000):
            self.profiler.tic('epoch')
            print 'epoch', ep
            mse = net.mse_q(x_test, u_test)
            self.mse_hist.append(mse)
            delta = net.learn_delta_q(x_test, u_test)
            #print 'test_us', u_test
            #print 'delta', delta, delta.shape
            #real_mse = np.mean(delta * (ms_u[1]**2))
            print 'mse', mse, '=', np.mean(delta), 'learn_rate', net.get_learn_rate()
            for j in range(n_train/n_batch):
                s = slice(j * n_batch, (j+1)*n_batch, 1)
                net.train(xs[s], us[s])
            self.profiler.toc('epoch')

    def run_dp_train(self):

        sa_costs = None
        with open('../Pendulum/dp_sa_cost.p', 'rb') as f:
            sa_costs = pickle.load(f).flatten()

        sa_samples = []
        a_grid = np.linspace(-2, 2, 9)
        s1_grid = np.linspace(0, 2*np.pi, 51)
        s2_grid = np.linspace(-10, 10, 51)

        for i, c in enumerate(sa_costs):
            a = i / (51 * 51)
            ss = i % (51 * 51)
            s2, s1 = ss / 51, ss % 51
            a = a_grid[a]
            s1 = s1_grid[s1]
            s2 = s2_grid[s2]
            sa_samples.append([s1, s2, a, c])

        sa_samples = np.array(sa_samples)
        n = len(sa_samples)
        for i in range(0):
            sample_size = n if i%100==0 else 50
            inds = np.random.choice(range(n), sample_size, False)
            #t = np.array(random.sample(sa_samples, sample_size))
            t = sa_samples[inds]

            if i%100 == 0:
                print 'iter', i
                mse = self.current_net.mse_q(t[:,0:3], -t[:,3][:,np.newaxis])
                learn_rate = self.current_net.get_learn_rate()
                print 'mse', mse, 'learn_rate', learn_rate
                self.mse_hist.append(mse)
                self.learn_rate_hist.append(learn_rate)
            else:
                self.current_net.train(t[:,0:3], -t[:,3][:,np.newaxis])

        #plt.plot(mses); plt.show()
        vis = NetVisualizer(self.current_net)
        vis.q_heat_map()

    def load_action_files(self, files):
        for fname in files:
            lines = open(fname, 'r').readlines()
            lines = map(lambda l: map(float, l.strip().split(' ')), lines)
            xs = np.array(lines[::3])
            us = np.array(lines[1::3])
            us_orig = np.array(lines[2::3])
            return xs, us, us_orig

    def load_action_files_old(self, files):
        for fname in files:
            lines = open(fname, 'r').readlines()
            lines = map(lambda l: map(float, l.strip().split(' ')), lines)
            xs = np.array(lines[::2])
            us = np.array(lines[1::2])
            return xs, us

    def load_simbicon_transitions(self, files):
        t = []
        for fname in files:
            lines = open(fname, 'r').readlines()
            lines = map(lambda l: map(float, l.strip().split(' ')), lines)
            r = np.array(lines[::3])
            s = np.array(lines[1::3])
            a = np.array(lines[2::3])
            r = np.array(map(float, r))
            a = np.array(map(int, a))
            t.extend([(s[i], a[i], r[i+1], False, s[i+1]) for i in range(len(s)-1)])
        return t

    def reflect_transition(self, t):
        return (reflect_state(t[0]), (t[1]+2) % 4, t[2], t[3], reflect_state(t[4]))

    def run_no_matlab(self):

        def count_dist(a):
            a = list(a)
            return [a.count(i) for i in range(4)]

        self.profiler.tic('no_matlab')
        if len(c.transitions.container) < 100:
            return

        mse_hist = []
        correct_rates = []
        np.random.shuffle(self.transitions.container)
        s = np.array([i[0] for i in self.transitions.container])

        for i in range(1500):
            if i % 100 == 0:
                self.profiler.tic('qs time')
                qs = self.current_net.q_from_s_discrete(s)
                self.profiler.toc('qs time')
                print qs[:20]
                print 'iteration', i
                self.profiler.tic('eval time')
                preds, actuals, rate = self.evaluate_simbicon(self.good_simbicon_transitions)
                self.profiler.toc('eval time')
                print 'actual_dist', count_dist(actuals)
                print 'pred_dist', count_dist(preds)
                print 'correct_rate', rate
                self.q_mean_hist.append(np.mean(qs))
                #if rate > 0.93:
                #    print 'good rate'
                #    break

            self.train_once(i % 100 == 0)
            if self.will_update_old_net_next() or i == 0:
                print 'testing mse, iter', i
                mse, qs, test_ys = self.test_mse(self.ref_transitions)
                mse_hist.append(mse)
                correct_rates.append(rate)
                self.correct_rate_hist.append(rate)
                self.mse_hist.append(mse)
                print 'mse_hist:', self.mse_hist
                if len(mse_hist) > 1 and mse_hist[-1] > mse_hist[-2]:
                    print 'mse_hist', mse_hist
                    print 'correct_rates', correct_rates
                    print 'loading best model'
                    self.current_net.load_model(self.last_no_matlab_net_path)
                    return
                self.current_net.save_model(self.last_no_matlab_net_path)
            self.maybe_update_old_net()
        self.profiler.toc('no_matlab')

    def whole_second_frac(self, n):
        return np.abs(self.sim_t*n - np.round(self.sim_t*n)) < 0.0001

    def run_matlab(self, algo):

        print 'ready for matlab'

        while True:
            start_time = time.time()
            #self.profiler.tic('wait')
            while not os.path.isfile(self.matlab_state_file):
                if time.time() - start_time > 300:
                    print "timeout"
                    #self.current_net.save_model(self.save_path)
                    self.profiler.toc('total controller run time')
                    self.profiler.print_time_stats()
                    sys.exit()
                pass

            f = open(self.matlab_state_file, 'r')
            lines = f.readlines()
            f.close()
            if len(lines) < 4: # file isn't fully written yet
                continue

            #self.profiler.toc('wait')

            reward_str = lines[0].rstrip()
            reward = float(reward_str)
            self.cur_r += reward
            # if reward > 0 and not self.succeeded and sim_t >= 0.05:
            #     logging.info('succeeded, reward=%s, sim_t=%s, state=%s', reward, sim_t, state)
            #     self.succeeded = True

            #if reward == 0.0:
            #    print 'sim already failed at time', self.sim_t

            term_str = lines[1].rstrip()
            term = int(term_str) > 0

            state_strs = lines[2].rstrip().split(' ')
            state = np.array(map(float, state_strs))
            if algo != 'pend_dp':
                state = self.standardize_state(state)

            last_sim_t = self.sim_t
            self.sim_t = float(lines[3].rstrip())

            if self.sim_t < last_sim_t:
                self.last_state = None
                self.sim_num += 1
                print 'new sim', self.sim_num, 'self.train_t', self.train_t
                #self.sim_start_time = self.train_t
                self.a_hists.append(self.current_a_hist[:])
                self.s_hists.append(self.current_s_hist[:])
                self.t_hists.append(self.current_t_hist[:])
                self.current_a_hist = []
                self.current_s_hist = []
                self.current_t_hist = []
                self.b_params.append(self.cur_b)
                self.d_params.append(self.cur_d)
                self.r_by_params.append(self.cur_r)
                self.cur_r = 0
                if len(self.b_params) > 1:
                    self.cur_b, self.cur_d = self.do_gp()
                else:
                    self.cur_b, self.cur_d = np.random.randn(self.b_dim), \
                            np.random.randn(self.d_dim)


            #if reward != 0.0 and self.whole_second_frac(10):
            if reward == 10.0:
                print 'received reward', reward, 'at time', self.sim_t #, 'x', state[0]

            #certainty = self.certainty_net.q_from_sa(state.reshape((1,-1)))[0][0]
            self.train_t += self.sim_dt

            #print 'sim_t', sim_t, 'state', state, 'certainty', certainty
            #self.certainty_hist.append(certainty)

            os.remove(self.matlab_state_file)
            if algo == 'RL':
                action = self.RL_train_and_action(state, reward, term)
                self.action_counts[action] += 1;
                write_str = '%s\n' % action
            elif algo == 'action_net':
                action = self.action_net_main(state)
                write_str = ' '.join(map(str, action)) + '\n'
            elif algo == 'pend_dp':
                state = np.array(state)
                state = self.state_noise(state)
                state = self.b_transform(state)
                output = self.current_net.manual_max_a_p(state[np.newaxis,:])
                action = output[0,0]
                action = self.d_transform(action)[0]
                write_str = '%s\n' % action
                print 'input', state
                print 'output', output
                print 'action', write_str
            else:
                raise ValueError('Not a valid algo')


            #print 'action', action
            f = open(self.python_action_file, 'w')
            #f.write('%s\n' % ' '.join(map(str, action)))
            f.write(write_str)
            f.close()

            if term:
                print 'sim failed at time', self.sim_t
                #self.initialize_nets()
                #self.run_no_matlab()

    def action_net_main(self, state):
        net_out = self.action_net.q_from_sa(state.reshape((1,-1)))[0]
        return self.unstandardize_action(net_out)

    def correct_rate(self, pred, actual):
        count = 0
        for x,y in zip(pred, actual):
            if x == y:
                count += 1
        return count/float(len(pred))

    def evaluate_simbicon(self, t):
        actual = [i[1] for i in t]
        pred = [self.current_net.get_best_a_discrete(i[0]) for i in t]
        mid = len(pred)/2
        # print 'first half', self.correct_rate(pred[:mid], actual[:mid])
        # print 'second half', self.correct_rate(pred[mid:], actual[mid:])
        return np.array(pred), np.array(actual), self.correct_rate(pred, actual)

    def standardize_action(self, u):
        return (u - self.u_mean) / self.u_std

    def unstandardize_action(self, u):
        return u * self.u_std + self.u_mean

    def standardize_state(self, s):
        return (s - self.s_mean) / self.s_std

    def unstandardize_state(self, s):
        return s * self.s_std + self.s_mean

    def standardize_transition(self, t):
        t_new = list(t)
        t_new[0] = self.standardize_state(t[0])
        t_new[-1] = self.standardize_state(t[-1])
        return t_new

    def unstandardize_transition(self, t):
        t_new = list(t)
        t_new[0] = self.unstandardize_state(t[0])
        t_new[-1] = self.unstandardize_state(t[-1])
        return t_new

    def truncate_transitions(self, t):
        return [(i[0][:-4], i[1], i[2], i[3], i[4][:-4]) for i in t]

    def summary_stats(self, t):
        t = np.array(t)
        return 'min: %f, max: %f, mean: %f, std: %f' % \
                (np.min(t), np.max(t), np.mean(t), np.std(t))

    def print_reward_summary(self):
        print self.summary_stats([x[2] for x in self.transitions.container])

    def change_rewards_unstandardized(self, t):
        t_new = t[:]
        old_rewards = []
        new_rewards = []
        for i in t_new:
            old_rewards.append(i[2])
            #i[2] = matlab_reward(i[-1])
            if i[2] == 10:
                i[2] = 2
            new_rewards.append(i[2])

        print 'old rewards', self.summary_stats(old_rewards)
        print 'new rewards', self.summary_stats(new_rewards)
        return t_new

    def change_rewards_standardized(self, t):
        t_unstandardized = [self.unstandardize_transition(i) for i in t]
        new_unstandardized = self.change_rewards_unstandardized(t_unstandardized)
        return [self.standardize_transition(i) for i in new_unstandardized]

    def transition_ratio(self):
        good_t = [x for x in self.transitions.container if x[2] > 0]
        return float(len(good_t))/len(self.transitions.container)

    def transition_summary(self):
        def t_to_tuple(t):
            def f(x):
                y = x
                return tuple(np.round(y))
            return (f(t[0]), t[1], t[2], t[3], f(t[4]))

        t = [t_to_tuple(x) for x in self.transitions.container]
        a = {}
        for i in t:
            if i not in a:
                a[i] = 0
            a[i] += 1

        return a, [x for x in a.items() if x[1] > 1]

    def duplicate_positive_rewards(self, t, ratio):
        good_t = [x for x in t if x[2] > 0]
        d = ratio * len(t) / (1-ratio) / len(good_t)
        print 'total len', len(t)
        print 'num good', len(good_t)
        print 'duplicating', d, 'times'
        ret_t = t[:]
        for _ in range(int(d)):
            ret_t.extend(good_t)
        return ret_t

    def set_standardizer(self, s):
        self.s_std = np.std(s, 0)
        self.s_mean = np.mean(s, 0)

    def set_u_standardizer(self, u):
        self.u_std = np.std(u, 0)
        self.u_mean = np.mean(u, 0)

    def RL_train_and_action(self, state, reward, term):

        #if self.time_step % 10 == 0:
        #self.profiler.toc('RL train and action')
        #self.profiler.tic('RL train and action')
        if self.whole_second_frac(10):
            print 'iter', self.time_step, 'sim_num', self.sim_num, \
                    'num transitions', len(self.transitions.container), \
                    'transition ratio', self.transition_ratio(), \
                    'train_t', self.train_t, 'sim_t', self.sim_t, \
                    'epsilon', self.epsilon, 'times_trained', self.times_trained, \
                    'net_update_count', self.net_update_count, \
                    'learn_rate', self.current_net.get_learn_rate()

        #self.profiler.tic('nn_cycle')
        if self.last_state != None:
            transition = (self.last_state, self.last_action, reward, term, state)
            self.transitions.append(transition)
            self.current_a_hist.append(self.last_action)
            self.current_s_hist.append(self.last_state)
            self.current_t_hist.append(transition)

        self.maybe_update_old_net()
        action = None
        if self.train_t < 0.0 or self.last_action == None:
            action = self.random_action_discrete()
        elif self.train_t - self.last_action_time > self.min_action_gap:
            self.last_action_time = self.train_t
            if np.random.random() > self.epsilon:
                action = self.current_net.get_best_a_discrete(state)

                # qs = c.current_net.q_from_s_discrete(state[np.newaxis,:])[0]
                # poss_next_action = (self.last_action + 1) % 4
                # if qs[poss_next_action] > qs[self.last_action]:
                #     action = poss_next_action
                # else:
                #     action = self.last_action

            else:
                #action = self.random_action()
                action = self.random_action_discrete()
        else:
            action = self.last_action

        # #print 'action', action
        # if self.bang_action == 1:
        #     if action > 0.0:
        #         action = self.max_torque
        #     else:
        #         action = -self.max_torque

        #self.profiler.tic('nn_cycle1')
        if self.train_t > 0.0 and self.train_t - self.last_train_time > self.min_train_gap \
                and len(self.transitions.container) > self.n_minibatch \
                and self.RL_train:
            #self.train_once(self.times_trained % self.mse_freq == 0)
            for i in range(self.train_per_iter):
                self.train_once(False)

        #self.profiler.toc('nn_cycle1')

        self.time_step += 1
        self.last_state = state
        self.last_action = action
        #self.profiler.toc('nn_cycle')
        if self.whole_second_frac(10):
            print 'action taken:', action
        return action