Example #1
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.95
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPSILON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 10.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore
Example #2
    def __init__(self, memory, env):
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        self.GAMMA = 0.95
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.
        self.plotter = Plotter(folder='DQN/plot/cartpole_simple/singleP')
        #LEARNING_RATE = 0.00025
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.EPS_START = 1.
        self.EPS_END = 0.05
        self.EPS_DECAY = 200  # larger EPS_DECAY: slower decay
        self.MEMORY_SIZE = 5000
        self.steps_done = 0
        self.net = DQN()  # Deep Net
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        #self.net.setOptimizer(optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE,
        #                                        momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
        #                                        eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.env = env
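
Editor's note: several of these examples define MOMENTUM, SQUARED_MOMENTUM, and MIN_SQUARED_GRAD but use them only in a commented-out RMSprop call, so with Adam they are dead constants. A minimal sketch of the RMSprop variant those constants map onto (nn.Linear stands in for the examples' DQN, which is not shown here):

import torch.nn as nn
import torch.optim as optim

net = nn.Linear(4, 2)     # stand-in for the examples' DQN()

LEARNING_RATE = 1e-3
MOMENTUM = 0.95           # RMSprop momentum
SQUARED_MOMENTUM = 0.95   # RMSprop alpha: smoothing of the squared-gradient average
MIN_SQUARED_GRAD = 0.01   # RMSprop eps: floor added to the denominator

optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE,
                          momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD)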
Example #3
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        self.GAMMA = 0.95
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.

        #LEARNING_RATE = 0.00025
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        #self.net.setOptimizer(optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE,
        #                                        momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
        #                                        eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore
Example #4
class Evaluator:
    def __init__(self, memory, env):
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        self.GAMMA = 0.95
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.
        self.plotter = Plotter(folder='DQN/plot/cartpole_simple/singleP')
        #LEARNING_RATE = 0.00025
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.EPS_START = 1.
        self.EPS_END = 0.05
        self.EPS_DECAY = 200  # larger EPS_DECAY: slower decay
        self.MEMORY_SIZE = 5000
        self.steps_done = 0
        self.net = DQN()  # Deep Net
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        #self.net.setOptimizer(optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE,
        #                                        momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
        #                                        eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.env = env

    def behavior_policy(self, state):
        # epsilon-greedy: epsilon decays with steps_done, reducing randomness over time
        # returns an action index (greedy from the net, or sampled from the env)
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
                        math.exp(-1. * self.steps_done / self.EPS_DECAY)
        if sample > eps_threshold:
            return self.policy(state)
        else:
            return self.env.action_space.sample()

    def policy(self, state):
        # greedy action w.r.t. the online net's Q-values
        res = self.net(Variable(state, volatile=True).type(FloatTensor)).data
        return res.max(1)[1][0]
        #return res.max(1)[1].view(1,1)
        #return self.net(Variable(state, volatile=True).type(FloatTensor))\
        #                .data.max(1)[1].view(1,1)

    def minibatch(self, exp_replay, pretrain=False):
        batch = random.sample(list(exp_replay), self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = Variable(torch.from_numpy(np.array(
            unzipped[0])).type(FloatTensor),
                               volatile=True)
        # previously there was no type conversion here, which caused an error
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
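            # Bellman target: r + GAMMA * max_a Q(s', a); term_batch stores
            # 1 - done, so terminal transitions contribute no bootstrap term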
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])).type(FloatTensor),
                                        volatile=True)
            # previously there was no type conversion here, which caused an error
            next_state_values = self.net(next_state_batch).max(1)[0].unsqueeze(
                1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False

        return state_batch, action_batch, target_batch

    def run(self):
        i = 0
        while True:
            i = i + 1
            time_t = 0
            while True:
                time_t += 1
                state = self.env.get_state()
                state_torch = torch.from_numpy(state).type(
                    FloatTensor)  # convert to a torch tensor
                state_torch = state_torch.unsqueeze(0).type(FloatTensor)
                action = self.behavior_policy(state_torch)
                next_state, r, done = self.env.step(action)  # 0.03s
                if len(self.memory) == self.MEMORY_SIZE:
                    self.memory.pop(0)
                self.memory.append((state, [action], [r],
                                    next_state, [1 - done]))
                if len(self.memory) < 1000:
                    i = 0
                    if done:
                        break
                    else:
                        continue
                #batch_tuple = self.minibatch(self.memory)
                #loss = self.net.optimize(batch_tuple)

                if done:
                    self.steps_done += 1
                    print('episode: {}, score: {}'.format(i, time_t))
                    self.plotter.plot_train_rewards(time_t)
                    break
            if len(self.memory) >= 1000:
                batch_tuple = self.minibatch(self.memory)
                loss = self.net.optimize(batch_tuple)
        self.plotter.terminate()  # note: unreachable as written; the outer loop never breaks
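
A note on the schedule in behavior_policy above: epsilon anneals exponentially from EPS_START toward EPS_END with time constant EPS_DECAY. A self-contained sketch of the same schedule, using the constants from Example #4:

import math

EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 200

def eps_threshold(steps_done):
    # exponential decay toward EPS_END; larger EPS_DECAY means slower decay
    return EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)

for t in (0, 200, 1000):
    print(t, round(eps_threshold(t), 3))  # 1.0, 0.399, 0.056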
Example #5
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.95
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPSILON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 10.0
        self.SAMPLE_Q = 1.0

        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(
            unzipped[0])).type(FloatTensor),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])).type(FloatTensor),
                                        volatile=True)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(
            1, 0) - 2 * inner_product
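        # D[i,j] = ||f_i||^2 + ||f_j||^2 - 2 <f_i, f_j> = ||f_i - f_j||^2,
        # the squared Euclidean distance between state feature vectors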

        # calculate Q-value distance matrix
        # Here use target value to calculate
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Mask[i,j] = (D_f[i,j] <= SAMPLE_S) AND (D_Q[i,j] <= SAMPLE_Q)
        #             AND (action[i] == action[j])
        # only consider pairs with the same action
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <= self.SAMPLE_S) & \
               (Q_dist_matrix.data <= self.SAMPLE_Q) & Action_Mask.data
        Cluster = []
        #print('mask')
        counter = 0
        while True:
            # clustering by VERTEX-COVER-ALL-VERTEX: repeatedly pick the vertex with the largest degree
            #print('counter = {}' . format(counter))
            counter += 1

            Number = Mask.sum(dim=1)
            value, indx = Number.max(dim=0)
            #print('indx= {}' . format(indx))
            if value[0] == 0:
                # already empty
                break
            v = Mask[indx]
            #print(v)
            #print(Mask)
            Cluster.append(v)
            # delete vertices
            Delete = v.expand_as(Mask) | v.transpose(1, 0).expand_as(Mask)
            Delete = Delete ^ 1
            #Delete = v.transpose(1,0).matmul(v) ^ 1
            #print(Delete)
            Mask = Mask & Delete
        k = len(Cluster)
        Cluster = torch.cat(Cluster)
        #print('cluster')
        #print(Cluster)
        Number = Cluster.sum(dim=1).type(LongTensor)
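        # two-stage sampling: pick BATCH_SIZE clusters uniformly at random, then
        # draw uniformly within each chosen cluster (with replacement if a
        # cluster is hit more times than it has members)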
        probability_batch = torch.ones(k) / float(k)
        cluster_is = torch.multinomial(probability_batch,
                                       self.BATCH_SIZE,
                                       replacement=True)
        # convert the cluster indices to number of items in each cluster
        Sample_num = torch.eye(k).index_select(
            0, cluster_is).sum(dim=0).type(LongTensor)
        #N = Cluster[0].size()[0] # number of vertices
        state_sample = []
        action_sample = []
        target_sample = []
        for i in range(k):
            n = Sample_num[i]
            N = Number[i]
            if n == 0:
                continue
            cluster = Cluster[i]
            # get nonzero indices
            v_indices = cluster.nonzero().squeeze(1)
            if n == N:
                # pick up all
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            prob = torch.ones(v_indices.size()) / n
            if n < N:
                # uniformly pick
                v_indices_is = torch.multinomial(prob, n)
                v_indices = v_indices.index_select(0, v_indices_is)
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            # uniformly pick with replacement
            v_indices_is = torch.multinomial(prob, n, replacement=True)
            v_indices = v_indices.index_select(0, v_indices_is)
            state_sample.append(state_batch.index_select(0, v_indices))
            action_sample.append(action_batch.index_select(0, v_indices))
            target_sample.append(target_batch.index_select(0, v_indices))
        state_batch = torch.cat(state_sample)
        action_batch = torch.cat(action_sample)
        target_batch = torch.cat(target_sample)

        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):

        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until the flag is cleared
                print('sleeping...')
                time.sleep(1.)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < 1000:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            #if i == 50:
            #    pretrain = False
            pretrain = False
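
The clustering loop in minibatch() greedily covers the similarity graph: take the vertex with the most neighbours, record its neighbourhood as a cluster, delete those vertices, and repeat until no edges remain. A minimal NumPy sketch of the same idea (the boolean array mask plays the role of Mask above; names are illustrative):

import numpy as np

def greedy_vertex_cover_clusters(mask):
    # mask: boolean NxN adjacency matrix (symmetric, True on the diagonal)
    mask = mask.copy()
    clusters = []
    while True:
        degree = mask.sum(axis=1)
        idx = int(degree.argmax())
        if degree[idx] == 0:
            break  # no edges left; every vertex has been covered
        row = mask[idx].copy()
        clusters.append(np.flatnonzero(row))
        # delete the covered vertices (their rows and columns) from the graph
        keep = ~row
        mask &= np.outer(keep, keep)
    return clusters

mask = np.array([[1, 1, 0, 0],
                 [1, 1, 0, 0],
                 [0, 0, 1, 0],
                 [0, 0, 0, 1]], dtype=bool)
print(greedy_vertex_cover_clusters(mask))  # [array([0, 1]), array([2]), array([3])]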
Example #6
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        self.GAMMA = 0.95
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.

        #LEARNING_RATE = 0.00025
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        #self.net.setOptimizer(optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE,
        #                                        momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
        #                                        eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        batch = random.sample(list(exp_replay), self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = Variable(torch.from_numpy(np.array(
            unzipped[0])).type(FloatTensor),
                               volatile=True)
        # previously there was no type conversion here, which caused an error
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])).type(FloatTensor),
                                        volatile=True)
            # previously there was no type conversion here, which caused an error
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False

        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # keep looping:
        #   0. loop until SENT_FLAG is cleared
        #
        #   1. loop for a fixed number of steps:
        #         sample a minibatch and compute its target values
        #         optimize the net parameters on this batch
        #         every TRANSFER steps, copy weights from the Q-net to the target-net
        #
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:
                # loop until the flag is cleared
                print('sleeping...')
                time.sleep(1.)
            for step_i in range(1, self.TRAIN_MAX + 1):
                memory = self.memory

                if len(memory) < 1000:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True

            pretrain = False
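
The minibatch() here computes the standard DQN target, target = r + GAMMA * (1 - done) * max_a Q_target(s', a), via the pre-0.4 Variable/volatile API. A sketch of the same computation in current PyTorch (names mirror the example; target_net stands in for self.targetNet):

import torch

GAMMA = 0.95

def dqn_targets(target_net, reward_batch, next_state_batch, term_batch):
    # term_batch holds 1 - done, so terminal transitions get no bootstrap term
    with torch.no_grad():  # replaces volatile=True
        next_state_values = target_net(next_state_batch).max(1)[0].unsqueeze(1)
    return reward_batch + GAMMA * term_batch * next_state_values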
Example #7
import os
import time
import multiprocessing
from multiprocessing.managers import SyncManager
# DQN, ReplayMemory, Improver, Evaluator, and myGym come from the project's own modules


def wait(T):
    for i in list(range(T))[::-1]:
        print(i + 1)
        time.sleep(1)


if __name__ == '__main__':
    # hyperparameters
    os.system(
        "taskset -p 0xff %d" % os.getpid()
    )  #https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy
    MEMORY_SIZE = 5000
    imp_net = DQN()
    # populate memory
    # let improver populate first
    manager = SyncManager()
    manager.start()
    memory = ReplayMemory(MEMORY_SIZE)
    s = multiprocessing.Semaphore(1)
    #memory = multiprocessing.Queue(MEMORY_SIZE)
    memory = manager.list()  # manager-backed list replaces the local ReplayMemory above
    shared = manager.dict({'SENT_FLAG': True, 'weights': None})
    #shared = manager.dict({'memory':memory, 'SENT_FLAG':True, 'weights':None})
    #improver = Improver(imp_net, shared, myGym(), s)
    improver = Improver(imp_net, MEMORY_SIZE, memory, shared, myGym(), s)
    # improver is executed by the main process
    evaluator = Evaluator(memory, shared, s)
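
Example #7 wires the improver and evaluator together through a SyncManager: a shared list for the replay memory and a shared dict carrying 'weights' and the SENT_FLAG handshake. A minimal sketch of that handshake, with a toy stand-in for the real worker classes:

import time
import multiprocessing
from multiprocessing.managers import SyncManager

def evaluator(shared):
    # wait until the improver has consumed the previous weights...
    while shared['SENT_FLAG']:
        time.sleep(0.1)
    shared['weights'] = {'step': 1}   # stand-in for net.state_dict()
    shared['SENT_FLAG'] = True        # ...then publish new ones

if __name__ == '__main__':
    manager = SyncManager()
    manager.start()
    shared = manager.dict({'SENT_FLAG': True, 'weights': None})
    p = multiprocessing.Process(target=evaluator, args=(shared,))
    p.start()
    shared['SENT_FLAG'] = False       # improver signals it wants fresh weights
    p.join()
    print(shared['weights'])          # {'step': 1}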