-
Notifications
You must be signed in to change notification settings - Fork 0
/
mtlgen.py
482 lines (416 loc) · 17.5 KB
/
mtlgen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
#!/usr/bin/env python
#
# Problem generator for multi-task reinforcement learning problems
#
# Usage:
#
#
# License:
#
# Copyright 2012 Deon Garrett <deong@acm.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import argparse
import ast
import random
import numpy as np
import scipy.linalg as sla
import numpy.random as npr
import networkx as nx
import math
#
# create random reward structure for an (nstates, nactions) MDP
#
# parameters:
# nstates: number of states
# nactions: number of actions
# covmat: nxn covariance matrix, where n=number of tasks
# returns:
# an (nstates x nactions x ntasks) reward matrix
#
def mvnrewards(nstates, nactions, mu, covmat):
    """Generate correlated random rewards for an (nstates, nactions) MDP.

    Per-task rewards are drawn from a multivariate normal with mean
    vector mu and covariance matrix covmat, so pairwise task rewards
    follow the requested correlation structure.

    Returns an (nstates x nactions x ntasks) reward array.
    """
    # The Cholesky factorization doubles as a positive-definiteness
    # check: it raises if covmat is not PD.  multivariate_normal would
    # sample from a non-PD matrix without complaint, but the output
    # would not honor the requested covariance, so bail out early.
    sla.cholesky(covmat)
    return npr.multivariate_normal(mu, covmat, (nstates, nactions))
#
# create a random graph with uniform out-degree
#
# parameters:
# nstates: number of states
# nactions: number of actions
# returns:
# a directed multigraph with each node having out-degree exactly
# equal to edges and in-degree > 0.
#
def rgud(nodes, edges):
    """Build a random directed multigraph to act as the transition
    graph of a random MDP.

    Every node gets exactly `edges` outgoing edges (one per action);
    in-degrees are random but strictly positive, with the in/out totals
    balanced so a configuration-model graph exists.
    """
    # Out-degree sequence is trivially constant: one edge per action.
    outdeg = [edges] * nodes
    target_total = sum(outdeg)
    # Random in-degrees: sort nodes+1 uniform samples, take successive
    # differences, and rescale so the (rounded) gaps sum to the target.
    cuts = np.sort(np.asarray([random.random() for _ in range(nodes + 1)]))
    gaps = cuts[1:] - cuts[:nodes]
    gaps = target_total / sum(gaps) * gaps
    indeg = [int(round(g)) for g in gaps]
    # Rounding can leave zero-degree (unreachable) nodes and can push
    # the total off by a little.  First force every in-degree to at
    # least one...
    current_total = sum(indeg)
    for i, d in enumerate(indeg):
        if d < 1:
            indeg[i] = 1
            current_total += 1
    # ...then shave random nodes (never below one inbound edge) while
    # we have too many inbound edges...
    while current_total > target_total:
        pick = random.randint(0, nodes - 1)
        if indeg[pick] > 1:
            indeg[pick] -= 1
            current_total -= 1
    # ...and pad random nodes while we have too few (probably never
    # triggers, but cheap to handle).
    while current_total < target_total:
        pick = random.randint(0, nodes - 1)
        indeg[pick] += 1
        current_total += 1
    # if the bookkeeping above is right, the sequences now balance
    assert(sum(indeg) == sum(outdeg))
    # Hand the matched degree sequences to the configuration model.
    return nx.directed_configuration_model(indeg, outdeg)
#
# take an arbitrary directed graph and make it strongly connected,
# maintaining the total number of incoming and outgoing edges for
# each vertex.
#
def make_strongly_connected(G):
    """Rewire a directed multigraph until it is strongly connected,
    preserving every vertex's in-degree and out-degree.

    Each pass links every strongly connected component to its successor
    component (cyclically) by adding one edge, and deletes a random
    outgoing edge from the same source vertex to keep degrees constant.
    Choosing which edge is safe to delete is nontrivial, so we retry
    random deletions for up to c*log2(c) passes (c = component count)
    and return the graph, connected or not, after that.
    """
    # BUG FIX: modern networkx returns a generator of *sets* from
    # strongly_connected_components; the original code called len() on
    # it and indexed into it, both of which fail.  Materialize each
    # component as a list so len() and random indexing work.
    components = [list(c) for c in nx.strongly_connected_components(G)]
    num_components = len(components)
    if num_components == 1:
        return G
    attempts = 0
    max_attempts = num_components * math.log2(num_components)
    while num_components > 1 and attempts < max_attempts:
        for index in range(num_components):
            source_comp = components[index]
            target_comp = components[(index + 1) % num_components]
            # pick a random vertex from the source component, connect it
            # into the target component, and drop one of its existing
            # outgoing edges so its out-degree stays constant
            source_vertex = source_comp[npr.randint(len(source_comp))]
            target_vertex = target_comp[npr.randint(len(target_comp))]
            source_edges = list(G[source_vertex].keys())
            G.remove_edge(source_vertex, source_edges[npr.randint(len(source_edges))])
            G.add_edge(source_vertex, target_vertex)
        components = [list(c) for c in nx.strongly_connected_components(G)]
        num_components = len(components)
        attempts += 1
    return G
# construct a new MDP given a set of rewards and a transition graph
# and write it to stdout
#
# the format of the output records is as follows
#
# -------------------------------------------------------------
# numStates numActions numTasks
#
# 0 [successor [reward_i]{numTasks}]{numActions}
# ...
# numStates-1 [successor [reward_i]{numTasks}]{numActions}
# -------------------------------------------------------------
#
# parameters:
# G: state transition graph
# R: reward structure
#
def write_instance(G, R):
    """Write an MDP instance to stdout.

    G is the state transition graph; R is the reward array of shape
    (numStates, numActions, numTasks).  Record format:

        numStates numActions numTasks
        <state> [successor [reward_i]{numTasks}]{numActions}
    """
    n, m, k = R.shape
    # number of nodes in the transition graph should be equal to
    # the number of states in the reward matrix
    assert(G.number_of_nodes() == n)
    print("{} {} {}\n".format(n, m, k))
    for node in G:
        line = "{} ".format(node)
        for index, edge in enumerate(G.successors(node)):
            # note that the enumeration flattens out any duplicated
            # edges; dups are fine for MDPs -- they just indicate two
            # actions that lead to the same successor state. So we
            # compensate for this by calculating the number of dups
            # and explicitly repeating them the right number of times
            # NOTE(review): `index` counts *distinct* successors, so all
            # duplicated actions emitted below reuse the same reward
            # column R[node, index, :], and columns past the number of
            # distinct successors are never written -- confirm this is
            # intended rather than per-action reward indexing.
            for i in range(0, len(G[node][edge])):
                line += "{} ".format(edge)
                for task in range(0, k):
                    line += "{0:.3f} ".format(R[node, index, task])
        print(line)
    print("\n")
#
# Maze-type problems (gridworld)
#
# The basic structure of this type of problem is that each is a 2-d world consisting
# of separate "trails" for each task. Note that there are generally no walls between
# these trails, so an agent is free to move through the world as it chooses. The trails
# simply imply that each cell in the world is marked with a task number, and if the
# agent is in a cell with task number X, there is a path leading to the goal state for
# task X through which all intermediate cells are also marked X.
#
# The transition dynamics are simple up, down, left, right actions for each state. If
# you are at a boundary cell, attempting to move out of the world results in a negative
# penalty for each task and leaves the agent in the same state. This is true even if
# the current state is a goal state.
#
#
# Multi-Maze Generator using Depth-first Search
# Multi-Maze: Maze w/ multiple paths to solve
#
# Code adapted from
# http://code.activestate.com/recipes/578378-random-multi-maze-generator/
# Available under MIT License.
#
# The output of this function is a 2d matrix structure, where the (i,j) pair
# is the "path number", i.e., maze[i][j] = k if cell (i,j) in the maze is
# used by the k-th distinct path through the maze.
#
def make_multimaze(width, height, nTasks):
    """Generate a multi-path maze via interleaved depth-first searches.

    Each of the nTasks searches grows its own trail through the grid;
    cell (i, j) of the returned array holds the 1-based id of the task
    that claimed it (0 if unclaimed).

    Adapted (MIT licensed) from
    http://code.activestate.com/recipes/578378-random-multi-maze-generator/
    """
    cols = width
    rows = height
    # neighbor offsets, scanned in a fixed order each step
    col_step = [0, 1, 0, -1]
    row_step = [-1, 0, 1, 0]
    maze = [[0] * cols for _ in range(rows)]
    trails = []  # one DFS stack per task
    # drop each task's starting point on a distinct unclaimed cell
    for task in range(nTasks):
        while True:
            sc = random.randint(0, cols - 1)
            sr = random.randint(0, rows - 1)
            if maze[sr][sc] == 0:
                break
        trails.append([(sc, sr)])
        maze[sr][sc] = task + 1
    active = True
    while active:
        active = False
        for task in range(nTasks):
            if not trails[task]:
                continue
            active = True  # keep going while any trail can still grow
            cc, cr = trails[task][-1]
            # candidate directions: in-bounds, unclaimed, and adjacent
            # to exactly one cell already owned by this task (keeps the
            # trail a proper corridor instead of a blob)
            open_dirs = []
            for d in range(4):
                qc = cc + col_step[d]
                qr = cr + row_step[d]
                if 0 <= qc < cols and 0 <= qr < rows and maze[qr][qc] == 0:
                    owned = 0
                    for dd in range(4):
                        ec = qc + col_step[dd]
                        er = qr + row_step[dd]
                        if 0 <= ec < cols and 0 <= er < rows and maze[er][ec] == task + 1:
                            owned += 1
                    if owned == 1:
                        open_dirs.append(d)
            if open_dirs:
                # extend the trail in a random available direction
                step = open_dirs[random.randint(0, len(open_dirs) - 1)]
                cc += col_step[step]
                cr += row_step[step]
                maze[cr][cc] = task + 1
                trails[task].append((cc, cr))
            else:
                # dead end: backtrack
                trails[task].pop()
    return np.asarray(maze)
#
# create a reward structure for a maze
#
# The basic idea is that there should be a separate location in the
# maze with a positive reward for each task. As the maze generator
# produces separate paths, we should aim to put the reward for each
# task somewhere along that task's path.
#
# returns an array of the same size and shape as the maze, but with
# zeros everywhere except for $tasks non-zero entries.
#
def maze_goal_states(maze, tasks, mu, cov):
    """Pick one rewarded goal cell per task along that task's trail.

    Returns a (tasks x rows x cols) array that is zero everywhere
    except for one entry per task: a correlated normal reward value
    placed on a randomly chosen cell carrying that task's id.
    """
    nrows, ncols = maze.shape
    # a single draw yields a correlated reward value for every task
    values = npr.multivariate_normal(mu, cov)
    rewards = np.zeros([tasks, nrows, ncols])
    for t in range(tasks):
        # all cells on task t's trail (maze ids are 1-based)
        cells = np.transpose(np.where(maze == (t + 1)))
        chosen = cells[npr.randint(0, len(cells))]
        rewards[t, chosen[0], chosen[1]] = values[t]
    return rewards
#
# write an instance of the multimaze problem
#
# Ideally, this should be unified with the graph instances, but the
# way that duplicate edges are handled in the graph-based instances
# loses information that is important for mazes (it mixes up which
# action is which). For now I handle this by using a custom writer
# for the mazes.
#
def write_maze_instance(maze, goals):
    """Print a multimaze instance to stdout in the graph text format.

    Kept separate from write_instance because collapsing duplicate
    edges there loses which action (up/down/left/right) leads where.
    Attempting to step off the board leaves the agent in place with a
    -10 penalty on every task.
    """
    tasks, nrows, ncols = goals.shape
    print("{} {} {}\n".format(nrows * ncols, 4, tasks))
    for r in range(nrows):
        for c in range(ncols):
            here = rowcol_to_index(maze, r, c)
            fields = ["{} ".format(here)]
            # fixed action order: up, down, left, right
            steps = [(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)]
            for (nr, nc) in steps:
                dest = rowcol_to_index(maze, nr, nc)
                if dest is not None:
                    fields.append("{} ".format(dest))
                    for t in range(tasks):
                        fields.append("{} ".format(goals[t, nr, nc]))
                else:
                    # off-board move: self-loop with a flat penalty
                    fields.append("{} ".format(here))
                    for t in range(tasks):
                        fields.append("{} ".format(-10))
            print("".join(fields))
    print("\n")
#
# take a maze, row, and column, and return a node number or None
# if the requested row and column are out of bounds
#
def rowcol_to_index(maze, row, col):
    """Map a (row, col) cell of the maze to its linear node number in
    row-major order, or return None when the coordinates are out of
    bounds.
    """
    rows, cols = maze.shape
    if row < 0 or row >= rows or col < 0 or col >= cols:
        return None
    # BUG FIX: the row-major stride is the number of *columns*
    # (shape[1]); the original multiplied by shape[0] (rows), which
    # mis-numbers cells in any non-square maze and could even push
    # valid in-bounds cells past maze.size, returning None for them.
    # With the correct stride the bounds check above already guarantees
    # 0 <= index < maze.size, so no secondary range check is needed.
    return row * cols + col
#
# convert a correlation matrix to a covariance matrix with given standard deviations
#
def cor2cov(R, sigma):
    """Convert a correlation matrix R to a covariance matrix given the
    per-task standard deviations sigma: cov = D R D, D = diag(sigma)."""
    D = np.diag(sigma)
    return D.dot(R).dot(D)
def demo_rgud():
    """Smoke test for the random-graph generator: build a correlated
    reward structure and a random transition graph, then force the
    graph to be strongly connected.

    Returns (G, G2, cc, cc2): the raw and connected graphs plus their
    strongly-connected component lists (before and after).
    """
    R = np.asarray([[ 1.0, 0.4, -0.4],
                    [ 0.4, 1.0, 0.6],
                    [-0.4, 0.6, 1.0]])
    nstates = 50
    nactions = 2
    mu = [100.0] * 3
    sigma = [10.0] * 3
    cov = cor2cov(R, sigma)
    rewards = mvnrewards(nstates, nactions, mu, cov)
    G = rgud(nstates, nactions)
    # BUG FIX: modern networkx returns a generator from
    # strongly_connected_components; the caller takes len() of these
    # values, so materialize them as lists.
    cc = list(nx.strongly_connected_components(G))
    G2 = make_strongly_connected(G)
    cc2 = list(nx.strongly_connected_components(G2))
    return (G, G2, cc, cc2)
def demo_maze():
    """Smoke test for the maze generator: build a 10x10 three-task maze
    and its goal-state rewards, returning both."""
    corr = np.asarray([[ 1.0, 0.4, -0.4],
                       [ 0.4, 1.0, 0.6],
                       [-0.4, 0.6, 1.0]])
    means = [100.0] * 3
    devs = [10.0] * 3
    covariance = cor2cov(corr, devs)
    maze = make_multimaze(10, 10, 3)
    goal_rewards = maze_goal_states(maze, 3, means, covariance)
    return (maze, goal_rewards)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--demo", help="run with a sample set of parameters", action='store_true')
    parser.add_argument("-t", "--type", default=None, help="problem instance type {rgudcr,rzcgl}")
    parser.add_argument("-n", "--states", default=100, help="size of the state space", type=int)
    parser.add_argument("-m", "--actions", default=4, help="number of available actions", type=int)
    parser.add_argument("-k", "--tasks", default=2, help="number of concurrent tasks", type=int)
    parser.add_argument("-c", "--correlation", help="task correlation matrix (in Python nested list form)")
    parser.add_argument("-s", "--stdev", help="standard deviations of task rewards (in python list form)")
    parser.add_argument("-x", "--rows", default=10, help="rows in random maze", type=int)
    parser.add_argument("-y", "--cols", default=10, help="columns in random maze", type=int)
    args = parser.parse_args()

    if args.demo:
        # exercise the random-graph path and report component counts
        G, G2, cc, cc2 = demo_rgud()
        print("{} components: {}".format(len(cc), cc))
        print("{} components: {}".format(len(cc2), cc2))
        sys.exit(0)

    if not args.type:
        parser.print_help()
        sys.exit(1)

    # correlation matrix from the command line; identity (independent
    # tasks) if not given.  literal_eval safely parses the nested-list
    # syntax without the arbitrary-code risk of eval.
    if not args.correlation:
        args.correlation = np.identity(args.tasks)
    else:
        args.correlation = np.asarray(ast.literal_eval(args.correlation))

    # per-task reward standard deviations; unit deviations if not given
    if not args.stdev:
        args.stdev = np.ones(args.tasks)
    else:
        args.stdev = np.asarray(ast.literal_eval(args.stdev))

    # covariance from the correlation matrix and standard deviations
    cov = cor2cov(args.correlation, args.stdev)

    if args.type == "rgudcr":
        mu = [0.0] * args.tasks
        rewards = mvnrewards(args.states, args.actions, mu, cov)
        transition_graph = rgud(args.states, args.actions)
        write_instance(transition_graph, rewards)
        print("# type={}, states={}, actions={}, correlation={}, stdev={}".
              format(args.type, args.states, args.actions, args.correlation.tolist(), args.stdev.tolist()))
    elif args.type == "rzcgl":
        maze = make_multimaze(args.rows, args.cols, args.tasks)
        # BUG FIX: maze_goal_states requires the task count and the
        # reward-distribution parameters; the original passed only the
        # maze, which raised TypeError.  Use zero means as in rgudcr.
        mu = [0.0] * args.tasks
        goals = maze_goal_states(maze, args.tasks, mu, cov)
        write_maze_instance(maze, goals)
        # BUG FIX: args.col -> args.cols (the parser defines "cols")
        print("# type={}, rows={}, cols={}, correlation={}, stdev={}".
              format(args.type, args.rows, args.cols, args.correlation.tolist(), args.stdev.tolist()))
    else:
        # BUG FIX: the original passed args.type as a second print
        # argument instead of formatting it into the message
        print("invalid problem type specified: {}".format(args.type))
        parser.print_help()
        sys.exit(1)