def __init__(self, t_layer_sizes, p_layer_sizes, dropout=0):
    """Create the time and pitch LSTM stacks, then build the compiled functions.

    Args:
        t_layer_sizes: layer sizes handed to the time-axis ``StackedCells``;
            its last entry also determines the pitch network's input width.
        p_layer_sizes: layer sizes handed to the pitch-axis ``StackedCells``.
        dropout: dropout setting stored on the instance (defaults to 0).
    """
    self.t_layer_sizes = t_layer_sizes
    self.p_layer_sizes = p_layer_sizes

    # Width of the note-wise input, fixed by the architecture definition.
    self.t_input_size = 80

    # Time network: note-wise input -> hidden layers, followed by a
    # pass-through layer that serves as the time model's output layer.
    time_stack = StackedCells(self.t_input_size, celltype=LSTM,
                              layers=t_layer_sizes)
    time_stack.layers.append(PassthroughLayer())
    self.time_model = time_stack

    # Pitch network: consumes the time model's last layer plus two extra
    # values describing the previous (half-step lower) note:
    #   1. whether it was chosen to be played (0 or 1, starts at 0)
    #   2. whether it was chosen to be articulated (0 or 1, starts at 0)
    # and finishes with a two-element sigmoid layer.
    extra_note_inputs = 2
    p_input_size = t_layer_sizes[-1] + extra_note_inputs
    pitch_stack = StackedCells(p_input_size, celltype=LSTM,
                               layers=p_layer_sizes)
    pitch_stack.layers.append(
        Layer(p_layer_sizes[-1], 2, activation=T.nnet.sigmoid))
    self.pitch_model = pitch_stack

    self.dropout = dropout

    # Symbolic float scalar supplied at prediction time.
    self.conservativity = T.fscalar()
    # Random stream used to draw random numbers, seeded from numpy.
    seed = np.random.randint(0, 1024)
    self.srng = T.shared_randomstreams.RandomStreams(seed)

    self.setup_train()
    self.setup_predict()
    self.setup_slow_walk()
def __init__(self, data_manager, t_layer_sizes, p_layer_sizes, dropout=0):
    """Build the time and pitch LSTM stacks sized from ``data_manager``.

    Args:
        data_manager: object whose ``f.feature_count`` gives the time-model
            input width and whose ``s.information_count`` gives the output
            width.
        t_layer_sizes: layer sizes for the time-axis ``StackedCells``.
        p_layer_sizes: layer sizes for the pitch-axis ``StackedCells``.
        dropout: dropout setting stored on the instance (defaults to 0).
    """
    # Progress message; the line is completed by the "Done" print below.
    print('{:25}'.format("Initializing Model"), end='', flush=True)

    self.t_layer_sizes = t_layer_sizes
    self.p_layer_sizes = p_layer_sizes
    self.dropout = dropout
    self.data_manager = data_manager

    # Input/output widths are read from the data manager.
    self.t_input_size = self.data_manager.f.feature_count
    self.output_size = self.data_manager.s.information_count

    # Time network followed by a pass-through output layer.
    time_stack = StackedCells(self.t_input_size, celltype=LSTM,
                              layers=t_layer_sizes)
    time_stack.layers.append(PassthroughLayer())
    self.time_model = time_stack

    # Pitch network input = last time layer + one slot per output value.
    p_input_size = t_layer_sizes[-1] + self.output_size
    pitch_stack = StackedCells(p_input_size, celltype=LSTM,
                               layers=p_layer_sizes)
    sigmoid_head = Layer(p_layer_sizes[-1], self.output_size,
                         activation=T.nnet.sigmoid)
    pitch_stack.layers.append(sigmoid_head)
    self.pitch_model = pitch_stack

    self.conservativity = T.fscalar()
    seed = np.random.randint(0, 1024)
    self.srng = T.shared_randomstreams.RandomStreams(seed)

    # Spacing between float32 1.0 and the next representable value.
    self.epsilon = np.spacing(np.float32(1.0))

    print("Done")
def __init__(self, t_layer_sizes, p_layer_sizes, dropout=0):
    """Assemble the two LSTM stacks and compile train/predict/walk functions.

    Args:
        t_layer_sizes: sizes of the layers in the time-axis network.
        p_layer_sizes: sizes of the layers in the pitch-axis network.
        dropout: dropout setting kept on ``self.dropout`` (defaults to 0).
    """
    self.t_layer_sizes = t_layer_sizes
    self.p_layer_sizes = p_layer_sizes

    # The note-wise input width comes from the architecture definition.
    self.t_input_size = 80

    # Time network: maps the note-wise input through the hidden sizes and
    # ends with a pass-through layer.
    self.time_model = StackedCells(
        self.t_input_size,
        celltype=LSTM,
        layers=t_layer_sizes,
    )
    passthrough = PassthroughLayer()
    self.time_model.layers.append(passthrough)

    # Pitch network: fed with the time model's last layer and the state of
    # the last note (two extra inputs); ends in a two-element sigmoid layer.
    last_time_width = t_layer_sizes[-1]
    p_input_size = last_time_width + 2
    self.pitch_model = StackedCells(
        p_input_size,
        celltype=LSTM,
        layers=p_layer_sizes,
    )
    output_layer = Layer(p_layer_sizes[-1], 2, activation=T.nnet.sigmoid)
    self.pitch_model.layers.append(output_layer)

    self.dropout = dropout

    self.conservativity = T.fscalar()
    self.srng = T.shared_randomstreams.RandomStreams(
        np.random.randint(0, 1024))

    # Build the symbolic graphs / compiled functions.
    self.setup_train()
    self.setup_predict()
    self.setup_slow_walk()
def __init__(self, t_layer_sizes, p_layer_sizes, dropout=0):
    """Build the time and pitch LSTM stacks, tracing each setup phase.

    A trace line is printed before each of ``setup_train``,
    ``setup_predict`` and ``setup_slow_walk``.

    Args:
        t_layer_sizes: layer sizes for the time-axis ``StackedCells``.
        p_layer_sizes: layer sizes for the pitch-axis ``StackedCells``.
        dropout: dropout setting stored on the instance (defaults to 0).
    """
    self.t_layer_sizes = t_layer_sizes
    self.p_layer_sizes = p_layer_sizes

    # From our architecture definition, size of the notewise input
    self.t_input_size = 80

    # time network maps from notewise input size to various hidden sizes
    self.time_model = StackedCells(self.t_input_size, celltype=LSTM,
                                   layers=t_layer_sizes)
    self.time_model.layers.append(PassthroughLayer())

    # pitch network takes last layer of time model and state of last note,
    # moving upward, and eventually ends with a two-element sigmoid layer
    p_input_size = t_layer_sizes[-1] + 2
    self.pitch_model = StackedCells(p_input_size, celltype=LSTM,
                                    layers=p_layer_sizes)
    self.pitch_model.layers.append(
        Layer(p_layer_sizes[-1], 2, activation=T.nnet.sigmoid))

    self.dropout = dropout

    self.conservativity = T.fscalar()
    self.srng = T.shared_randomstreams.RandomStreams(
        np.random.randint(0, 1024))

    # The call form of print with a single string argument emits the same
    # text under Python 2 (parenthesised expression) and Python 3 (function),
    # whereas the bare `print "..."` statement is a SyntaxError on Python 3.
    print("model-setup::Trace-1")
    self.setup_train()
    print("model-setup::Trace-2")
    self.setup_predict()
    print("model-setup::Trace-3")
    self.setup_slow_walk()
def __init__(self, hidden_size, input_size, vocab_size, stack_size=1, celltype=LSTM):
    """Build an embedding -> stacked cells -> softmax model and compile it.

    Args:
        hidden_size: width of every stacked recurrent layer.
        input_size: width of the embedding output / first cell input.
        vocab_size: number of embedding rows and softmax outputs.
        stack_size: how many recurrent layers to stack (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``LSTM``).
    """
    # Recurrent core: `stack_size` layers of identical width.
    layer_widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype,
                              layers=layer_widths)

    # Put an embedding in front of the stack ...
    embedding = Embedding(vocab_size, input_size)
    self.model.layers.insert(0, embedding)

    # ... and a softmax classifier behind it.
    classifier = Layer(hidden_size, vocab_size, activation=softmax)
    self.model.layers.append(classifier)

    # Symbolic inputs. `input_mat` is an integer matrix of indices: per the
    # original note, each row is a sentence and each column a timestep.
    self._stop_word = theano.shared(np.int32(999999999), name="stop word")
    self.for_how_long = T.ivector()
    self.input_mat = T.imatrix()
    self.priming_word = T.iscalar()
    self.srng = T.shared_randomstreams.RandomStreams(
        np.random.randint(0, 1024))

    # Symbolic prediction graphs: default and greedy variants.
    self.predictions = self.create_prediction()
    self.greedy_predictions = self.create_prediction(greedy=True)

    # Cost, training and prediction functions.
    self.create_cost_fun()
    self.create_training_function()
    self.create_predict_function()

    # Epoch counter, kept for saving state.
    self.epochs = 0
def __init__(self, hidden_size, input_size, output_size, stack_size=1, celltype=RNN, steps=40):
    """Build the stacked-cell predictor and compile its prediction function.

    Args:
        hidden_size: width of every stacked layer.
        input_size: width of the input fed to the first layer.
        output_size: width of the final tanh layer.
        stack_size: number of stacked layers (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``RNN``).
        steps: value stored on ``self.steps`` (default 40).
    """
    # Recurrent core.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Output layer with tanh activation.
    output_layer = Layer(hidden_size, output_size, activation=T.tanh)
    self.model.layers.append(output_layer)

    self.steps = steps

    # Symbolic 3-D inputs.
    self.gfs = T.tensor3('gfs')        # GFS input data
    self.pm25in = T.tensor3('pm25in')  # initial PM2.5 data
    self.layerstatus = None
    self.results = None
    self.cnt = T.tensor3('cnt')

    # Symbolic prediction: one complete pass over the whole sequence; the
    # result is kept as `predictions`.
    self.predictions = self.create_prediction()
    self.create_predict_function()
    # The steps above only build the symbolic expressions up front.
def __init__(self, hidden_size, input_size, stack_size=2, celltype=LSTM):
    """Build a stacked-cell model whose output has the same width as its input.

    Args:
        hidden_size: width of every stacked layer.
        input_size: width of the input and of the softmax output layer.
        stack_size: number of stacked layers (default 2).
        celltype: cell class passed to ``StackedCells`` (default ``LSTM``).
    """
    self.input_size = input_size

    # Recurrent core with tanh activation.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype,
                              activation=T.tanh, layers=widths)

    # Disable modulation of the input layer by making the first layer's
    # `in_gate2` activation the identity.
    first_layer = self.model.layers[0]
    first_layer.in_gate2.activation = lambda x: x

    # Softmax output layer mapping back to the input width.
    output_layer = Layer(hidden_size, input_size, activation=softmax)
    self.model.layers.append(output_layer)

    # Symbolic float vectors; per the original note, the inputs are windows
    # of spectrum data.
    self.input = T.fvector("input")
    self.prev_input = T.fvector("prev_input")

    # Symbolic prediction graph.
    self.prediction = self.create_prediction()

    # Cost, training and prediction functions.
    self.create_cost_fun()
    self.create_training_function()
    self.create_predict_function()
class Model(object):
    """
    Multi-step predictor built from stacked recurrent cells.

    The network is unrolled for ``steps`` iterations at graph-construction
    time. Each iteration consumes three consecutive ``gfs`` slices, the two
    most recent values (initial ``pm25in`` slices at first, then the model's
    own previous outputs) and one slice of ``cnt``.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1, celltype=RNN, steps=40):
        """Build the network and compile the prediction function.

        Args:
            hidden_size: width of every stacked layer.
            input_size: width of the concatenated per-step input.
            output_size: width of the final tanh layer.
            stack_size: number of stacked layers (default 1).
            celltype: cell class passed to ``StackedCells`` (default ``RNN``).
            steps: number of unrolled prediction steps (default 40).
        """
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add the output layer
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=T.tanh))

        self.steps = steps
        self.gfs = T.tensor3('gfs')        # GFS input data
        self.pm25in = T.tensor3('pm25in')  # initial PM2.5 data
        self.layerstatus = None
        self.results = None
        self.cnt = T.tensor3('cnt')

        # Symbolic prediction over the complete sequence.
        self.predictions = self.create_prediction()
        self.create_predict_function()
        # The steps above only build the symbolic expressions up front.

    @property
    def params(self):
        """Parameters of the underlying stacked-cell model."""
        return self.model.params

    def create_prediction(self):
        """Unroll the network for ``self.steps`` steps and return the outputs.

        Returns:
            The per-step outputs of the last layer, concatenated along axis 1.
        """
        gfs = self.gfs
        pm25in = self.pm25in

        # First forward pass: no previous layer state is supplied.
        self.layerstatus = self.model.forward(
            T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2],
                           pm25in[:, 0], pm25in[:, 1],
                           self.cnt[:, :, 0]], axis=1))
        # NOTE(review): original author was unsure of this shape ("40*1?").
        self.results = self.layerstatus[-1]

        if self.steps > 1:
            # Second pass: one real PM2.5 slice plus the first prediction.
            self.layerstatus = self.model.forward(
                T.concatenate([gfs[:, 1], gfs[:, 2], gfs[:, 3],
                               pm25in[:, 1], self.results,
                               self.cnt[:, :, 1]], axis=1),
                self.layerstatus)
            self.results = T.concatenate(
                [self.results, self.layerstatus[-1]], axis=1)

            # Remaining steps - 2 passes feed back the two latest predictions.
            # `range` (not the Python-2-only `xrange`) keeps this runnable on
            # both Python 2 and Python 3.
            for i in range(2, self.steps):
                # NOTE(review): original author questioned whether
                # T.shape_padright is needed here.
                self.layerstatus = self.model.forward(
                    T.concatenate([gfs[:, i], gfs[:, i + 1], gfs[:, i + 2],
                                   T.shape_padright(self.results[:, i - 2]),
                                   T.shape_padright(self.results[:, i - 1]),
                                   self.cnt[:, :, i]], axis=1),
                    self.layerstatus)
                self.results = T.concatenate(
                    [self.results, self.layerstatus[-1]], axis=1)
        return self.results

    def create_predict_function(self):
        """Compile ``self.pred_fun`` taking ``gfs``, ``pm25in`` and ``cnt``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.cnt],
            outputs=self.predictions,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in, cnt=None):
        """Run the compiled prediction function.

        Args:
            gfs: value for the ``gfs`` input.
            pm25in: value for the ``pm25in`` input.
            cnt: value for the ``cnt`` input. The compiled function declares
                it as an input, so it must be supplied; the default exists
                only to keep the two-argument signature importable.

        Raises:
            TypeError: if ``cnt`` is omitted.
        """
        # The original forwarded only (gfs, pm25in) to a function compiled
        # with three inputs, so every call failed inside Theano.
        if cnt is None:
            raise TypeError(
                "Model.__call__ requires 'cnt': the prediction function was "
                "compiled with inputs (gfs, pm25in, cnt)")
        return self.pred_fun(gfs, pm25in, cnt)
def __init__(self, time_model_layer_sizes, note_model_layer_sizes):
    """Build the time and note LSTM stacks and their compiled functions.

    ``input_size`` and ``output_size`` are not parameters; they are read
    from the enclosing scope.

    Args:
        time_model_layer_sizes: layer sizes for the time ``StackedCells``;
            the last entry contributes to the note model's input width.
        note_model_layer_sizes: layer sizes for the note ``StackedCells``.
    """
    self.time_model = StackedCells(input_size, celltype=LSTM,
                                   layers=time_model_layer_sizes)
    self.time_model.layers.append(Router())

    # Fixed typo: the original read `outptu_size`, which would raise
    # NameError; `output_size` is the name used for the output layer below.
    note_model_input_size = time_model_layer_sizes[-1] + output_size
    self.note_model = StackedCells(note_model_input_size, celltype=LSTM,
                                   layers=note_model_layer_sizes)
    self.note_model.layers.append(
        Layer(note_model_layer_sizes[-1], output_size,
              activation=T.nnet.sigmoid))

    self.time_model_layer_sizes = time_model_layer_sizes
    self.note_model_layer_sizes = note_model_layer_sizes

    self._initialize_update_function()
    self._initialize_predict_function()
def __init__(self, input_parts, layer_sizes, output_size, window_size=0, dropout=0, mode="drop", unroll_batch_num=None):
    """
    Set up the LSTM stack and its linear output layer.

    Parameters:
        input_parts: A list of InputParts; their PART_WIDTH values are summed
            to obtain the input width.
        layer_sizes: A list of (indep, per_note) tuples, where indep is the
            number of non-shifted cells and per_note is the number of cells
            per window note, which shift as the network moves. A bare
            integer ``indep`` is accepted as shorthand for ``(indep, 0)``.
        output_size: An integer, the width of the desired output.
        window_size: Multiplier applied to per_note when sizing each layer.
        dropout: How much dropout to apply.
        mode: Either "drop" or "roll". If drop, discard memory that goes out
            of range. If roll, roll it instead.
        unroll_batch_num: Stored unchanged on the instance.
    """
    self.input_parts = input_parts
    self.window_size = window_size

    # Normalise every entry to an (indep, per_note) tuple.
    normalized = []
    for entry in layer_sizes:
        if isinstance(entry, tuple):
            normalized.append(entry)
        else:
            normalized.append((entry, 0))
    self.layer_sizes = normalized

    # Total cells per layer: independent cells plus one group per window note.
    totals = []
    for indep, per_note in normalized:
        totals.append(indep + per_note * self.window_size)
    self.tot_layer_sizes = totals

    self.output_size = output_size
    self.dropout = dropout

    self.input_size = sum(part.PART_WIDTH for part in input_parts)

    stack = StackedCells(self.input_size, celltype=LSTM,
                         activation=T.tanh, layers=self.tot_layer_sizes)
    # Final layer uses the identity as its activation.
    linear_out = Layer(self.tot_layer_sizes[-1], self.output_size,
                       activation=lambda x: x)
    stack.layers.append(linear_out)
    self.cells = stack

    assert mode in ("drop", "roll"), "Must specify either drop or roll mode"
    self.mode = mode

    self.unroll_batch_num = unroll_batch_num
class Model(object):
    """
    Multi-step predictor built from stacked recurrent cells.

    The network is unrolled for ``steps`` iterations at graph-construction
    time. Each step's output is subtracted from a running value
    (``pm25next``), and that running value is fed back as part of the next
    step's input.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size, celltype=RNN, steps=40):
        """Build the network and compile the prediction function.

        Args:
            hidden_size: width of every stacked layer.
            input_size: width of the concatenated per-step input.
            output_size: width of the final (identity-activation) layer.
            stack_size: number of stacked layers.
            celltype: cell class passed to ``StackedCells`` (default ``RNN``).
            steps: number of unrolled prediction steps (default 40).
        """
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add the output layer; its activation is the identity
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=lambda x: x))

        self.steps = steps
        self.gfs = T.tensor3('gfs')        # GFS input data
        self.pm25in = T.tensor3('pm25in')  # initial PM2.5 data
        self.layerstatus = None
        self.results = None

        # Symbolic prediction over the complete sequence.
        self.predictions = self.create_prediction()
        self.create_predict_function()

    @property
    def params(self):
        """Parameters of the underlying stacked-cell model."""
        return self.model.params

    def create_prediction(self):
        """Unroll the network for ``self.steps`` steps and return the outputs.

        Returns:
            The per-step outputs of the last layer, concatenated along axis 1.
        """
        gfs = self.gfs
        pm25in = self.pm25in

        # First forward pass: three GFS slices and two PM2.5 slices.
        gfs_x = T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2]], axis=1)
        pm25in_x = T.concatenate([pm25in[:, 0], pm25in[:, 1]], axis=1)
        self.layerstatus = self.model.forward(
            T.concatenate([gfs_x, pm25in_x], axis=1))
        self.results = self.layerstatus[-1]
        pm25next = pm25in[:, 1] - self.results

        if self.steps > 1:
            # `range` (not the Python-2-only `xrange`) keeps this runnable on
            # both Python 2 and Python 3.
            for i in range(1, self.steps):
                # Slide both windows: drop the oldest columns and append the
                # newest slice.
                # NOTE(review): the 9 presumably equals the width of one gfs
                # slice -- confirm against the data.
                gfs_x = T.concatenate([gfs_x[:, 9:], gfs[:, i + 2]], axis=1)
                pm25in_x = T.concatenate([pm25in_x[:, 1:], pm25next], axis=1)
                self.layerstatus = self.model.forward(
                    T.concatenate([gfs_x, pm25in_x], axis=1),
                    self.layerstatus)
                self.results = T.concatenate(
                    [self.results, self.layerstatus[-1]], axis=1)
                pm25next = pm25next - self.layerstatus[-1]
        return self.results

    def create_predict_function(self):
        """Compile ``self.pred_fun`` taking ``gfs`` and ``pm25in``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in],
            outputs=self.predictions,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in):
        """Run the compiled prediction function on the given inputs."""
        return self.pred_fun(gfs, pm25in)
def __init__(self, input_parts, layer_sizes, output_size, window_size=0, dropout=0, mode="drop", unroll_batch_num=None):
    """
    Configure the stacked LSTM cells plus a linear output layer.

    Parameters:
        input_parts: A list of InputParts (each exposes PART_WIDTH)
        layer_sizes: A list whose items are either (indep, per_note) tuples
            or plain ``indep`` numbers. indep counts the non-shifted cells;
            per_note counts the cells per window note, which shift as the
            network moves.
        output_size: An integer, the width of the desired output
        window_size: Number by which per_note is multiplied for each layer
        dropout: How much dropout to apply.
        mode: Either "drop" or "roll". If drop, discard memory that goes out
            of range. If roll, roll it instead
        unroll_batch_num: Kept on the instance as given
    """
    def as_pair(size):
        # Plain numbers mean "no per-note cells".
        return size if isinstance(size, tuple) else (size, 0)

    self.input_parts = input_parts
    self.window_size = window_size

    pairs = list(map(as_pair, layer_sizes))
    self.layer_sizes = pairs
    self.tot_layer_sizes = [
        indep + per_note * self.window_size
        for indep, per_note in pairs
    ]

    self.output_size = output_size
    self.dropout = dropout

    # Input width is the combined width of all input parts.
    self.input_size = sum(part.PART_WIDTH for part in input_parts)

    self.cells = StackedCells(
        self.input_size,
        celltype=LSTM,
        activation=T.tanh,
        layers=self.tot_layer_sizes,
    )
    # Identity activation on the last layer.
    self.cells.layers.append(
        Layer(
            self.tot_layer_sizes[-1],
            self.output_size,
            activation=lambda x: x,
        )
    )

    assert mode in ("drop", "roll"), "Must specify either drop or roll mode"
    self.mode = mode

    self.unroll_batch_num = unroll_batch_num
def __init__(self, hidden_size, input_size, n_components, stack_size=1, celltype=LSTM):
    """Build an embedding -> stacked cells -> linear-output model and compile it.

    NOTE(review): the body reads ``vocab_size``, which is not a parameter of
    this constructor, and never uses ``n_components``. Unless ``vocab_size``
    is defined in an enclosing scope this raises NameError -- confirm which
    name was intended.

    Args:
        hidden_size: width of every stacked recurrent layer.
        input_size: width of the embedding output / first cell input.
        n_components: currently unused by the body.
        stack_size: how many recurrent layers to stack (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``LSTM``).
    """
    # Recurrent core.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Embedding in front of the stack.
    embedding = Embedding(vocab_size, input_size)
    self.model.layers.insert(0, embedding)

    # Output layer with the `linear` activation behind it.
    output_layer = Layer(hidden_size, vocab_size, activation=linear)
    self.model.layers.append(output_layer)

    # Symbolic inputs. `input_mat` is an integer matrix of indices: per the
    # original note, each row is a sentence and each column a timestep.
    self._stop_word = theano.shared(np.int32(999999999), name="stop word")
    self.for_how_long = T.ivector()
    self.input_mat = T.imatrix()
    self.priming_word = T.iscalar()
    seed = np.random.randint(0, 1024)
    self.srng = T.shared_randomstreams.RandomStreams(seed)

    # Symbolic prediction graphs: default and greedy variants.
    self.predictions = self.create_prediction()
    self.greedy_predictions = self.create_prediction(greedy=True)

    # Cost, training and prediction functions.
    self.create_cost_fun()
    self.create_training_function()
    self.create_predict_function()
def __init__(self, hidden_size, input_size, output_size, stack_size, celltype=RNN, steps=40):
    """Build the stacked-cell predictor and compile its prediction function.

    Args:
        hidden_size: width of every stacked layer.
        input_size: width of the input fed to the first layer.
        output_size: width of the final layer.
        stack_size: number of stacked layers.
        celltype: cell class passed to ``StackedCells`` (default ``RNN``).
        steps: value stored on ``self.steps`` (default 40).
    """
    # Recurrent core.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Output layer whose activation is the identity.
    output_layer = Layer(hidden_size, output_size, activation=lambda x: x)
    self.model.layers.append(output_layer)

    self.steps = steps

    # Symbolic 3-D inputs.
    self.gfs = T.tensor3('gfs')        # GFS input data
    self.pm25in = T.tensor3('pm25in')  # initial PM2.5 data
    self.layerstatus = None
    self.results = None

    # Symbolic prediction: one complete pass over the whole sequence; the
    # result is kept as `predictions`.
    self.predictions = self.create_prediction()
    self.create_predict_function()
def __init__(self, hidden_size, input_size, vocab_size, entropy_reg = 0.001, key_entropy_reg = 0.001, stack_size=1, celltype=LSTM):
    """Build the LSTM language model plus the turing model and compile them.

    Args:
        hidden_size: width of every stacked recurrent layer.
        input_size: width of the embedding output / first cell input.
        vocab_size: number of embedding rows and softmax outputs.
        entropy_reg: stored on ``self.entropy_reg`` (default 0.001).
        key_entropy_reg: stored on ``self.key_entropy_reg`` (default 0.001).
        stack_size: how many recurrent layers to stack (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``LSTM``).
    """
    # Core recurrent layers.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Embedding in front, softmax classifier behind.
    embedding = Embedding(vocab_size, input_size)
    self.model.layers.insert(0, embedding)
    classifier = Layer(hidden_size, vocab_size, activation=softmax)
    self.model.layers.append(classifier)

    self.entropy_reg = entropy_reg
    self.key_entropy_reg = key_entropy_reg

    # Initialise the turing machine model; `build` returns its update and
    # predict callables.
    self.turing_params = Parameters()
    built = turing_model.build(self.turing_params, hidden_size, vocab_size)
    self.turing_updates, self.turing_predict = built
    self.hidden_size = hidden_size

    # Symbolic inputs. The integer matrices hold indices: per the original
    # note, each row is a sentence and each column a timestep.
    self._stop_word = theano.shared(np.int32(999999999), name="stop word")
    self.for_how_long = T.ivector()
    self.mask_matrix = T.imatrix()
    self.input_mat = T.imatrix()
    self.priming_word = T.iscalar()
    self.srng = T.shared_randomstreams.RandomStreams(
        np.random.randint(0, 1024))

    # Symbolic predictions (changed by darong; open question from the
    # original author: what is "greedy"?).
    self.lstm_predictions = self.create_lstm_prediction()
    final = self.create_final_prediction()
    self.final_predictions, self.entropy, self.key_entropy = final

    # Greedy-search variant of the LSTM prediction.
    self.greedy_predictions = self.create_lstm_prediction(greedy=True)

    # Cost functions (two: lstm and final).
    self.create_cost_fun()

    # Learning rates are assigned before the training functions are built.
    self.lstm_lr = 0.01
    self.turing_lr = 0.01
    self.all_lr = 0.01
    self.create_training_function()  # three functions: lstm, turing, all
    self.create_predict_function()   # two predictions: lstm, final

    # Perplexity graphs and their compiled function.
    self.lstm_ppl = self.create_lstm_ppl()
    self.final_ppl = self.create_final_ppl()
    self.create_ppl_function()
def __init__(self, hidden_size, input_size, output_size, stack_size=1, celltype=RNN):
    """Build the stacked-cell predictor with cost, training and predict graphs.

    Args:
        hidden_size: width of every stacked layer.
        input_size: width of the input fed to the first layer.
        output_size: width of the final tanh layer.
        stack_size: number of stacked layers (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``RNN``).
    """
    # Recurrent core.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Output layer with tanh activation.
    output_layer = Layer(hidden_size, output_size, activation=T.tanh)
    self.model.layers.append(output_layer)

    # Symbolic inputs: an integer scalar step count and three matrices.
    self.steps = T.iscalar()
    self.gfs = T.matrix()         # GFS input data
    self.pm25in = T.matrix()      # initial PM2.5 data
    self.pm25target = T.matrix()  # target output
    seed = np.random.randint(0, 1024)
    self.srng = T.shared_randomstreams.RandomStreams(seed)

    # Symbolic prediction: one complete pass over the whole sequence; the
    # result is kept as `predictions`.
    self.predictions = self.create_prediction()

    # Cost, training and prediction functions.
    self.create_cost_fun()
    self.create_training_function()
    self.create_predict_function()
    # The steps above only build the symbolic expressions up front.
def __init__(self, hidden_size, input_size, output_size, stack_size=1, celltype=RNN, steps=40):
    """Build the stacked-cell predictor with prediction and validation graphs.

    Args:
        hidden_size: width of every stacked layer.
        input_size: width of the input fed to the first layer.
        output_size: width of the final tanh layer.
        stack_size: number of stacked layers (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``RNN``).
        steps: value stored on ``self.steps`` (default 40).
    """
    # Recurrent core.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Output layer with tanh activation.
    output_layer = Layer(hidden_size, output_size, activation=T.tanh)
    self.model.layers.append(output_layer)

    self.steps = steps

    # Symbolic 3-D inputs.
    self.gfs = T.tensor3('gfs')        # GFS input data
    self.pm25in = T.tensor3('pm25in')  # initial PM2.5 data
    self.layerstatus = None
    self.results = None
    self.cnt = T.tensor3('cnt')

    # Symbolic prediction: one complete pass over the whole sequence; the
    # result is kept as `predictions`.
    self.predictions = self.create_prediction()
    self.create_predict_function()

    # Target output; per the original note, this version changed the
    # target's dimensions.
    self.pm25target = T.matrix('pm25target')
    self.create_valid_error()
    self.create_validate_function()
    # The steps above only build the symbolic expressions up front.
def __init__(self, hidden_size, input_size, output_size, stack_size=1, celltype=Layer, steps=40):
    """Build the stacked model with training, prediction and validation graphs.

    Args:
        hidden_size: width of every stacked layer.
        input_size: width of the input fed to the first layer.
        output_size: width of the final tanh layer.
        stack_size: number of stacked layers (default 1).
        celltype: cell class passed to ``StackedCells`` (default ``Layer``).
        steps: value stored on ``self.steps`` (default 40).
    """
    # Stacked core.
    widths = [hidden_size] * stack_size
    self.model = StackedCells(input_size, celltype=celltype, layers=widths)

    # Output layer with tanh activation.
    output_layer = Layer(hidden_size, output_size, activation=T.tanh)
    self.model.layers.append(output_layer)

    self.steps = steps
    self.stepsin = T.iscalar('stepsin')

    # Symbolic 3-D tensors.
    self.x = T.tensor3('x')            # input data (GFS per original note)
    # Target output; per the original note, this version changed the
    # target's dimensions.
    self.target = T.tensor3('target')
    self.layerstatus = None
    self.results = None

    # Two symbolic prediction graphs over the complete sequence.
    self.predictions = self.create_prediction()
    self.predictions2 = self.create_prediction2()

    # Cost / validation error, then the compiled functions.
    self.create_cost_fun()
    self.create_valid_error()
    self.create_training_function()
    self.create_predict_function()
    self.create_validate_function()
    # The steps above only build the symbolic expressions up front.
def __init__(self, hidden_size, input_size, output_size, celltype=Layer):
    """Build a stacked model with separate regression and classifier heads.

    Args:
        hidden_size: sequence of layer sizes passed straight to
            ``StackedCells``; its last entry is the input width of both heads.
        input_size: width of the input fed to the first layer.
        output_size: indexable pair; ``output_size[0]`` is the regression
            head width and ``output_size[1]`` the classifier head width.
        celltype: cell class passed to ``StackedCells`` (default ``Layer``).
    """
    # Stacked core.
    self.model = StackedCells(input_size, celltype=celltype,
                              layers=hidden_size)

    # Two heads on top of the last hidden layer: tanh regression and
    # softmax classification.
    top_width = hidden_size[-1]
    self.regression = Layer(top_width, output_size[0], activation=T.tanh)
    self.classifier = Layer(top_width, output_size[1], activation=softmax)

    # Symbolic inputs.
    self.steps = T.iscalar('steps')
    self.x = T.tensor3('x')              # input data (GFS per original note)
    # Targets; per the original note, this version changed the target's
    # dimensions.
    self.target0 = T.tensor3('target0')
    self.target1 = T.itensor3('target1')
    self.layerstatus = None
    self.results = None

    # Symbolic predictions for both heads over the complete sequence.
    heads = self.create_prediction()
    self.predictions0, self.predictions1 = heads

    # Only the prediction function is compiled here; the cost, validation
    # error, training and validation builders are not called.
    self.create_predict_function()
    # The steps above only build the symbolic expressions up front.
class Model(object):
    """Two-stage LSTM model: a time-axis network feeding a pitch-axis network.

    The constructor builds both ``StackedCells`` stacks and then compiles the
    training, prediction and "slow walk" Theano functions.
    """

    def __init__(self, t_layer_sizes, p_layer_sizes, dropout=0):
        """Build both networks and compile all functions.

        Args:
            t_layer_sizes: layer sizes for the time-axis ``StackedCells``.
            p_layer_sizes: layer sizes for the pitch-axis ``StackedCells``.
            dropout: dropout setting stored on the instance (defaults to 0).
        """
        self.t_layer_sizes = t_layer_sizes
        self.p_layer_sizes = p_layer_sizes

        # From our architecture definition, size of the notewise input
        self.t_input_size = 80

        # time network maps from notewise input size to various hidden sizes
        self.time_model = StackedCells(self.t_input_size, celltype=LSTM,
                                       layers=t_layer_sizes)
        self.time_model.layers.append(PassthroughLayer())

        # pitch network takes last layer of time model and state of last
        # note, moving upward, and eventually ends with a two-element
        # sigmoid layer
        p_input_size = t_layer_sizes[-1] + 2
        self.pitch_model = StackedCells(p_input_size, celltype=LSTM,
                                        layers=p_layer_sizes)
        self.pitch_model.layers.append(
            Layer(p_layer_sizes[-1], 2, activation=T.nnet.sigmoid))

        self.dropout = dropout

        self.conservativity = T.fscalar()
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        self.setup_train()
        self.setup_predict()
        self.setup_slow_walk()

    @property
    def params(self):
        """Time-model parameters followed by pitch-model parameters."""
        return self.time_model.params + self.pitch_model.params

    @params.setter
    def params(self, param_list):
        # The first len(time_model.params) entries belong to the time model,
        # the remainder to the pitch model.
        ntimeparams = len(self.time_model.params)
        self.time_model.params = param_list[:ntimeparams]
        self.pitch_model.params = param_list[ntimeparams:]

    @property
    def learned_config(self):
        """``[time params, pitch params, initial hidden states]``.

        The third entry lists ``initial_hidden_state`` of every layer (time
        model first, then pitch model) for which ``has_hidden`` is true.
        """
        return [self.time_model.params,
                self.pitch_model.params,
                [l.initial_hidden_state
                 for mod in (self.time_model, self.pitch_model)
                 for l in mod.layers if has_hidden(l)]]

    @learned_config.setter
    def learned_config(self, learned_list):
        self.time_model.params = learned_list[0]
        self.pitch_model.params = learned_list[1]
        hidden_layers = (l
                         for mod in (self.time_model, self.pitch_model)
                         for l in mod.layers if has_hidden(l))
        # Copy values into the existing shared variables rather than
        # replacing the variables themselves.
        for l, val in zip(hidden_layers, learned_list[2]):
            l.initial_hidden_state.set_value(val.get_value())

    def _inference_dropout_masks(self, layers):
        """Return the deterministic dropout argument used outside training.

        With dropout enabled, every layer gets the factor ``1 - dropout``
        except the first, which gets ``None``; otherwise an empty list.
        """
        if self.dropout > 0:
            masks = [1 - self.dropout for _ in layers]
            masks[0] = None
            return masks
        return []

    def setup_train(self):
        """Build the training graph, the cost, and the update functions."""
        # dimensions: (batch, time, notes, input_data) with input_data as in
        # architecture
        self.input_mat = T.btensor4()
        # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
        self.output_mat = T.btensor4()

        self.epsilon = np.spacing(np.float32(1.0))

        def step_time(in_data, *other):
            # `other` is the previous hiddens, followed (when dropout is on)
            # by one mask per time layer size.
            other = list(other)
            split = -len(self.t_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = ([None] + other[split:]) if self.dropout else []
            new_states = self.time_model.forward(
                in_data, prev_hiddens=hiddens, dropout=masks)
            return new_states

        def step_note(in_data, *other):
            # Same layout as step_time, for the pitch model.
            other = list(other)
            split = -len(self.p_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = ([None] + other[split:]) if self.dropout else []
            new_states = self.pitch_model.forward(
                in_data, prev_hiddens=hiddens, dropout=masks)
            return new_states

        # We generate an output for each input, so it doesn't make sense to
        # use the last output as an input.
        # Note that we assume the sentinel start value is already present
        # TEMP CHANGE: NO SENTINEL
        input_slice = self.input_mat[:, 0:-1]
        n_batch, n_time, n_note, n_ipn = input_slice.shape

        # time_inputs is a matrix (time, batch/note, input_per_note)
        time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape(
            (n_time, n_batch * n_note, n_ipn))
        num_time_parallel = time_inputs.shape[1]

        # apply dropout
        if self.dropout > 0:
            time_masks = theano_lstm.MultiDropout(
                [(num_time_parallel, shape) for shape in self.t_layer_sizes],
                self.dropout)
        else:
            time_masks = []

        time_outputs_info = [initial_state_with_taps(layer, num_time_parallel)
                             for layer in self.time_model.layers]
        time_result, _ = theano.scan(fn=step_time,
                                     sequences=[time_inputs],
                                     non_sequences=time_masks,
                                     outputs_info=time_outputs_info)

        self.time_thoughts = time_result

        # Now time_result is a list of matrix
        # [layer](time, batch/note, hidden_states) for each layer but we only
        # care about the hidden state of the last layer.
        # Transpose to be (note, batch/time, hidden_states)
        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]
        time_final = (last_layer
                      .reshape((n_time, n_batch, n_note, n_hidden))
                      .transpose((2, 1, 0, 3))
                      .reshape((n_note, n_batch * n_time, n_hidden)))

        # note_choices_inputs represents the last chosen note. Starts with
        # [0,0], doesn't include last note.
        # In (note, batch/time, 2) format
        # Shape of start is thus (1, N, 2), concatenated with all but last
        # element of output_mat transformed to (x, N, 2)
        start_note_values = T.alloc(np.array(0, dtype=np.int8),
                                    1, time_final.shape[1], 2)
        correct_choices = (self.output_mat[:, 1:, 0:-1, :]
                           .transpose((2, 0, 1, 3))
                           .reshape((n_note - 1, n_batch * n_time, 2)))
        note_choices_inputs = T.concatenate(
            [start_note_values, correct_choices], axis=0)

        # Together, this and the output from the last LSTM goes to the new
        # LSTM, but rotated, so that the batches in one direction are the
        # steps in the other, and vice versa.
        note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
        num_timebatch = note_inputs.shape[1]

        # apply dropout
        if self.dropout > 0:
            pitch_masks = theano_lstm.MultiDropout(
                [(num_timebatch, shape) for shape in self.p_layer_sizes],
                self.dropout)
        else:
            pitch_masks = []

        note_outputs_info = [initial_state_with_taps(layer, num_timebatch)
                             for layer in self.pitch_model.layers]
        note_result, _ = theano.scan(fn=step_note,
                                     sequences=[note_inputs],
                                     non_sequences=pitch_masks,
                                     outputs_info=note_outputs_info)

        self.note_thoughts = note_result

        # Now note_result is a list of matrix
        # [layer](note, batch/time, onOrArticProb) for each layer but we only
        # care about the hidden state of the last layer.
        # Transpose to be (batch, time, note, onOrArticProb)
        note_final = (get_last_layer(note_result)
                      .reshape((n_note, n_batch, n_time, 2))
                      .transpose(1, 2, 0, 3))

        # The cost of the entire procedure is the negative log likelihood of
        # the events all happening. For the purposes of training, if the
        # outputted probability is P, then the likelihood of seeing a 1 is P,
        # and the likelihood of seeing 0 is (1-P). So the likelihood is
        # (1-P)(1-x) + Px = 2Px - P - x + 1
        # Since they are all binary decisions, and are all probabilities
        # given all previous decisions, we can just multiply the likelihoods,
        # or, since we are logging them, add the logs.

        # Note that we mask out the articulations for those notes that aren't
        # played, because it doesn't matter whether or not those are
        # articulated.
        # The padright is there because self.output_mat[:,:,:,0] is a 3D
        # tensor with (b,x,y), but we need a 4D tensor with (b,x,y,1) instead
        active_notes = T.shape_padright(self.output_mat[:, 1:, :, 0])
        mask = T.concatenate([T.ones_like(active_notes), active_notes],
                             axis=3)

        # epsilon keeps the argument of the log strictly positive.
        loglikelihoods = mask * T.log(
            2 * note_final * self.output_mat[:, 1:]
            - note_final - self.output_mat[:, 1:] + 1 + self.epsilon)
        self.cost = T.neg(T.sum(loglikelihoods))

        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

        self.update_thought_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=(ensure_list(self.time_thoughts)
                     + ensure_list(self.note_thoughts)
                     + [self.cost]),
            allow_input_downcast=True)

    def _predict_step_note(self, in_data_from_time, *states):
        """Scan step over notes used while predicting.

        Args:
            in_data_from_time: the time model's output for the current note.
            *states: the pitch model's previous hiddens followed by the
                previous note's chosen (play, articulate) pair.

        Returns:
            The new pitch-model states followed by the newly chosen pair.
        """
        # States is [ *hiddens, last_note_choice ]
        hiddens = list(states[:-1])
        in_data_from_prev = states[-1]
        in_data = T.concatenate([in_data_from_time, in_data_from_prev])

        # correct for dropout
        masks = self._inference_dropout_masks(self.pitch_model.layers)

        new_states = self.pitch_model.forward(
            in_data, prev_hiddens=hiddens, dropout=masks)

        # Now new_states is a per-layer set of activations.
        probabilities = get_last_layer(new_states)

        # Thus, probabilities is a vector of two probabilities, P(play), and
        # P(artic | play)
        shouldPlay = T.cast(
            self.srng.uniform() < (probabilities[0] ** self.conservativity),
            'int16')
        # Articulation can only be chosen when the note is played.
        shouldArtic = T.cast(
            shouldPlay * (self.srng.uniform() < probabilities[1]),
            'int16')

        chosen = T.stack(shouldPlay, shouldArtic)

        return ensure_list(new_states) + [chosen]

    def setup_predict(self):
        """Build the prediction graph and compile the predict functions."""
        # In prediction mode, note steps are contained in the time steps. So
        # the passing gets a little bit hairy.

        self.predict_seed = T.bmatrix()
        self.steps_to_simulate = T.iscalar()

        def step_time(*states):
            # States is [ *hiddens, prev_result, time]
            hiddens = list(states[:-2])
            in_data = states[-2]
            time_index = states[-1]

            # correct for dropout
            masks = self._inference_dropout_masks(self.time_model.layers)

            new_states = self.time_model.forward(
                in_data, prev_hiddens=hiddens, dropout=masks)

            # Now new_states is a list of matrix [layer](notes, hidden_states)
            # for each layer
            time_final = get_last_layer(new_states)

            start_note_values = theano.tensor.alloc(
                np.array(0, dtype=np.int8), 2)

            # This gets a little bit complicated. In the training case, we
            # can pass in a combination of the time net's activations with
            # the known choices. But in the prediction case, those choices
            # don't exist yet. So instead of iterating over the combination,
            # we iterate over only the activations, and then combine in the
            # previous outputs in the step. And then since we are passing
            # outputs to previous inputs, we need an additional outputs_info
            # for the initial "previous" output of zero.
            # The tap is an integer offset (the original spelled it -1.0
            # here); this matches the identical recurrence in
            # setup_slow_walk.
            note_outputs_info = (
                [initial_state_with_taps(layer)
                 for layer in self.pitch_model.layers]
                + [dict(initial=start_note_values, taps=[-1])])

            notes_result, updates = theano.scan(
                fn=self._predict_step_note,
                sequences=[time_final],
                outputs_info=note_outputs_info)

            # Now notes_result is a list of matrix
            # [layer/output](notes, onOrArtic)
            output = get_last_layer(notes_result)

            # TODO: Fix time
            next_input = OutputFormToInputFormOp()(output, time_index + 1)

            return (ensure_list(new_states)
                    + [next_input, time_index + 1, output]), updates

        num_notes = self.predict_seed.shape[0]

        # Recurrent outputs: time-model states, the next input (seeded with
        # predict_seed), the time counter (starting at 0); the final `None`
        # marks the per-step output as non-recurrent.
        time_outputs_info = (
            [initial_state_with_taps(layer, num_notes)
             for layer in self.time_model.layers]
            + [dict(initial=self.predict_seed, taps=[-1]),
               dict(initial=0, taps=[-1]),
               None])

        time_result, updates = theano.scan(
            fn=step_time,
            outputs_info=time_outputs_info,
            n_steps=self.steps_to_simulate)

        self.predict_thoughts = time_result

        self.predicted_output = time_result[-1]

        self.predict_fun = theano.function(
            inputs=[self.steps_to_simulate, self.conservativity,
                    self.predict_seed],
            outputs=self.predicted_output,
            updates=updates,
            allow_input_downcast=True)

        self.predict_thought_fun = theano.function(
            inputs=[self.steps_to_simulate, self.conservativity,
                    self.predict_seed],
            outputs=ensure_list(self.predict_thoughts),
            updates=updates,
            allow_input_downcast=True)

    def setup_slow_walk(self):
        """Compile a single-time-step function whose state lives in shared variables."""
        # Placeholder values; start_slow_walk overwrites them with real data.
        self.walk_input = theano.shared(np.ones((2, 2), dtype='int8'))
        self.walk_time = theano.shared(np.array(0, dtype='int64'))
        self.walk_hiddens = [
            theano.shared(np.ones((2, 2), dtype=theano.config.floatX))
            for layer in self.time_model.layers if has_hidden(layer)]

        # correct for dropout
        masks = self._inference_dropout_masks(self.time_model.layers)

        new_states = self.time_model.forward(
            self.walk_input, prev_hiddens=self.walk_hiddens, dropout=masks)

        # Now new_states is a list of matrix [layer](notes, hidden_states)
        # for each layer
        time_final = get_last_layer(new_states)

        start_note_values = theano.tensor.alloc(np.array(0, dtype=np.int8), 2)
        note_outputs_info = (
            [initial_state_with_taps(layer)
             for layer in self.pitch_model.layers]
            + [dict(initial=start_note_values, taps=[-1])])

        notes_result, updates = theano.scan(
            fn=self._predict_step_note,
            sequences=[time_final],
            outputs_info=note_outputs_info)

        # Now notes_result is a list of matrix
        # [layer/output](notes, onOrArtic)
        output = get_last_layer(notes_result)

        # TODO: Fix time
        next_input = OutputFormToInputFormOp()(output, self.walk_time + 1)

        slow_walk_results = (new_states[:-1] + notes_result[:-1]
                             + [next_input, output])

        updates.update({
            self.walk_time: self.walk_time + 1,
            self.walk_input: next_input,
        })

        # NOTE(review): walk_hiddens only has entries for layers with hidden
        # state, while new_states/layers cover every layer; the positional
        # zip therefore assumes all hidden-state layers come first -- confirm.
        updates.update({
            hidden: newstate
            for hidden, newstate, layer in zip(self.walk_hiddens,
                                               new_states,
                                               self.time_model.layers)
            if has_hidden(layer)})

        self.slow_walk_fun = theano.function(
            inputs=[self.conservativity],
            outputs=slow_walk_results,
            updates=updates,
            allow_input_downcast=True)

    def start_slow_walk(self, seed):
        """Reset the slow-walk state from ``seed``.

        Args:
            seed: array-like whose first dimension is the number of notes.
        """
        seed = np.array(seed)
        num_notes = seed.shape[0]

        self.walk_time.set_value(0)
        self.walk_input.set_value(seed)
        hidden_layers = (l for l in self.time_model.layers if has_hidden(l))
        for layer, hidden in zip(hidden_layers, self.walk_hiddens):
            # One copy of the layer's initial hidden state per note.
            initial_row = np.reshape(
                layer.initial_hidden_state.get_value(), (1, -1))
            hidden.set_value(np.repeat(initial_row, num_notes, axis=0))
class RelativeShiftLSTMStack(object):
    """
    Manages a stack of LSTM cells with potentially a relative shift applied.

    Each LSTM layer's memory is split into an "independent" part and a
    "per-note" part. The per-note part is laid out as (window_size, per_note)
    and is slid (or rolled) along the window axis whenever the relative
    position changes between steps.
    """

    def __init__(self, input_parts, layer_sizes, output_size, window_size=0,
                 dropout=0, mode="drop", unroll_batch_num=None):
        """
        Parameters:
            input_parts: A list of InputParts
            layer_sizes: A list of the form [ (indep, per_note), ... ] where
                indep is the number of non-shifted cells to have, and
                per_note is the number of cells to have per window note,
                which shift as the network moves.
                Alternately can just be [ indep, ... ]
            output_size: An integer, the width of the desired output
            window_size: Number of window notes the per-note cells span
            dropout: How much dropout to apply.
            mode: Either "drop" or "roll". If drop, discard memory that goes
                out of range. If roll, roll it instead
            unroll_batch_num: If not None, the shift is unrolled in Python
                over this many batch entries instead of using theano.map
        """
        self.input_parts = input_parts
        self.window_size = window_size

        # Normalize plain ints to (indep, 0) so every entry is a pair.
        layer_sizes = [x if isinstance(x, tuple) else (x, 0)
                       for x in layer_sizes]
        self.layer_sizes = layer_sizes
        self.tot_layer_sizes = [(indep + per_note * self.window_size)
                                for indep, per_note in layer_sizes]

        self.output_size = output_size
        self.dropout = dropout
        self.input_size = sum(part.PART_WIDTH for part in input_parts)

        self.cells = StackedCells(self.input_size, celltype=LSTM,
                                  activation=T.tanh,
                                  layers=self.tot_layer_sizes)
        # Linear read-out layer on top of the LSTM stack.
        self.cells.layers.append(
            Layer(self.tot_layer_sizes[-1], self.output_size,
                  activation=lambda x: x))

        assert mode in ("drop", "roll"), "Must specify either drop or roll mode"
        self.mode = mode
        self.unroll_batch_num = unroll_batch_num

    @property
    def params(self):
        """Cell parameters followed by each hidden layer's initial state."""
        return self.cells.params + list(
            l.initial_hidden_state for l in self.cells.layers
            if has_hidden(l))

    @params.setter
    def params(self, paramlist):
        n_cell_params = len(self.cells.params)
        self.cells.params = paramlist[:n_cell_params]
        hidden_layers = (l for l in self.cells.layers if has_hidden(l))
        # NOTE(review): the slice start is re-evaluated from
        # len(self.cells.params) *after* assignment in the original code;
        # kept identical here by re-reading the length.
        for l, val in zip(hidden_layers,
                          paramlist[len(self.cells.params):]):
            l.initial_hidden_state.set_value(val.get_value())

    def perform_step(self, in_data, shifts, hiddens, dropout_masks=[]):
        """
        Perform a step through the LSTM network.

        in_data: A theano tensor (float32) of shape (batch, input_size)
        shifts: A theano tensor (int32) of shape (batch), giving the relative
            shifts to apply to the last hiddens
        hiddens: A list of hiddens [layer](batch, hidden_idx)
        dropout_masks: If [], apply dropout deterministically. Otherwise,
            should be a set of masks returned by get_dropout_masks, generally
            passed through a scan as a non-sequence. (The default list is
            never mutated.)

        Returns the list of new states produced by the cell stack.
        """
        # hiddens is of shape [layer](batch, hidden_idx)
        # We want to permute the hidden_idx values according to shifts,
        # which are ints of shape (batch)
        n_batch = in_data.shape[0]
        new_hiddens = []
        for layer_i, (indep, per_note) in enumerate(self.layer_sizes):
            if per_note == 0:
                # Don't bother with this layer
                new_hiddens.append(hiddens[layer_i])
                continue

            # The theano_lstm code puts [memory_cells... , old_activations...]
            # We want to slide the memory cells only.
            lstm_hsplit = self.cells.layers[layer_i].hidden_size
            indep_mem = hiddens[layer_i][:, :indep]
            per_note_mem = hiddens[layer_i][:, indep:lstm_hsplit]
            remaining_values = hiddens[layer_i][:, lstm_hsplit:]

            # per_note_mem is (batch, per_note_mem)
            separated_mem = per_note_mem.reshape(
                (n_batch, self.window_size, per_note))

            # separated_mem is (batch, note, mem)
            # [a b c ... x y z] shifted up 1   (+1) goes to [b c ... x y z 0]
            # [a b c ... x y z] shifted down 1 (-1) goes to [0 a b c ... x y]
            def _shift_step(c_mem, c_shift):
                # c_mem is (note, mem)
                # c_shift is an int
                if self.mode == "drop":
                    def _clamp_w(x):
                        return T.maximum(0, T.minimum(x, self.window_size))

                    ins_at_front = T.zeros((_clamp_w(-c_shift), per_note))
                    ins_at_back = T.zeros((_clamp_w(c_shift), per_note))
                    take_part = c_mem[_clamp_w(c_shift):
                                      self.window_size - _clamp_w(-c_shift), :]
                    return T.concatenate(
                        [ins_at_front, take_part, ins_at_back], 0)
                elif self.mode == "roll":
                    # NOTE(review): the modulus is hard-coded to 12 rather
                    # than self.window_size -- presumably one octave; confirm.
                    return T.roll(c_mem, (-c_shift) % 12, axis=0)

            if self.unroll_batch_num is None:
                shifted_mem, _ = theano.map(_shift_step,
                                            [separated_mem, shifts])
            else:
                shifted_mem = T.stack([
                    _shift_step(separated_mem[i], shifts[i])
                    for i in range(self.unroll_batch_num)
                ])

            new_per_note_mem = shifted_mem.reshape(
                (n_batch, self.window_size * per_note))
            new_layer_hiddens = T.concatenate(
                [indep_mem, new_per_note_mem, remaining_values], 1)
            new_hiddens.append(new_layer_hiddens)

        if dropout_masks == [] or not self.dropout:
            masks = []
        else:
            # NOTE(review): the scan callers in this class already prepend
            # [None] before calling perform_step, so this may prepend a
            # second None -- confirm which side should own the prefix.
            masks = [None] + dropout_masks

        new_states = self.cells.forward(in_data, prev_hiddens=new_hiddens,
                                        dropout=masks)
        return new_states

    def do_preprocess_scan(self, deterministic_dropout=False, **kwargs):
        """
        Run a scan using this LSTM, preprocessing all inputs before the scan.

        Parameters:
            kwargs[k]: should be a theano tensor of shape
                (n_batch, n_time, ... )
                Note that "relative_position" should be a keyword argument
                given here if there are relative shifts.
            deterministic_dropout: If True, apply dropout deterministically,
                scaling everything. If false, sample dropout

        Returns:
            A theano tensor of shape (n_batch, n_time, output_size) of
            activations
        """
        assert len(kwargs) > 0, "Need at least one input argument!"
        n_batch, n_time = list(kwargs.values())[0].shape[:2]

        # Merge the batch and time axes so the input parts see a flat batch.
        squashed_kwargs = {
            k: v.reshape([n_batch * n_time] + [x for x in v.shape[2:]])
            for k, v in kwargs.items()
        }

        full_input = T.concatenate(
            [part.generate(**squashed_kwargs) for part in self.input_parts],
            1)
        # Back to (n_batch, n_time, input_size), then time-major for scan.
        adjusted_input = full_input.reshape(
            [n_batch, n_time, self.input_size]).dimshuffle((1, 0, 2))

        if "relative_position" in kwargs:
            relative_position = kwargs["relative_position"]
            diff_shifts = T.extra_ops.diff(relative_position, axis=1)
            # First timestep has no predecessor, so its shift is zero.
            cat_shifts = T.concatenate(
                [T.zeros((n_batch, 1), 'int32'), diff_shifts], 1)
            shifts = cat_shifts.dimshuffle((1, 0))
        else:
            # Shape must be passed as a single tuple; passing the dims as
            # separate positional arguments is not a valid T.zeros call.
            shifts = T.zeros((n_time, n_batch), 'int32')

        def _scan_fn(in_data, shifts, *other):
            other = list(other)
            if self.dropout and not deterministic_dropout:
                # The trailing non-sequences are the per-layer dropout masks.
                split = -len(self.tot_layer_sizes)
                hiddens = other[:split]
                masks = [None] + other[split:]
            else:
                masks = []
                hiddens = other
            return self.perform_step(in_data, shifts, hiddens,
                                     dropout_masks=masks)

        if self.dropout and not deterministic_dropout:
            dropout_masks = UpscaleMultiDropout(
                [(n_batch, shape) for shape in self.tot_layer_sizes],
                self.dropout)
        else:
            dropout_masks = []

        outputs_info = [
            initial_state_with_taps(layer, n_batch)
            for layer in self.cells.layers
        ]
        result, _ = theano.scan(fn=_scan_fn,
                                sequences=[adjusted_input, shifts],
                                non_sequences=dropout_masks,
                                outputs_info=outputs_info)

        final_out = get_last_layer(result).transpose((1, 0, 2))
        return final_out

    def prepare_sample_scan(self, start_pos, start_out,
                            deterministic_dropout=False, **kwargs):
        """
        Prepare a sample scan

        Parameters:
            kwargs[k]: should be a theano tensor of shape
                (n_batch, n_time, ... )
                Note that "relative_position" should be a keyword argument
                given here if there are relative shifts, as should "timestep"
            start_pos: a theano tensor of shape (n_batch) giving the initial
                position passed to the out_to_in function
            start_out: a theano tensor of shape (n_batch, X) giving the
                initial "output" passed to the out_to_in_fn
            deterministic_dropout: If True, apply dropout deterministically,
                scaling everything. If false, sample dropout

        Returns:
            A namedtuple, where
                sequences: a list of sequences to input into scan
                non_sequences: a list of non_sequences into scan
                outputs_info: a list of outputs_info for scan
                num_taps: the number of outputs with taps for this
                (other values): for internal use
        """
        assert len(kwargs) > 0, "Need at least one input argument!"
        n_batch, n_time = list(kwargs.values())[0].shape[:2]

        # Swap the first two axes of every input so scan iterates over time.
        transp_kwargs = {
            k: v.dimshuffle((1, 0) + tuple(range(2, v.ndim)))
            for k, v in kwargs.items()
        }

        if self.dropout and not deterministic_dropout:
            dropout_masks = UpscaleMultiDropout(
                [(n_batch, shape) for shape in self.tot_layer_sizes],
                self.dropout)
        else:
            dropout_masks = []

        # Output order: position, sampled output, then one entry per layer.
        outputs_info = [{
            "initial": start_pos,
            "taps": [-1]
        }, {
            "initial": start_out,
            "taps": [-1]
        }] + [
            initial_state_with_taps(layer, n_batch)
            for layer in self.cells.layers
        ]

        sequences = list(transp_kwargs.values())
        non_sequences = dropout_masks
        num_taps = sum(1 for x in outputs_info if x is not None)

        return SampleScanSpec(sequences=sequences,
                              non_sequences=non_sequences,
                              outputs_info=outputs_info,
                              num_taps=num_taps,
                              kwargs_keys=list(transp_kwargs.keys()),
                              deterministic_dropout=deterministic_dropout,
                              start_pos=start_pos)

    def sample_scan_routine(self, spec, *inputs):
        """
        Start a scan routine. This is implemented as a generator, since we
        may need to interrupt the state in the middle of iteration.

        How to use:

        scan_rout = x.sample_scan_routine(spec, *inputs)
            - spec: The SampleScanSpec returned by prepare_sample_scan
            - *inputs: The scan inputs, in
              [ sequences..., taps..., non_sequences... ] order

        last_rel_pos, last_out, cur_kwargs = scan_rout.send(None)
            - last_rel_pos is a theano tensor of shape (n_batch)
            - last_out will be a theano tensor of shape (n_batch, output_size)
            - cur_kwargs[k] is a theano tensor of shape (n_batch, ...),
              from kwargs

        out_activations = scan_rout.send((new_pos, addtl_kwargs))
            - new_pos is a theano tensor of shape (n_batch), giving the new
              relative position
            - addtl_kwargs[k] is a theano tensor of shape (n_batch, ...) to
              be added to cur kwargs
              Note that "relative_position" will be added automatically.

        scan_outputs = scan_rout.send(new_out)
            - new_out is a tensor of shape (n_batch, X) to be output

        scan_rout.close()

        -> scan_outputs should be returned back to scan
        """
        stuff = list(inputs)
        n_seq = len(spec.kwargs_keys)
        kwarg_seq_vals = stuff[:n_seq]
        cur_kwargs = dict(zip(spec.kwargs_keys, kwarg_seq_vals))
        last_pos, last_out = stuff[n_seq:n_seq + 2]
        other = stuff[n_seq + 2:]

        if self.dropout and not spec.deterministic_dropout:
            # The trailing non-sequences are the per-layer dropout masks.
            split = -len(self.tot_layer_sizes)
            hiddens = other[:split]
            masks = [None] + other[split:]
        else:
            masks = []
            hiddens = other

        cur_pos, addtl_kwargs = yield (last_pos, last_out, cur_kwargs)

        all_kwargs = {"relative_position": cur_pos}
        all_kwargs.update(cur_kwargs)
        all_kwargs.update(addtl_kwargs)

        # No shift on the very first timestep; otherwise shift by the change
        # in position since the previous step.
        shift = T.switch(T.eq(all_kwargs["timestep"], 0), 0,
                         cur_pos - last_pos)

        full_input = T.concatenate(
            [part.generate(**all_kwargs) for part in self.input_parts], 1)

        step_stuff = self.perform_step(full_input, shift, hiddens,
                                       dropout_masks=masks)
        raw_output = step_stuff[-1]

        sampled_output = yield (raw_output)

        yield [cur_pos, sampled_output] + step_stuff

    def extract_sample_scan_results(self, spec, outputs):
        """
        Extract outputs from the scan results.

        Parameters:
            outputs: The outputs from the scan associated with this stack,
                in the order produced by sample_scan_routine:
                [positions, sampled_output, layer states..., raw_output]

        Returns:
            positions, raw_output, sampled_output
        """
        # Prepend the start position and drop the final one so that entry t
        # is the position that was in effect when step t was computed.
        positions = T.concatenate([
            T.shape_padright(spec.start_pos),
            outputs[0].transpose((1, 0))[:, :-1]
        ], 1)
        # Index 1 is the sampled output; index 2 would be the first LSTM
        # layer's state, per the output order documented above.
        sampled_output = outputs[1].transpose((1, 0, 2))
        raw_output = outputs[-1].transpose((1, 0, 2))
        return positions, raw_output, sampled_output

    def do_sample_scan(self, start_pos, start_out, sample_fn, out_to_in_fn,
                       deterministic_dropout=True, **kwargs):
        """
        Run a scan using this LSTM, sampling and processing as we go.

        Currently always raises NotImplementedError; the code after the
        raise is an unreachable sketch.

        Parameters:
            kwargs[k]: should be a theano tensor of shape
                (n_batch, n_time, ... )
                Note that "relative_position" should be a keyword argument
                given here if there are relative shifts.
            start_pos: a theano tensor of shape (n_batch) giving the initial
                position passed to the out_to_in function
            start_out: a theano tensor of shape (n_batch, X) giving the
                initial "output" passed to the out_to_in_fn
            sample_fn: a function with signature
                    sample_fn(out_activations, rel_pos) -> new_out, new_rel_pos
                where
                    - rel_pos is a theano tensor of shape (n_batch)
                    - out_activations is a tensor of shape
                      (n_batch, output_size)
                and
                    - new_out is a tensor of shape (n_batch, X) to be output
                    - new_rel_pos should be a theano tensor of shape (n_batch)
            out_to_in_fn: a function with signature
                    out_to_in_fn(rel_pos, last_out, **cur_kwargs)
                        -> addtl_kwargs
                where
                    - rel_pos is a theano tensor of shape (n_batch)
                    - last_out will be a theano tensor of shape
                      (n_batch, output_size)
                    - cur_kwargs[k] is a theano tensor of shape
                      (n_batch, ...), from kwargs
                and
                    - addtl_kwargs[k] is a theano tensor of shape
                      (n_batch, ...) to be added to cur kwargs
                      Note that "relative_position" will be added
                      automatically.
            deterministic_dropout: If True, apply dropout deterministically,
                scaling everything. If false, sample dropout

        Returns:
            positions, raw_output, sampled_output, updates
        """
        raise NotImplementedError()

        # NOTE(review): everything below is unreachable. It also does not
        # match the current prepare_sample_scan signature or the send()
        # protocol documented on sample_scan_routine; update before enabling.
        spec = self.prepare_sample_scan(start_pos, start_out, sample_fn,
                                        deterministic_dropout, **kwargs)

        def _scan_fn(*stuff):
            scan_rout = self.sample_scan_routine(spec, *stuff)
            rel_pos, last_out, cur_kwargs = scan_rout.send(None)
            addtl_kwargs = out_to_in_fn(rel_pos, last_out, **cur_kwargs)
            out_activations = scan_rout.send(addtl_kwargs)
            sampled_output, new_pos = sample_fn(out_activations, rel_pos)
            scan_outputs = scan_rout.send((sampled_output, new_pos))
            scan_rout.close()
            return scan_outputs

        result, updates = theano.scan(fn=_scan_fn,
                                      sequences=spec.sequences,
                                      non_sequences=spec.non_sequences,
                                      outputs_info=spec.outputs_info)
        positions, raw_output, sampled_output = \
            self.extract_sample_scan_results(spec, result)
        return positions, raw_output, sampled_output, updates
class Model(object):
    """
    Two-stage LSTM model: a "time" stack that runs along the time axis for
    every note, followed by a "pitch" stack that runs along the note axis and
    ends in a two-element sigmoid layer (play, articulate).

    Construction compiles the training, prediction and slow-walk functions.
    """

    def __init__(self, t_layer_sizes, p_layer_sizes, dropout=0):
        """
        Parameters:
            t_layer_sizes: hidden sizes of the time-axis LSTM layers
            p_layer_sizes: hidden sizes of the note-axis LSTM layers
            dropout: dropout probability (0 disables dropout)
        """
        self.t_layer_sizes = t_layer_sizes
        self.p_layer_sizes = p_layer_sizes

        # From our architecture definition, size of the notewise input
        self.t_input_size = 80

        # time network maps from notewise input size to various hidden sizes
        self.time_model = StackedCells(self.t_input_size, celltype=LSTM,
                                       layers=t_layer_sizes)
        self.time_model.layers.append(PassthroughLayer())

        # pitch network takes last layer of time model and state of last
        # note, moving upward and eventually ends with a two-element sigmoid
        # layer
        p_input_size = t_layer_sizes[-1] + 2
        self.pitch_model = StackedCells(p_input_size, celltype=LSTM,
                                        layers=p_layer_sizes)
        self.pitch_model.layers.append(
            Layer(p_layer_sizes[-1], 2, activation=T.nnet.sigmoid))

        self.dropout = dropout

        self.conservativity = T.fscalar()
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("model-setup::Trace-1")
        self.setup_train()
        print("model-setup::Trace-2")
        self.setup_predict()
        print("model-setup::Trace-3")
        self.setup_slow_walk()

    @property
    def params(self):
        """Time-model parameters followed by pitch-model parameters."""
        return self.time_model.params + self.pitch_model.params

    @params.setter
    def params(self, param_list):
        ntimeparams = len(self.time_model.params)
        self.time_model.params = param_list[:ntimeparams]
        self.pitch_model.params = param_list[ntimeparams:]

    @property
    def learned_config(self):
        """[time params, pitch params, initial hidden states of all layers]."""
        return [self.time_model.params,
                self.pitch_model.params,
                [l.initial_hidden_state
                 for mod in (self.time_model, self.pitch_model)
                 for l in mod.layers if has_hidden(l)]]

    @learned_config.setter
    def learned_config(self, learned_list):
        self.time_model.params = learned_list[0]
        self.pitch_model.params = learned_list[1]
        hidden_layers = (l for mod in (self.time_model, self.pitch_model)
                         for l in mod.layers if has_hidden(l))
        for l, val in zip(hidden_layers, learned_list[2]):
            l.initial_hidden_state.set_value(val.get_value())

    def setup_train(self):
        """Build the training graph and compile update_fun / update_thought_fun."""
        # dimensions: (batch, time, notes, input_data) with input_data as in
        # architecture
        self.input_mat = T.btensor4()
        # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
        self.output_mat = T.btensor4()

        self.epsilon = np.spacing(np.float32(1.0))

        print("model-setup-train::Trace-1")

        def step_time(in_data, *other):
            other = list(other)
            # With dropout, the trailing non-sequences are the masks.
            split = -len(self.t_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.time_model.forward(in_data,
                                                 prev_hiddens=hiddens,
                                                 dropout=masks)
            return new_states

        def step_note(in_data, *other):
            other = list(other)
            split = -len(self.p_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.pitch_model.forward(in_data,
                                                  prev_hiddens=hiddens,
                                                  dropout=masks)
            return new_states

        # We generate an output for each input, so it doesn't make sense to
        # use the last output as an input.
        # Note that we assume the sentinel start value is already present
        # TEMP CHANGE: NO SENTINEL
        print("model-setup-train::Trace-2")
        input_slice = self.input_mat[:, 0:-1]
        n_batch, n_time, n_note, n_ipn = input_slice.shape

        # time_inputs is a matrix (time, batch/note, input_per_note)
        time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape(
            (n_time, n_batch * n_note, n_ipn))
        num_time_parallel = time_inputs.shape[1]

        # apply dropout
        if self.dropout > 0:
            time_masks = MultiDropout(
                [(num_time_parallel, shape) for shape in self.t_layer_sizes],
                self.dropout)
        else:
            time_masks = []

        print("model-setup-train::Trace-3")
        time_outputs_info = [
            initial_state_with_taps(layer, num_time_parallel)
            for layer in self.time_model.layers
        ]
        time_result, _ = theano.scan(fn=step_time,
                                     sequences=[time_inputs],
                                     non_sequences=time_masks,
                                     outputs_info=time_outputs_info)

        print("model-setup-train::Trace-4")
        self.time_thoughts = time_result

        # Now time_result is a list of matrix
        # [layer](time, batch/note, hidden_states) for each layer but we only
        # care about the hidden state of the last layer.
        # Transpose to be (note, batch/time, hidden_states)
        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]
        time_final = last_layer.reshape(
            (n_time, n_batch, n_note, n_hidden)).transpose(
                (2, 1, 0, 3)).reshape((n_note, n_batch * n_time, n_hidden))

        # note_choices_inputs represents the last chosen note. Starts with
        # [0,0], doesn't include last note.
        # In (note, batch/time, 2) format
        # Shape of start is thus (1, N, 2), concatenated with all but last
        # element of output_mat transformed to (x, N, 2)
        start_note_values = T.alloc(0, 1, time_final.shape[1], 2)
        correct_choices = self.output_mat[:, 1:, 0:-1, :].transpose(
            (2, 0, 1, 3)).reshape((n_note - 1, n_batch * n_time, 2))
        note_choices_inputs = T.concatenate(
            [start_note_values, correct_choices], axis=0)

        print("model-setup-train::Trace-5")
        # Together, this and the output from the last LSTM goes to the new
        # LSTM, but rotated, so that the batches in one direction are the
        # steps in the other, and vice versa.
        note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)
        num_timebatch = note_inputs.shape[1]

        # apply dropout
        if self.dropout > 0:
            pitch_masks = MultiDropout(
                [(num_timebatch, shape) for shape in self.p_layer_sizes],
                self.dropout)
        else:
            pitch_masks = []

        print("model-setup-train::Trace-6")
        note_outputs_info = [
            initial_state_with_taps(layer, num_timebatch)
            for layer in self.pitch_model.layers
        ]
        note_result, _ = theano.scan(fn=step_note,
                                     sequences=[note_inputs],
                                     non_sequences=pitch_masks,
                                     outputs_info=note_outputs_info)

        self.note_thoughts = note_result

        # Now note_result is a list of matrix
        # [layer](note, batch/time, onOrArticProb) for each layer but we only
        # care about the hidden state of the last layer.
        # Transpose to be (batch, time, note, onOrArticProb)
        note_final = get_last_layer(note_result).reshape(
            (n_note, n_batch, n_time, 2)).transpose(1, 2, 0, 3)

        print("model-setup-train::Trace-7")
        # The cost of the entire procedure is the negative log likelihood of
        # the events all happening.
        # For the purposes of training, if the outputted probability is P,
        # then the likelihood of seeing a 1 is P, and the likelihood of
        # seeing 0 is (1-P). So the likelihood is
        # (1-P)(1-x) + Px = 2Px - P - x + 1
        # Since they are all binary decisions, and are all probabilities
        # given all previous decisions, we can just multiply the likelihoods,
        # or, since we are logging them, add the logs.

        # Note that we mask out the articulations for those notes that aren't
        # played, because it doesn't matter whether or not those are
        # articulated.
        # The padright is there because self.output_mat[:,:,:,0] -> 3D matrix
        # with (b,x,y), but we need a 4D tensor with (b,x,y,1) instead
        active_notes = T.shape_padright(self.output_mat[:, 1:, :, 0])
        mask = T.concatenate([T.ones_like(active_notes), active_notes],
                             axis=3)

        loglikelihoods = mask * T.log(
            2 * note_final * self.output_mat[:, 1:]
            - note_final - self.output_mat[:, 1:] + 1 + self.epsilon)

        print("model-setup-train::Trace-8")
        self.cost = T.neg(T.sum(loglikelihoods))

        print("model-setup-train::Trace-9")
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")

        print("model-setup-train::Trace-10")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

        self.update_thought_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=(ensure_list(self.time_thoughts)
                     + ensure_list(self.note_thoughts) + [self.cost]),
            allow_input_downcast=True)

    def _predict_step_note(self, in_data_from_time, *states):
        """
        One note-axis step during prediction: run the pitch model on the
        time-model activations joined with the previous note's choice, then
        sample (play, articulate) for this note.
        """
        # States is [ *hiddens, last_note_choice ]
        hiddens = list(states[:-1])
        in_data_from_prev = states[-1]
        in_data = T.concatenate([in_data_from_time, in_data_from_prev])

        # correct for dropout
        if self.dropout > 0:
            masks = [1 - self.dropout for layer in self.pitch_model.layers]
            masks[0] = None
        else:
            masks = []

        new_states = self.pitch_model.forward(in_data, prev_hiddens=hiddens,
                                              dropout=masks)

        # Now new_states is a per-layer set of activations.
        probabilities = get_last_layer(new_states)

        # Thus, probabilities is a vector of two probabilities, P(play), and
        # P(artic | play)

        # Sample from a uniform distribution: comparing a normal draw to a
        # probability would not play the note with probability
        # P(play) ** conservativity.
        shouldPlay = self.srng.uniform() < (
            probabilities[0] ** self.conservativity)
        shouldArtic = shouldPlay * (self.srng.uniform() < probabilities[1])

        chosen = T.stack(shouldPlay, shouldArtic)

        return ensure_list(new_states) + [chosen]

    def setup_predict(self):
        """Build the generation graph and compile predict_fun / predict_thought_fun."""
        # In prediction mode, note steps are contained in the time steps. So
        # the passing gets a little bit hairy.
        self.predict_seed = T.bmatrix()
        self.steps_to_simulate = T.iscalar()

        def step_time(*states):
            # States is [ *hiddens, prev_result, time]
            hiddens = list(states[:-2])
            in_data = states[-2]
            time = states[-1]

            # correct for dropout
            if self.dropout > 0:
                masks = [1 - self.dropout
                         for layer in self.time_model.layers]
                masks[0] = None
            else:
                masks = []

            new_states = self.time_model.forward(in_data,
                                                 prev_hiddens=hiddens,
                                                 dropout=masks)

            # Now new_states is a list of matrix
            # [layer](notes, hidden_states) for each layer
            time_final = get_last_layer(new_states)

            start_note_values = theano.tensor.alloc(0, 2)

            # This gets a little bit complicated. In the training case, we
            # can pass in a combination of the time net's activations with
            # the known choices. But in the prediction case, those choices
            # don't exist yet. So instead of iterating over the combination,
            # we iterate over only the activations, and then combine in the
            # previous outputs in the step. And then since we are passing
            # outputs to previous inputs, we need an additional outputs_info
            # for the initial "previous" output of zero.
            note_outputs_info = (
                [initial_state_with_taps(layer)
                 for layer in self.pitch_model.layers]
                + [dict(initial=start_note_values, taps=[-1])])

            notes_result, updates = theano.scan(
                fn=self._predict_step_note,
                sequences=[time_final],
                outputs_info=note_outputs_info)

            # Now notes_result is a list of matrix
            # [layer/output](notes, onOrArtic)
            output = get_last_layer(notes_result)

            next_input = OutputFormToInputFormOp()(output, time + 1)  # TODO: Fix time
            # next_input = T.cast(T.alloc(0, 3, 4),'int64')

            return (ensure_list(new_states)
                    + [next_input, time + 1, output]), updates

        num_notes = self.predict_seed.shape[0]

        time_outputs_info = (
            [initial_state_with_taps(layer, num_notes)
             for layer in self.time_model.layers]
            + [dict(initial=self.predict_seed, taps=[-1]),
               dict(initial=0, taps=[-1]),
               None])

        time_result, updates = theano.scan(
            fn=step_time,
            outputs_info=time_outputs_info,
            n_steps=self.steps_to_simulate)

        self.predict_thoughts = time_result

        self.predicted_output = time_result[-1]

        self.predict_fun = theano.function(
            inputs=[self.steps_to_simulate, self.conservativity,
                    self.predict_seed],
            outputs=self.predicted_output,
            updates=updates,
            allow_input_downcast=True)

        self.predict_thought_fun = theano.function(
            inputs=[self.steps_to_simulate, self.conservativity,
                    self.predict_seed],
            outputs=ensure_list(self.predict_thoughts),
            updates=updates,
            allow_input_downcast=True)

    def setup_slow_walk(self):
        """
        Build a one-timestep generation function (slow_walk_fun) whose input,
        time counter and time-model hidden states live in shared variables
        that are advanced through the function's updates.
        """
        # Placeholder values; start_slow_walk overwrites them with real data.
        self.walk_input = theano.shared(np.ones((2, 2), dtype='int8'))
        self.walk_time = theano.shared(np.array(0, dtype='int64'))
        self.walk_hiddens = [
            theano.shared(np.ones((2, 2), dtype=theano.config.floatX))
            for layer in self.time_model.layers if has_hidden(layer)
        ]

        # correct for dropout
        if self.dropout > 0:
            masks = [1 - self.dropout for layer in self.time_model.layers]
            masks[0] = None
        else:
            masks = []

        new_states = self.time_model.forward(self.walk_input,
                                             prev_hiddens=self.walk_hiddens,
                                             dropout=masks)

        # Now new_states is a list of matrix [layer](notes, hidden_states)
        # for each layer
        time_final = get_last_layer(new_states)

        start_note_values = theano.tensor.alloc(0, 2)
        note_outputs_info = (
            [initial_state_with_taps(layer)
             for layer in self.pitch_model.layers]
            + [dict(initial=start_note_values, taps=[-1])])

        notes_result, updates = theano.scan(
            fn=self._predict_step_note,
            sequences=[time_final],
            outputs_info=note_outputs_info)

        # Now notes_result is a list of matrix
        # [layer/output](notes, onOrArtic)
        output = get_last_layer(notes_result)

        next_input = OutputFormToInputFormOp()(output, self.walk_time + 1)  # TODO: Fix time
        # next_input = T.cast(T.alloc(0, 3, 4),'int64')

        slow_walk_results = (new_states[:-1] + notes_result[:-1]
                             + [next_input, output])

        updates.update({
            self.walk_time: self.walk_time + 1,
            self.walk_input: next_input
        })

        updates.update({
            hidden: newstate
            for hidden, newstate, layer in zip(self.walk_hiddens, new_states,
                                               self.time_model.layers)
            if has_hidden(layer)
        })

        self.slow_walk_fun = theano.function(
            inputs=[self.conservativity],
            outputs=slow_walk_results,
            updates=updates,
            allow_input_downcast=True)

    def start_slow_walk(self, seed):
        """Reset the slow-walk shared state so generation starts from `seed`."""
        seed = np.array(seed)
        num_notes = seed.shape[0]

        self.walk_time.set_value(0)
        self.walk_input.set_value(seed)
        hidden_layers = (l for l in self.time_model.layers if has_hidden(l))
        for layer, hidden in zip(hidden_layers, self.walk_hiddens):
            # Tile the layer's initial hidden state once per note (row).
            hidden.set_value(
                np.repeat(
                    np.reshape(layer.initial_hidden_state.get_value(),
                               (1, -1)),
                    num_notes, axis=0))
class Model(object):
    """
    Two-stage LSTM model (time stack followed by pitch stack) whose input and
    output widths are read from a data manager.

    Unlike construction alone, the Theano functions are only compiled when
    setup() (or setup_train / setup_generate) is called.
    """

    def __init__(self, data_manager, t_layer_sizes, p_layer_sizes, dropout=0):
        """
        Parameters:
            data_manager: object providing f.feature_count (input width) and
                s.information_count (output width)
            t_layer_sizes: hidden sizes of the time-axis LSTM layers
            p_layer_sizes: hidden sizes of the note-axis LSTM layers
            dropout: dropout probability (0 disables dropout)
        """
        print('{:25}'.format("Initializing Model"), end='', flush=True)
        self.t_layer_sizes = t_layer_sizes
        self.p_layer_sizes = p_layer_sizes
        self.dropout = dropout

        self.data_manager = data_manager
        self.t_input_size = self.data_manager.f.feature_count
        self.output_size = self.data_manager.s.information_count

        self.time_model = StackedCells(self.t_input_size,
                                       celltype=LSTM,
                                       layers=t_layer_sizes)
        self.time_model.layers.append(PassthroughLayer())

        # The pitch model sees the time model's last layer joined with the
        # previous note's output vector.
        p_input_size = t_layer_sizes[-1] + self.output_size
        self.pitch_model = StackedCells(p_input_size,
                                        celltype=LSTM,
                                        layers=p_layer_sizes)
        self.pitch_model.layers.append(
            Layer(p_layer_sizes[-1],
                  self.output_size,
                  activation=T.nnet.sigmoid))

        self.conservativity = T.fscalar()
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # Added inside the log in loss_func so log(0) is never taken.
        self.epsilon = np.spacing(np.float32(1.0))

        print("Done")

    @property
    def params(self):
        """Time-model parameters followed by pitch-model parameters."""
        return self.time_model.params + self.pitch_model.params

    @params.setter
    def params(self, param_list):
        ntimeparams = len(self.time_model.params)
        self.time_model.params = param_list[:ntimeparams]
        self.pitch_model.params = param_list[ntimeparams:]

    @property
    def learned_config(self):
        """[time params, pitch params, initial hidden states of all layers]."""
        return [
            self.time_model.params, self.pitch_model.params,
            [
                l.initial_hidden_state
                for mod in (self.time_model, self.pitch_model)
                for l in mod.layers if has_hidden(l)
            ]
        ]

    @learned_config.setter
    def learned_config(self, learned_list):
        self.time_model.params = learned_list[0]
        self.pitch_model.params = learned_list[1]
        for l, val in zip((l for mod in (self.time_model, self.pitch_model)
                           for l in mod.layers if has_hidden(l)),
                          learned_list[2]):
            l.initial_hidden_state.set_value(val.get_value())

    def setup(self):
        """Compile both the training and the generation functions."""
        self.setup_train()
        self.setup_generate()

    def loss_func(self, y_true, y_predict):
        """
        Negative summed log-likelihood of y_true under y_predict.

        Channel 1 of the log-likelihood is multiplied by channel 0 of y_true,
        so it only contributes where channel 0 is set; all other channels are
        left unmasked.
        """
        active_notes = T.shape_padright(y_true[:, :, :, 0])
        mask = T.concatenate([
            T.ones_like(active_notes), active_notes,
            T.repeat(T.ones_like(active_notes), self.output_size - 2, -1)
        ],
                             axis=-1)
        # 2*P*x - P - x + 1 equals P where x == 1 and (1 - P) where x == 0.
        loglikelihoods = mask * T.log(2 * y_predict * y_true - y_predict -
                                      y_true + 1 + self.epsilon)
        return T.neg(T.sum(loglikelihoods))

    def setup_train(self):
        """Build the training graph and compile update_fun."""
        print('{:25}'.format("Setup Train"), end='', flush=True)
        self.input_mat = T.btensor4()
        self.output_mat = T.btensor4()

        def step_time(in_data, *other):
            other = list(other)
            # With dropout, the trailing non-sequences are the masks.
            split = -len(self.t_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.time_model.forward(in_data,
                                                 prev_hiddens=hiddens,
                                                 dropout=masks)
            return new_states

        def step_note(in_data, *other):
            other = list(other)
            split = -len(self.p_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.pitch_model.forward(in_data,
                                                  prev_hiddens=hiddens,
                                                  dropout=masks)
            return new_states

        def get_dropout(layers, num_time_parallel=1):
            # Returns one mask per layer size, or [] when dropout is off.
            if self.dropout > 0:
                return theano_lstm.MultiDropout([(num_time_parallel, shape)
                                                 for shape in layers],
                                                self.dropout)
            else:
                return []

        # TIME PASS
        # Drop the last timestep of the input: outputs are compared against
        # output_mat[:, 1:] below.
        input_slice = self.input_mat[:, 0:-1]
        n_batch, n_time, n_note, n_ipn = input_slice.shape
        # (time, batch * note, input_per_note)
        time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape(
            (n_time, n_batch * n_note, n_ipn))

        time_masks = get_dropout(self.t_layer_sizes, time_inputs.shape[1])
        time_outputs_info = [
            initial_state_with_taps(layer, time_inputs.shape[1])
            for layer in self.time_model.layers
        ]
        time_result, _ = theano.scan(fn=step_time,
                                     sequences=[time_inputs],
                                     non_sequences=time_masks,
                                     outputs_info=time_outputs_info)
        self.time_thoughts = time_result

        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]
        # Rearranged to (note, batch * time, hidden) for the note-axis scan.
        time_final = get_last_layer(time_result).reshape(
            (n_time, n_batch, n_note, n_hidden)).transpose(
                (2, 1, 0, 3)).reshape((n_note, n_batch * n_time, n_hidden))

        # PITCH PASS
        # The first note has no predecessor, so its "previous choice" is zero.
        start_note_values = T.alloc(np.array(0, dtype=np.int8), 1,
                                    time_final.shape[1], self.output_size)
        correct_choices = self.output_mat[:, 1:, 0:-1, :].transpose(
            (2, 0, 1, 3)).reshape(
                (n_note - 1, n_batch * n_time, self.output_size))
        note_choices_inputs = T.concatenate(
            [start_note_values, correct_choices], axis=0)

        note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)

        note_masks = get_dropout(self.p_layer_sizes, note_inputs.shape[1])
        note_outputs_info = [
            initial_state_with_taps(layer, note_inputs.shape[1])
            for layer in self.pitch_model.layers
        ]
        note_result, _ = theano.scan(fn=step_note,
                                     sequences=[note_inputs],
                                     non_sequences=note_masks,
                                     outputs_info=note_outputs_info)
        self.note_thoughts = note_result

        # Back to (batch, time, note, output_size).
        note_final = get_last_layer(note_result).reshape(
            (n_note, n_batch, n_time, self.output_size)).transpose(1, 2, 0, 3)

        self.cost = self.loss_func(self.output_mat[:, 1:], note_final)

        updates, _, _, _, _ = create_optimization_updates(self.cost,
                                                          self.params,
                                                          method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)
        print("Done")

    def _predict_step_note(self, in_data_from_time, *states):
        """
        One note-axis step during generation: run the pitch model on the
        time-model activations joined with the previous note's choice, then
        sample a (play, articulate) pair for this note.
        """
        # states is [ *hiddens, last_note_choice ]
        hiddens = list(states[:-1])
        in_data_from_prev = states[-1]
        in_data = T.concatenate([in_data_from_time, in_data_from_prev])

        # Deterministic dropout correction: scale by the keep probability.
        if self.dropout > 0:
            masks = [1 - self.dropout for layer in self.pitch_model.layers]
            masks[0] = None
        else:
            masks = []

        new_states = self.pitch_model.forward(in_data,
                                              prev_hiddens=hiddens,
                                              dropout=masks)

        probabilities = get_last_layer(new_states)

        shouldPlay = self.srng.uniform() < (probabilities[0]**
                                            self.conservativity)
        shouldArtic = shouldPlay * (self.srng.uniform() < probabilities[1])

        # NOTE(review): chosen always has 2 elements, while the scan's
        # initial value is allocated with self.output_size elements --
        # presumably output_size is expected to be 2 here; confirm.
        chosen = T.stack(
            [T.cast(shouldPlay, 'int8'),
             T.cast(shouldArtic, 'int8')])

        return ensure_list(new_states) + [chosen]

    def setup_generate(self):
        """Build the generation graph and compile generate_fun."""
        print('{:25}'.format("Setup Generate"), end='', flush=True)
        self.generate_seed_input = T.btensor3()
        self.steps_to_simulate = T.iscalar()

        def step_time_seed(in_data, *hiddens):
            if self.dropout > 0:
                time_masks = [
                    1 - self.dropout for layer in self.time_model.layers
                ]
                time_masks[0] = None
            else:
                time_masks = []
            new_states = self.time_model.forward(in_data,
                                                 prev_hiddens=hiddens,
                                                 dropout=time_masks)
            return new_states

        # All seed timesteps except the last are run through the time model.
        time_inputs = self.generate_seed_input[0:-1]
        n_time, n_note, n_ipn = time_inputs.shape

        time_outputs_info_seed = [
            initial_state_with_taps(layer, n_note)
            for layer in self.time_model.layers
        ]

        # NOTE(review): this seed scan's time_result (and last_layer /
        # n_hidden below) is never used afterwards; the generation scan
        # starts from time_outputs_info_seed, i.e. the initial hidden states.
        # Confirm whether the seeded states were meant to be carried over.
        time_result, _ = theano.scan(fn=step_time_seed,
                                     sequences=[time_inputs],
                                     outputs_info=time_outputs_info_seed)

        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]

        def step_time(*states):
            # states is [ *hiddens, prev_result, time ]
            hiddens = list(states[:-2])
            in_data = states[-2]
            time = states[-1]

            if self.dropout > 0:
                masks = [1 - self.dropout for layer in self.time_model.layers]
                masks[0] = None
            else:
                masks = []

            new_states = self.time_model.forward(in_data,
                                                 prev_hiddens=hiddens,
                                                 dropout=masks)

            time_final = get_last_layer(new_states)

            # Initial "previous choice" for the first note of this timestep.
            start_note_values = theano.tensor.alloc(np.array(0, dtype=np.int8),
                                                    self.output_size)
            note_outputs_info = ([
                initial_state_with_taps(layer)
                for layer in self.pitch_model.layers
            ] + [dict(initial=start_note_values, taps=[-1])])

            notes_result, updates = theano.scan(fn=self._predict_step_note,
                                                sequences=[time_final],
                                                outputs_info=note_outputs_info)
            output = get_last_layer(notes_result)
            # Convert this timestep's output into the next timestep's input.
            next_input = OutputFormToInputFormOp(self.data_manager)(output,
                                                                    time + 1)

            return (ensure_list(new_states) +
                    [next_input, time + 1, output]), updates

        # Generation starts from the last seed timestep, at time index n_time.
        time_outputs_info = (time_outputs_info_seed + [
            dict(initial=self.generate_seed_input[-1], taps=[-1]),
            dict(initial=n_time, taps=[-1]), None
        ])

        time_result, updates = theano.scan(fn=step_time,
                                           outputs_info=time_outputs_info,
                                           n_steps=self.steps_to_simulate)
        self.predicted_output = time_result[-1]

        self.generate_fun = theano.function(inputs=[
            self.steps_to_simulate, self.conservativity,
            self.generate_seed_input
        ],
                                            outputs=self.predicted_output,
                                            updates=updates,
                                            allow_input_downcast=True,
                                            on_unused_input='warn')
        print("Done")
class MusicGenerator(object):
    """Two-stage recurrent model: a "time" network feeding a "note" network.

    The time network is a stack of LSTM cells followed by a ``Router`` layer;
    the note network is a stack of LSTM cells followed by a sigmoid ``Layer``.
    Constructing an instance compiles two Theano functions:

    * ``self.update(input, output)`` -- one optimisation step, returns the loss.
    * ``self.predict(length, initial_note)`` -- samples ``length`` steps.

    NOTE(review): ``input_size`` and ``output_size`` are free names that are
    presumably module-level constants defined elsewhere in this file.
    """

    def __init__(self, time_model_layer_sizes, note_model_layer_sizes):
        """Build both networks and compile the update/predict functions.

        :param time_model_layer_sizes: hidden sizes of the time-network LSTMs.
        :param note_model_layer_sizes: hidden sizes of the note-network LSTMs.
        """
        self.time_model = StackedCells(input_size, celltype=LSTM,
                                       layers=time_model_layer_sizes)
        self.time_model.layers.append(Router())

        # Fixed: the original read the misspelled name `outptu_size` here,
        # which raised NameError; every other use in the class is
        # `output_size`.
        note_model_input_size = time_model_layer_sizes[-1] + output_size
        self.note_model = StackedCells(note_model_input_size, celltype=LSTM,
                                       layers=note_model_layer_sizes)
        self.note_model.layers.append(
            Layer(note_model_layer_sizes[-1], output_size,
                  activation=T.nnet.sigmoid))

        self.time_model_layer_sizes = time_model_layer_sizes
        self.note_model_layer_sizes = note_model_layer_sizes

        self._initialize_update_function()
        self._initialize_predict_function()

    @property
    def params(self):
        """Time-network parameters followed by note-network parameters."""
        return self.time_model.params + self.note_model.params

    @params.setter
    def params(self, params):
        # The flat list is split at the current number of time-model params.
        time_model_size = len(self.time_model.params)
        self.time_model.params = params[:time_model_size]
        self.note_model.params = params[time_model_size:]

    @property
    def configuration(self):
        """Return ``[time params, note params, initial hidden states]``.

        The third element collects ``layer.initial_hidden_state`` for every
        layer (time model first, then note model) that has that attribute.
        """
        initial_hidden_states = []
        for model in (self.time_model, self.note_model):
            for layer in model.layers:
                # Fixed: the original passed the bare, undefined name
                # `initial_state` to hasattr(), raising NameError. We test for
                # exactly the attribute that is read on the next line.
                if hasattr(layer, "initial_hidden_state"):
                    initial_hidden_states.append(layer.initial_hidden_state)
        return [self.time_model.params, self.note_model.params,
                initial_hidden_states]

    @configuration.setter
    def configuration(self, configuration):
        self.time_model.params = configuration[0]
        self.note_model.params = configuration[1]

        hidden_state_layers = []
        for model in (self.time_model, self.note_model):
            for layer in model.layers:
                if hasattr(layer, INITIAL_HIDDEN_STATE_KEY):
                    hidden_state_layers.append(layer)

        initial_hidden_states = configuration[2]
        # enumerate() replaces the Python-2-only xrange(); indexing is kept so
        # a too-short state list still raises IndexError as before.
        for layer_id, layer in enumerate(hidden_state_layers):
            state = initial_hidden_states[layer_id]
            layer.initial_hidden_state.set_value(state.get_value())

    @staticmethod
    def get_time_model_input(adjusted_input):
        """Reshape a 4-D input for the time network.

        Axes 0 and 1 are swapped, then the (old) axis 0 and axis 2 are merged:
        ``(batch, time, notes, attrs) -> (time, batch * notes, attrs)``.
        """
        batch_size, num_timesteps, num_notes, num_attributes = adjusted_input.shape
        tranposed_input = adjusted_input.transpose((1, 0, 2, 3))
        return tranposed_input.reshape(
            (num_timesteps, batch_size * num_notes, num_attributes))

    @staticmethod
    def get_note_model_input(adjusted_input, adjusted_output, time_model_output):
        """Assemble the note-network input.

        The time-network output is rearranged to
        ``(notes, batch * time, hidden)`` and concatenated on the last axis
        with the target choices of the previous note (a zero row is used for
        the first note).
        """
        batch_size, num_timesteps, num_notes, _ = adjusted_input.shape
        num_hidden = time_model_output.shape[2]

        reshaped_time_model_output = time_model_output.reshape(
            (num_timesteps, batch_size, num_notes, num_hidden))
        transposed_time_model_output = reshaped_time_model_output.transpose(
            (2, 1, 0, 3))
        adjusted_time_model_output = transposed_time_model_output.reshape(
            (num_notes, batch_size * num_timesteps, num_hidden))

        # Zero "previous choice" for the first note of every (batch, time) row.
        starting_notes = T.alloc(0, 1, adjusted_time_model_output.shape[1],
                                 output_size)
        # Drop the last note: each note is conditioned on the one before it.
        correct_choices = adjusted_output[:, :, :-1, :].transpose((2, 0, 1, 3))
        reshaped_correct_choices = correct_choices.reshape(
            (num_notes - 1, batch_size * num_timesteps, output_size))
        adjusted_correct_choices = T.concatenate(
            [starting_notes, reshaped_correct_choices], axis=0)

        return T.concatenate(
            [adjusted_time_model_output, adjusted_correct_choices], axis=2)

    @staticmethod
    def get_initial_state(layer, dimensions=None):
        """Return a scan ``outputs_info`` entry for ``layer``.

        Returns ``None`` when the layer lacks the attribute named by
        ``INITIAL_HIDDEN_STATE_KEY``. Otherwise returns a dict whose
        ``'initial'`` value is the layer's initial hidden state, repeated
        ``dimensions`` times along a new leading axis when ``dimensions`` is
        given.
        """
        if not hasattr(layer, INITIAL_HIDDEN_STATE_KEY):
            return None

        if dimensions is None:
            initial = layer.initial_hidden_state
        else:
            initial = T.repeat(T.shape_padleft(layer.initial_hidden_state),
                               dimensions, axis=0)
        return {'initial': initial, 'taps': [-1]}

    @staticmethod
    def get_output(step, input, outputs_info):
        """Scan ``step`` over ``input`` and return the last scan output."""
        result, _ = theano.scan(fn=step, sequences=[input],
                                outputs_info=outputs_info)
        return result[-1]

    @staticmethod
    def get_prediction(adjusted_input, note_model_output):
        """Reshape note-network output to ``(batch, time, notes, output_size)``."""
        batch_size, num_timesteps, num_notes, _ = adjusted_input.shape
        reshaped_note_model_output = note_model_output.reshape(
            (num_notes, batch_size, num_timesteps, output_size))
        return reshaped_note_model_output.transpose(1, 2, 0, 3)

    @staticmethod
    def get_loss(adjusted_output, prediction):
        """Masked negative log-likelihood of the targets under ``prediction``.

        ``2*p*y - p - y + 1`` equals ``p`` where ``y == 1`` and ``1 - p`` where
        ``y == 0``. The mask is all-ones for channel 0 and equal to channel 0
        of the target for channel 1, so channel 1 only contributes where
        channel 0 of the target is set.
        """
        epsilon = 1e-7  # keeps log() finite when the likelihood is 0
        active_notes = T.shape_padright(adjusted_output[:, :, :, 0])
        masks = T.concatenate([T.ones_like(active_notes), active_notes], axis=3)
        log_likelihoods = T.log(
            2 * prediction * adjusted_output - prediction - adjusted_output
            + 1 + epsilon)
        masked_log_likelihoods = masks * log_likelihoods
        return T.neg(T.sum(masked_log_likelihoods))

    def get_outputs_info(self, adjusted_input, layers):
        """Initial states for ``layers``, repeated ``adjusted_input.shape[1]`` times."""
        batch_size = adjusted_input.shape[1]
        return [self.get_initial_state(layer, batch_size) for layer in layers]

    def get_time_prediction_outputs_info(self, initial_note):
        """``outputs_info`` for the inner (per-note) prediction scan."""
        initial_states = [self.get_initial_state(layer)
                          for layer in self.note_model.layers]
        first_note = {'initial': initial_note, 'taps': [-1]}
        return initial_states + [first_note]

    def get_prediction_outputs_info(self, num_notes, initial_note):
        """``outputs_info`` for the outer (per-timestep) prediction scan.

        After the layer states come: the next input, a counter starting at 0,
        and ``None`` for the sampled output (which is not fed back).
        """
        initial_states = [self.get_initial_state(layer, num_notes)
                          for layer in self.time_model.layers]
        first_note = {'initial': initial_note, 'taps': [-1]}
        padder = {'initial': 0, 'taps': [-1]}
        return initial_states + [first_note, padder, None]

    def _initialize_update_function(self):
        """Compile ``self.update``: one optimisation step returning the loss."""

        def time_step(step_input, *previous_hidden_state):
            return self.time_model.forward(step_input,
                                           prev_hiddens=previous_hidden_state)

        def note_step(step_input, *previous_hidden_state):
            return self.note_model.forward(step_input,
                                           prev_hiddens=previous_hidden_state)

        input_tensor = T.btensor4()
        adjusted_input = input_tensor[:, :-1]   # all but the last timestep

        output_tensor = T.btensor4()
        adjusted_output = output_tensor[:, 1:]  # targets shifted by one step

        time_model_input = self.get_time_model_input(adjusted_input)
        time_model_outputs_info = self.get_outputs_info(
            time_model_input, self.time_model.layers)
        time_model_output = self.get_output(
            time_step, time_model_input, time_model_outputs_info)

        note_model_input = self.get_note_model_input(
            adjusted_input, adjusted_output, time_model_output)
        note_outputs_info = self.get_outputs_info(
            note_model_input, self.note_model.layers)
        note_model_output = self.get_output(
            note_step, note_model_input, note_outputs_info)

        prediction = self.get_prediction(adjusted_input, note_model_output)
        loss = self.get_loss(adjusted_output, prediction)

        updates, _, _, _, _ = create_optimization_updates(loss, self.params)
        self.update = theano.function(inputs=[input_tensor, output_tensor],
                                      outputs=loss, updates=updates,
                                      allow_input_downcast=True)

    def _initialize_predict_function(self):
        """Compile ``self.predict(length, initial_note)``."""

        def predicted_note_step(time_model_output, *states):
            # The last recurrent value is the previous note's sampled choice.
            previous_note_model_input = states[-1]
            note_model_input = T.concatenate(
                [time_model_output, previous_note_model_input])
            previous_hidden_state = list(states[:-1])
            note_model_output = self.note_model.forward(
                note_model_input, prev_hiddens=previous_hidden_state)
            probabilities = note_model_output[-1]

            generator = T.shared_randomstreams.RandomStreams(
                np.random.randint(0, 1024))
            is_note_played = probabilities[0] > generator.uniform()
            # Articulation is only possible when the note is played.
            is_note_articulated = (
                (probabilities[1] > generator.uniform()) * is_note_played)
            prediction = T.cast(
                T.stack(is_note_played, is_note_articulated), 'int8')

            return note_model_output + [prediction]

        def predicted_time_step(*states):
            # states = layer states..., next input, time counter
            time_model_input = states[-2]
            previous_hidden_state = list(states[:-2])
            time_model_output = self.time_model.forward(
                time_model_input, prev_hiddens=previous_hidden_state)
            time_model_output_last_layer = time_model_output[-1]

            initial_note = T.alloc(0, output_size)
            note_outputs_info = self.get_time_prediction_outputs_info(
                initial_note)
            notes_model_output, updates = theano.scan(
                fn=predicted_note_step,
                sequences=[time_model_output_last_layer],
                outputs_info=note_outputs_info)
            output = notes_model_output[-1]

            time = states[-1]
            next_input = OutputTransformer()(output, time + 1)

            return (time_model_output + [next_input, time + 1, output]), updates

        length = T.iscalar()
        initial_note = T.bmatrix()
        num_notes = initial_note.shape[0]

        time_outputs_info = self.get_prediction_outputs_info(
            num_notes, initial_note)
        time_model_output, updates = theano.scan(
            fn=predicted_time_step, outputs_info=time_outputs_info,
            n_steps=length)
        prediction = time_model_output[-1]

        self.predict = theano.function([length, initial_note],
                                       outputs=prediction, updates=updates,
                                       allow_input_downcast=True)
class Model:
    """
    Stacked-cell network that maps an input vector to a same-sized output.

    ``stack_size`` recurrent cells of width ``hidden_size`` are followed by a
    softmax ``Layer`` projecting back to ``input_size``. The training cost is
    the summed squared difference between that output and the input.
    """

    def __init__(self, hidden_size, input_size, stack_size=2, celltype=LSTM):
        self.input_size = input_size

        # Recurrent core: `stack_size` identical cells with tanh activation.
        layer_sizes = [hidden_size] * stack_size
        self.model = StackedCells(input_size,
                                  celltype=celltype,
                                  activation=T.tanh,
                                  layers=layer_sizes)

        # Make the first cell's input modulation the identity function.
        first_cell = self.model.layers[0]
        first_cell.in_gate2.activation = lambda x: x

        # Output projection back to the input dimensionality.
        output_layer = Layer(hidden_size, input_size, activation=softmax)
        self.model.layers.append(output_layer)

        # Symbolic float vectors used as graph inputs.
        self.input = T.fvector("input")
        self.prev_input = T.fvector("prev_input")

        # Build the symbolic prediction, then the cost and compiled functions.
        self.prediction = self.create_prediction()
        self.create_cost_fun()
        self.create_training_function()
        self.create_predict_function()

    @property
    def params(self):
        """Trainable parameters of the underlying stacked model."""
        return self.model.params

    def create_prediction(self):
        """Return the symbolic output of the last (softmax) layer."""
        layer_outputs = self.model.forward(self.input)
        # forward() yields one entry per layer; the appended output layer
        # is the final entry.
        return layer_outputs[-1]

    def create_cost_fun(self):
        """Define ``self.cost`` as the summed squared prediction error."""
        self.cost = ((self.prediction - self.input) ** 2).sum()

    def create_predict_function(self):
        """Compile ``self.pred_fun``: input vector -> prediction."""
        self.pred_fun = theano.function(
            inputs=[self.input],
            outputs=self.prediction,
            allow_input_downcast=True,
        )

    def create_training_function(self):
        """Compile ``self.update_fun``: one adadelta step, returns the cost."""
        optimizer_outputs = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        updates, _, _, _, _ = optimizer_outputs
        self.update_fun = theano.function(
            inputs=[self.input],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True,
        )

    def __call__(self, x):
        """Run the compiled prediction function on ``x``."""
        return self.pred_fun(x)
class Model:
    """
    Word-level language model: embedding -> stacked LSTMs -> softmax, plus a
    second "turing" prediction head built by ``turing_model.build``.

    Two prediction paths are compiled:

    * "lstm"  -- the softmax output of the stacked model;
    * "final" -- ``turing_predict`` applied to the second half of the last
      recurrent layer's state.

    Each path has its own cost, training function, prediction function and
    perplexity function.
    """

    def __init__(self, hidden_size, input_size, vocab_size, stack_size=1,
                 celltype=LSTM):
        """
        :param hidden_size: width of every recurrent cell.
        :param input_size: embedding dimensionality.
        :param vocab_size: number of words (embedding rows / softmax outputs).
        :param stack_size: number of stacked recurrent cells.
        :param celltype: recurrent cell class.
        """
        # Fixed: load() reads self.hidden_size, which was never assigned and
        # therefore raised AttributeError.
        self.hidden_size = hidden_size

        # core layers in RNN/LSTM
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add an embedding
        self.model.layers.insert(0, Embedding(vocab_size, input_size))
        # add a classifier
        self.model.layers.append(
            Layer(hidden_size, vocab_size, activation=softmax))

        # init turing machine model
        self.turing_params = Parameters()
        self.turing_updates, self.turing_predict = turing_model.build(
            self.turing_params, hidden_size, vocab_size)

        # inputs are matrices of indices,
        # each row is a sentence, each column a timestep
        self._stop_word = theano.shared(np.int32(999999999), name="stop word")
        self.for_how_long = T.ivector()
        self.input_mat = T.imatrix()
        self.priming_word = T.iscalar()

        self._compile(lstm_lr=0.01, turing_lr=0.01, all_lr=0.01)

    def _compile(self, lstm_lr, turing_lr, all_lr):
        """(Re)build every symbolic graph and compiled function."""
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        self.lstm_predictions = self.create_lstm_prediction()
        self.final_predictions = self.create_final_prediction()
        # greedy search uses the LSTM head
        self.greedy_predictions = self.create_lstm_prediction(greedy=True)

        self.create_cost_fun()          # lstm + final costs
        self.lstm_lr = lstm_lr
        self.turing_lr = turing_lr
        self.all_lr = all_lr
        self.create_training_function()  # lstm / turing / all
        self.create_predict_function()   # lstm / final / greedy

        self.lstm_ppl = self.create_lstm_ppl()
        self.final_ppl = self.create_final_ppl()
        self.create_ppl_function()

    def save(self, save_file, vocab):
        """Pickle the stacked model to ``save_file`` and ``vocab`` to
        ``save_file + '.vocab'``."""
        # `with` guarantees the handles are closed (the original leaked them).
        with open(save_file, "wb") as model_file:
            pickle.dump(self.model, model_file)
        with open(save_file + '.vocab', "wb") as vocab_file:
            pickle.dump(vocab, vocab_file)

    def save_turing(self, save_file):
        """Save the turing-head parameters to ``save_file + '.turing'``."""
        self.turing_params.save(save_file + '.turing')

    def load(self, load_file, lr):
        """Load a pickled model, restore or initialise the turing parameters,
        and recompile everything with learning rate ``lr``.

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code; only load
        files from a trusted source.
        """
        with open(load_file, "rb") as model_file:
            self.model = pickle.load(model_file)

        if os.path.isfile(load_file + '.turing'):
            self.turing_params.load(load_file + '.turing')
        else:
            # print(...) with one argument behaves identically on Python 2
            # and 3.
            print("no turing model!!!! pretrain with lstm param")
            classifier = self.model.layers[-1]
            # Seed the turing head from the classifier weights (the original
            # author marked this mapping as "not sure").
            self.turing_params['W_input_hidden'] = classifier.params[0].get_value().T
            self.turing_params['W_read_hidden'] = classifier.params[0].get_value().T
            self.turing_params['b_hidden_0'] = classifier.params[1].get_value()
            temp = self.model.layers[1].initial_hidden_state.get_value()[self.hidden_size:]
            self.turing_params['memory_init'] = temp.reshape((1,) + temp.shape)

        # the graphs reference the old model object, so compile again
        self._compile(lstm_lr=lr, turing_lr=lr, all_lr=lr)
        print("done loading model")

    def stop_on(self, idx):
        """Set the word index at which greedy search stops."""
        self._stop_word.set_value(idx)

    @property
    def params(self):
        """Trainable parameters of the stacked model (turing head excluded)."""
        return self.model.params

    def _scan_model(self, greedy):
        """Run the stacked model through ``theano.scan``.

        Non-greedy: scans over the columns of ``input_mat[:, :-1]`` and
        returns the per-layer outputs for layers 1..end.
        Greedy: starts from ``priming_word``, feeds each argmax back in for at
        most 200 steps, stopping when the stop word is produced; element 0 of
        the result is the generated index sequence.
        """
        def step(idx, *states):
            # The embedding (layer 0) is stateless, hence the leading None.
            new_hiddens = [None] + list(states)
            new_states = self.model.forward(idx, prev_hiddens=new_hiddens)
            if greedy:
                new_idx = new_states[-1].argmax()
                # stopping condition for greedy search
                return (([new_idx.astype(self.priming_word.dtype)]
                         + new_states[1:-1]),
                        theano.scan_module.until(
                            T.eq(new_idx, self._stop_word)))
            return new_states[1:]

        if greedy:
            outputs_info = (
                [dict(initial=self.priming_word, taps=[-1])]
                + [initial_state_with_taps(layer)
                   for layer in self.model.layers[1:-1]])
            result, _ = theano.scan(fn=step, n_steps=200,
                                    outputs_info=outputs_info)
        else:
            # predict steps 1..n from steps 0..n-1
            inputs = self.input_mat[:, 0:-1]
            num_examples = inputs.shape[0]
            outputs_info = [initial_state_with_taps(layer, num_examples)
                            for layer in self.model.layers[1:]]
            result, _ = theano.scan(fn=step, sequences=[inputs.T],
                                    outputs_info=outputs_info)
        return result

    def create_lstm_prediction(self, greedy=False):
        """Symbolic softmax predictions (or the greedy index sequence)."""
        result = self._scan_model(greedy)
        if greedy:
            return result[0]
        # the softmax layer is last in the results list
        return result[-1].transpose((2, 0, 1))

    def create_final_prediction(self, greedy=False):
        """Symbolic predictions of the turing head (or greedy index sequence)."""
        result = self._scan_model(greedy)
        if greedy:
            return result[0]
        # result[-2] is the last recurrent layer; its second half is passed to
        # the turing head. `//` makes the integer division explicit instead of
        # depending on Python-2 / Theano int-division settings.
        hidden_size = result[-2].shape[2] // 2
        turing_result = self.turing_predict(result[-2][:, :, hidden_size:])
        return turing_result.transpose((1, 0, 2))

    def create_cost_fun(self):
        """Define ``lstm_cost`` and ``final_cost`` as summed masked losses."""
        # each timestep's prediction is scored against the next token
        what_to_predict = self.input_mat[:, 1:]
        # lengths shrink by one because the targets are shifted by one
        for_how_long = self.for_how_long - 1
        # all sentences start at T=0
        starting_when = T.zeros_like(self.for_how_long)

        self.lstm_cost = masked_loss(self.lstm_predictions, what_to_predict,
                                     for_how_long, starting_when).sum()
        self.final_cost = masked_loss(self.final_predictions, what_to_predict,
                                      for_how_long, starting_when).sum()

    def create_predict_function(self):
        """Compile ``lstm_pred_fun``, ``final_pred_fun`` and ``greedy_fun``."""
        self.lstm_pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.lstm_predictions,
            allow_input_downcast=True)
        self.final_pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.final_predictions,
            allow_input_downcast=True)
        self.greedy_fun = theano.function(
            inputs=[self.priming_word],
            outputs=T.concatenate([T.shape_padleft(self.priming_word),
                                   self.greedy_predictions]),
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile the three training functions.

        * ``lstm_update_fun``   -- SGD on ``lstm_cost`` over ``self.params``.
        * ``turing_update_fun`` -- turing updates on ``final_cost`` (run under
          ``NanGuardMode``).
        * ``all_update_fun``    -- both update sets applied to ``final_cost``.
        """
        updates, _, _, _, _ = create_optimization_updates(
            self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
        self.lstm_update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.lstm_cost,
            updates=updates,
            allow_input_downcast=True)

        updates_turing = self.turing_updates(self.final_cost,
                                             lr=self.turing_lr)
        self.turing_update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_cost,
            updates=updates_turing,
            mode=NanGuardMode(nan_is_error=True, inf_is_error=True,
                              big_is_error=True),
            allow_input_downcast=True)

        all_updates_lstm, _, _, _, _ = create_optimization_updates(
            self.final_cost, self.params, method="SGD", lr=self.all_lr,
            part=True)
        all_updates_turing_temp = self.turing_updates(self.final_cost,
                                                      lr=self.all_lr)
        # merge the turing (variable, new_value) pairs into the LSTM updates
        updates_all = all_updates_lstm
        for pair in all_updates_turing_temp:
            updates_all[pair[0]] = pair[1]

        self.all_update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_cost,
            updates=updates_all,
            allow_input_downcast=True)

    def _create_ppl(self, predictions):
        """Symbolic perplexity of ``predictions`` against the shifted input.

        Labels equal to 0 are masked out of the log-probability sum and
        subtracted from the normalising token count.
        """
        def timestep(example_predictions, label, len_example,
                     total_len_example):
            label_binary = T.gt(label[0:len_example - 1], 0)
            oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
            picked = example_predictions[T.arange(len_example - 1),
                                         label[0:len_example - 1]]
            return T.sum(T.log(1. / picked) * label_binary), oov_count

        result, _ = theano.scan(
            fn=timestep,
            sequences=[predictions, self.input_mat[:, 1:], self.for_how_long],
            non_sequences=T.sum(self.for_how_long))

        oov_count_total = T.sum(result[1])
        float_x = theano.config.floatX
        log_sum = T.sum(result[0]).astype(float_x)
        token_count = (T.sum(self.for_how_long)
                       - oov_count_total).astype(float_x)
        return T.exp(log_sum / token_count).astype(float_x)

    def create_lstm_ppl(self):
        """Symbolic perplexity of the LSTM head."""
        return self._create_ppl(self.lstm_predictions)

    def create_final_ppl(self):
        """Symbolic perplexity of the turing head."""
        return self._create_ppl(self.final_predictions)

    def create_ppl_function(self):
        """Compile ``lstm_ppl_fun`` and ``final_ppl_fun``."""
        self.lstm_ppl_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.lstm_ppl,
            allow_input_downcast=True)
        self.final_ppl_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_ppl,
            allow_input_downcast=True)

    def __call__(self, x):
        """Return the final (turing-head) predictions for ``x``.

        Fixed: the original called ``self.pred_fun``, which this class never
        defines. Use ``lstm_pred_fun`` directly for the LSTM-only output.
        """
        return self.final_pred_fun(x)
class Model:
    """
    Stacked-layer network with two output heads applied at every timestep:

    * ``regression``  -- tanh ``Layer`` of width ``output_size[0]``;
    * ``classifier``  -- softmax ``Layer`` of width ``output_size[1]``.

    ``hidden_size`` is a sequence of layer widths; both heads read the last
    one. Only the prediction function is compiled by ``__init__``; the cost,
    validation and training builders exist but are not invoked there.
    """

    def __init__(self, hidden_size, input_size, output_size, celltype=Layer):
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=hidden_size)
        # output heads
        self.regression = Layer(hidden_size[-1], output_size[0],
                                activation=T.tanh)
        self.classifier = Layer(hidden_size[-1], output_size[1],
                                activation=softmax)

        self.steps = T.iscalar('steps')
        self.x = T.tensor3('x')               # input GFS data
        # training targets (target dimensionality was changed in this version)
        self.target0 = T.tensor3('target0')
        self.target1 = T.itensor3('target1')
        self.layerstatus = None
        self.results = None

        # symbolic predictions: one full pass over the whole sequence
        self.predictions0, self.predictions1 = self.create_prediction()

        # Only prediction is compiled here; call create_cost_fun(),
        # create_valid_error(), create_training_function() and
        # create_validate_function() explicitly when training is needed.
        self.create_predict_function()

    @property
    def params(self):
        """Parameters of the stacked model and of both output heads."""
        return (self.model.params + self.regression.params
                + self.classifier.params)

    def create_prediction(self):
        """Build the symbolic outputs of both heads for ``self.steps`` steps.

        The scan iterates over axis 1 of ``self.x`` (moved to the front with
        ``dimshuffle((1, 0, 2))``). Returns the regression output with the
        first two axes swapped back, and the classifier output reordered with
        ``dimshuffle((2, 0, 1))``.
        """
        def step(idx):
            new_states = self.model.forward(idx)
            output0 = self.regression.activate(new_states[-1])
            output1 = self.classifier.activate(new_states[-1])
            # both outputs are collected for every step
            return [output0, output1]

        x = self.x
        [result0, result1], _ = theano.scan(
            fn=step,
            n_steps=self.steps,
            sequences=dict(input=x.dimshuffle((1, 0, 2)), taps=[-0]),
        )
        return result0.dimshuffle((1, 0, 2)), result1.dimshuffle((2, 0, 1))

    def create_cost_fun(self):
        """Define ``self.cost``: L2 regression error plus 100x the mean
        negative log-probability selected by ``target1[:, 0, 0]``."""
        y = self.target1[:, 0, 0]
        self.cost = (
            (self.predictions0 - self.target0[:, :, 0:1]).norm(L=2)
            + 100 * (-T.mean(
                T.log(self.predictions1)[T.arange(y.shape[0]), :, y])))

    def create_valid_error(self):
        """Define the validation errors: mean absolute regression error
        (``valid_error0``) and classification accuracy (``valid_error1``)."""
        self.valid_error0 = T.mean(
            T.abs_(self.predictions0 - self.target0[:, :, 0:1]), axis=0)
        self.valid_error1 = T.mean(
            T.eq(T.argmax(self.predictions1, axis=2).dimshuffle(1, 0),
                 self.target1[:, 0, 0]))

    def create_predict_function(self):
        """Compile ``self.pred_fun(x, steps)`` returning both predictions."""
        self.pred_fun = theano.function(
            inputs=[self.x, self.steps],
            outputs=[self.predictions0, self.predictions1],
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``self.update_fun``: one adagrad step (lr=0.01).

        Requires ``create_cost_fun()`` to have been called first.
        """
        # this is the gradient-descent step
        updates, gsums, xsums, lr, max_norm = create_optimization_updates(
            self.cost, self.params, lr=0.01, method="adagrad")
        self.update_fun = theano.function(
            inputs=[self.x, self.target0, self.target1, self.steps],
            outputs=self.cost,
            updates=updates,
            name='update_fun',
            profile=False,
            allow_input_downcast=True)

    def create_validate_function(self):
        """Compile ``self.valid_fun``.

        Requires ``create_valid_error()`` to have been called first.
        """
        self.valid_fun = theano.function(
            inputs=[self.x, self.target0, self.target1, self.steps],
            outputs=[self.valid_error0, self.valid_error1],
            allow_input_downcast=True)

    def __call__(self, x, steps=None):
        """Predict for ``x``.

        Fixed: ``pred_fun`` is compiled with inputs ``[x, steps]`` but the
        original passed only ``x``, which always failed. ``steps`` defaults to
        the length of axis 1 of ``x``, i.e. the full scanned sequence.

        :param x: 3-D input indexable as ``x[0]``.
        :param steps: number of scan steps; ``None`` means "all of them".
        :returns: ``[predictions0, predictions1]``.
        """
        if steps is None:
            steps = len(x[0])
        return self.pred_fun(x, steps)
class Model:
    """
    Simple predictive model for forecasting words from a sequence using
    LSTMs: an embedding, ``stack_size`` recurrent cells of width
    ``hidden_size``, and a softmax classifier over ``vocab_size`` words.
    """

    def __init__(self, hidden_size, input_size, vocab_size, stack_size=1,
                 celltype=LSTM):
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add an embedding
        self.model.layers.insert(0, Embedding(vocab_size, input_size))
        # add a classifier
        self.model.layers.append(
            Layer(hidden_size, vocab_size, activation=softmax))

        # inputs are matrices of indices,
        # each row is a sentence, each column a timestep
        self._stop_word = theano.shared(np.int32(999999999), name="stop word")
        self.for_how_long = T.ivector()
        self.input_mat = T.imatrix()
        self.priming_word = T.iscalar()
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # symbolic predictions: one complete pass over the whole sequence
        self.predictions = self.create_prediction()
        # symbolic variable for greedy search
        self.greedy_predictions = self.create_prediction(greedy=True)

        # The calls below only set up symbolic expressions and compile them.
        self.create_cost_fun()
        self.create_training_function()
        self.create_predict_function()

    def stop_on(self, idx):
        """Set the word index at which greedy search stops."""
        self._stop_word.set_value(idx)

    @property
    def params(self):
        """Trainable parameters of the stacked model."""
        return self.model.params

    # NOTE: the original class defined create_prediction twice; the first
    # definition was an unfinished stub that the second one silently
    # replaced at class-creation time. Only the effective one is kept.
    def create_prediction(self, greedy=False):
        """Build the symbolic predictions.

        Non-greedy: scans over the columns of ``input_mat[:, :-1]`` and
        returns the softmax output transposed with ``(2, 0, 1)``.
        Greedy: starts from ``priming_word``, feeds each argmax back in for at
        most 200 steps (stopping at the stop word) and returns the generated
        index sequence.
        """
        def step(idx, *states):
            # new hiddens are the states we need to pass to LSTMs from the
            # past. Because the StackedCells also include the embeddings,
            # and those have no state, we pass a "None" instead:
            new_hiddens = [None] + list(states)
            # This is the update step: idx is the layer input and new_states
            # is a list holding the newest output of every stacked layer.
            new_states = self.model.forward(idx, prev_hiddens=new_hiddens)
            if greedy:
                # output of the final softmax layer
                new_idxes = new_states[-1]
                new_idx = new_idxes.argmax()
                # provide a stopping condition for greedy search:
                return (([new_idx.astype(self.priming_word.dtype)]
                         + new_states[1:-1]),
                        theano.scan_module.until(
                            T.eq(new_idx, self._stop_word)))
            # outputs of every layer except layer 0
            return new_states[1:]

        # in the sequence forecasting scenario we take everything up to the
        # before-last step and predict the subsequent steps, hence:
        inputs = self.input_mat[:, 0:-1]
        num_examples = inputs.shape[0]

        # outputs_info tells scan which outputs are fed back into `step`, and
        # supplies their values for the first iteration. The argument order
        # scan uses for `step` is: sequences, prior results, non-sequences.
        if greedy:
            outputs_info = (
                [dict(initial=self.priming_word, taps=[-1])]
                + [initial_state_with_taps(layer)
                   for layer in self.model.layers[1:-1]])
            result, _ = theano.scan(fn=step, n_steps=200,
                                    outputs_info=outputs_info)
        else:
            outputs_info = [initial_state_with_taps(layer, num_examples)
                            for layer in self.model.layers[1:]]
            # `sequences` hands one slice of inputs.T to `idx` per step
            result, _ = theano.scan(fn=step, sequences=[inputs.T],
                                    outputs_info=outputs_info)

        if greedy:
            return result[0]
        # the softmax layer is last in the results list (result[-1])
        return result[-1].transpose((2, 0, 1))

    def create_cost_fun(self):
        """Define ``self.cost`` as the summed masked loss."""
        # Targets are every token after the first: given the first token,
        # the rest of the sentence is predicted.
        what_to_predict = self.input_mat[:, 1:]
        # because some sentences are shorter, we place masks where the
        # sentences end (lengths shrink by one with the shifted targets):
        for_how_long = self.for_how_long - 1
        # all sentences start at T=0:
        starting_when = T.zeros_like(self.for_how_long)
        # self.predictions is evaluated once per call, so one evaluation
        # corresponds to one mini-batch.
        self.cost = masked_loss(self.predictions, what_to_predict,
                                for_how_long, starting_when).sum()

    def create_predict_function(self):
        """Compile ``self.pred_fun`` and ``self.greedy_fun``."""
        self.pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.predictions,
            allow_input_downcast=True)
        self.greedy_fun = theano.function(
            inputs=[self.priming_word],
            outputs=T.concatenate([T.shape_padleft(self.priming_word),
                                   self.greedy_predictions]),
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``self.update_fun``: one adadelta step, returns the cost."""
        # this is the gradient-descent step
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

    def __call__(self, x):
        """Return the softmax predictions for the index matrix ``x``."""
        return self.pred_fun(x)
class Model(object):
    """
    Unrolled recurrent forecaster driven by three inputs: ``gfs``,
    ``pm25in`` and ``cnt``.

    The network is ``stack_size`` cells of width ``hidden_size`` followed by
    a linear ``Layer`` of width ``output_size``. The recurrence is unrolled in
    Python (no ``theano.scan``): 7 passes consume observed ``pm25in`` values,
    after which each pass feeds the previous output back in as input.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=RNN, steps=40):
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add a linear output layer
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=lambda x: x))

        self.steps = steps
        self.gfs = T.tensor3('gfs')        # input GFS data
        self.pm25in = T.tensor3('pm25in')  # initial pm25 observations
        self.layerstatus = None
        self.results = None
        self.cnt = T.tensor3('cnt')

        # symbolic predictions: one complete pass over the whole sequence
        self.predictions = self.create_prediction()
        self.create_predict_function()

        # target output (target dimensionality was changed in this version)
        self.pm25target = T.matrix('pm25target')
        self.create_valid_error()
        self.create_validate_function()
        # Everything above only sets up and compiles symbolic expressions.

    @property
    def params(self):
        """Trainable parameters of the stacked model."""
        return self.model.params

    def create_prediction(self):
        """Unroll the network and return all outputs concatenated on axis 1.

        ``self.layerstatus`` and ``self.results`` are updated as a side effect.
        NOTE(review): the ``9:`` slice presumably means 9 GFS features per
        frame with a 3-frame window -- confirm against the data layout.
        """
        gfs = self.gfs
        pm25in = self.pm25in

        # initial forward pass: first 3 gfs frames and first 2 pm25 frames
        gfs_x = T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2]], axis=1)
        pm25in_x = T.concatenate([pm25in[:, 0], pm25in[:, 1]], axis=1)
        self.layerstatus = self.model.forward(
            T.concatenate([gfs_x, pm25in_x, self.cnt[:, :, 0]], axis=1))
        self.results = self.layerstatus[-1]

        # Six more warm-up passes on observed data; the 7th pass overall
        # yields the first real output. range() replaces the Python-2-only
        # xrange().
        for i in range(1, 7):
            # slide the windows: drop the oldest frame, append the next one
            gfs_x = T.concatenate([gfs_x[:, 9:], gfs[:, i + 2]], axis=1)
            pm25in_x = T.concatenate([pm25in_x[:, 1:], pm25in[:, i + 1]],
                                     axis=1)
            self.layerstatus = self.model.forward(
                T.concatenate([gfs_x, pm25in_x, self.cnt[:, :, i]], axis=1),
                self.layerstatus)
            self.results = T.concatenate(
                [self.results, self.layerstatus[-1]], axis=1)

        if self.steps > 1:
            # first pass that feeds the model's own last output back in
            gfs_x = T.concatenate([gfs_x[:, 9:], gfs[:, 9]], axis=1)
            pm25in_x = T.concatenate(
                [pm25in_x[:, 1:], T.shape_padright(self.results[:, -1])],
                axis=1)
            self.layerstatus = self.model.forward(
                T.concatenate([gfs_x, pm25in_x, self.cnt[:, :, 7]], axis=1),
                self.layerstatus)
            self.results = T.concatenate(
                [self.results, self.layerstatus[-1]], axis=1)

            # remaining steps - 2 feedback passes
            for i in range(2, self.steps):
                gfs_x = T.concatenate([gfs_x[:, 9:], gfs[:, i + 8]], axis=1)
                pm25in_x = T.concatenate(
                    [pm25in_x[:, 1:], T.shape_padright(self.results[:, -1])],
                    axis=1)
                self.layerstatus = self.model.forward(
                    T.concatenate([gfs_x, pm25in_x, self.cnt[:, :, i + 6]],
                                  axis=1),
                    self.layerstatus)
                self.results = T.concatenate(
                    [self.results, self.layerstatus[-1]], axis=1)

        return self.results

    def create_predict_function(self):
        """Compile ``self.pred_fun(gfs, pm25in, cnt)``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.cnt],
            outputs=self.predictions,
            allow_input_downcast=True)

    def create_valid_error(self):
        """Define ``self.valid_error``: mean absolute error per column over
        columns 6..45.

        NOTE(review): the fixed ``6:46`` window matches the default
        ``steps=40``; confirm the intended window for other step counts.
        """
        self.valid_error = T.mean(
            T.abs_(self.predictions[:, 6:46] - self.pm25target[:, 6:46]),
            axis=0)

    def create_validate_function(self):
        """Compile ``self.valid_fun(gfs, pm25in, pm25target, cnt)``."""
        self.valid_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target, self.cnt],
            outputs=self.valid_error,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in, cnt=None):
        """Predict from ``gfs``, ``pm25in`` and ``cnt``.

        Fixed: ``pred_fun`` is compiled with three inputs, but the original
        ``__call__`` forwarded only two, so every call failed. ``cnt`` is added
        as a trailing parameter so existing call sites still parse.

        :raises TypeError: if ``cnt`` is not supplied.
        """
        if cnt is None:
            raise TypeError(
                "Model.__call__ requires `cnt`: the compiled prediction "
                "function takes (gfs, pm25in, cnt)")
        return self.pred_fun(gfs, pm25in, cnt)
class Model:
    """
    Sequence model over integer indices built from stacked recurrent cells.

    Layer layout: an ``Embedding`` (``vocab_size`` -> ``input_size``),
    ``stack_size`` cells of ``hidden_size`` units, and a softmax ``Layer``
    back onto ``vocab_size``.  Compiles a batch prediction function, a greedy
    generation function and an adadelta training function.
    """

    def __init__(self, hidden_size, input_size, vocab_size, stack_size=1,
                 celltype=LSTM):
        stack = StackedCells(input_size, celltype=celltype,
                             layers=[hidden_size] * stack_size)
        # Embedding goes in front of the cells, the classifier behind them.
        stack.layers.insert(0, Embedding(vocab_size, input_size))
        stack.layers.append(Layer(hidden_size, vocab_size, activation=softmax))
        self.model = stack

        # Index that terminates greedy generation; change it via stop_on().
        self._stop_word = theano.shared(np.int32(999999999), name="stop word")
        self.for_how_long = T.ivector()
        # Integer matrix; columns are consumed as timesteps by the scan.
        self.input_mat = T.imatrix()
        self.priming_word = T.iscalar()
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # Symbolic graphs: teacher-forced predictions and greedy search.
        self.predictions = self.create_prediction()
        self.greedy_predictions = self.create_prediction(greedy=True)

        # Cost and compiled functions.
        self.create_cost_fun()
        self.create_training_function()
        self.create_predict_function()

    def stop_on(self, idx):
        """Set the index on which greedy generation stops."""
        self._stop_word.set_value(idx)

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def create_prediction(self, greedy=False):
        """
        Build the scan graph for prediction.

        greedy=False: scan over the columns of ``input_mat`` (last column
        excluded) and return the final layer's outputs, transposed (2, 0, 1).
        greedy=True: start from ``priming_word``, feed the argmax back in for
        at most 200 steps or until the stop word, and return the indices.
        """
        def step(idx, *states):
            # The embedding layer is stateless, hence the leading None.
            activations = self.model.forward(
                idx, prev_hiddens=[None] + list(states))
            if not greedy:
                return activations[1:]
            best = activations[-1].argmax()
            carried = ([best.astype(self.priming_word.dtype)]
                       + activations[1:-1])
            # Stopping condition for greedy search.
            return carried, theano.scan_module.until(
                T.eq(best, self._stop_word))

        if greedy:
            seeds = [dict(initial=self.priming_word, taps=[-1])]
            seeds = seeds + [initial_state_with_taps(layer)
                             for layer in self.model.layers[1:-1]]
            outputs, _ = theano.scan(fn=step, n_steps=200, outputs_info=seeds)
            return outputs[0]

        # Everything up to (not including) the last column is input; each
        # step predicts the following column.
        history = self.input_mat[:, 0:-1]
        batch = history.shape[0]
        seeds = [initial_state_with_taps(layer, batch)
                 for layer in self.model.layers[1:]]
        outputs, _ = theano.scan(fn=step, sequences=[history.T],
                                 outputs_info=seeds)
        # The softmax layer is last in the result list.  Intended order per
        # the original author: (example, timestep, softmax) - not verified.
        return outputs[-1].transpose((2, 0, 1))

    def create_cost_fun(self):
        """Sum of ``masked_loss`` between predictions and the next column."""
        targets = self.input_mat[:, 1:]
        # for_how_long is shifted by one before masking; every row is
        # treated as starting at T=0.
        last_step = self.for_how_long - 1
        first_step = T.zeros_like(self.for_how_long)
        loss = masked_loss(self.predictions, targets, last_step, first_step)
        self.cost = loss.sum()

    def create_predict_function(self):
        """Compile ``pred_fun`` (batch) and ``greedy_fun`` (generation)."""
        self.pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.predictions,
            allow_input_downcast=True)

        # The priming word is prepended to the generated indices.
        generated = T.concatenate(
            [T.shape_padleft(self.priming_word), self.greedy_predictions])
        self.greedy_fun = theano.function(
            inputs=[self.priming_word],
            outputs=generated,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(input_mat, for_how_long) -> cost`` (adadelta)."""
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

    def __call__(self, x):
        """Shortcut for ``pred_fun(x)``."""
        return self.pred_fun(x)
class Model:
    """
    Recurrent forecaster unrolled symbolically over ``steps`` passes.

    A ``StackedCells`` stack (``stack_size`` layers of ``hidden_size`` units)
    is followed by a tanh ``Layer`` of width ``output_size``.  The first pass
    reads two rows of ``pm25in``; later passes read back earlier predictions.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=RNN, steps=40):
        """
        Build the network and compile the Theano functions.

        hidden_size/stack_size: width and count of the stacked cells.
        input_size:  width of the vector fed to the first layer.
        output_size: width of the tanh output layer.
        celltype:    cell class handed to ``StackedCells``.
        steps:       number of unrolled forward passes.
        """
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add the output layer
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=T.tanh))

        self.steps = steps
        self.gfs = T.matrix()         # gfs input data
        self.pm25in = T.matrix()      # initial pm25 data
        self.pm25target = T.matrix()  # prediction target
        self.layerstatus = None
        self.results = None
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # Symbolic expression for one complete pass over the whole sequence.
        self.predictions = self.create_prediction()

        # Cost, error and compiled functions.
        self.create_cost_fun()
        self.create_valid_error()
        self.create_training_function()
        self.create_predict_function()
        self.create_validate_function()
        # Everything above only builds/compiles the symbolic expressions.

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def create_prediction(self):
        """
        Unroll ``steps`` forward passes and return the stacked outputs.

        Each pass consumes three consecutive rows of ``gfs`` plus two pm25
        values.  Side effect: updates ``self.layerstatus``/``self.results``.
        """
        gfs = self.gfs
        pm25in = self.pm25in

        # First forward pass: both pm25 values come from pm25in.
        self.layerstatus = self.model.forward(T.concatenate(
            [gfs[0], gfs[1], gfs[2], pm25in[0], pm25in[1]], axis=0))
        self.results = T.shape_padright(self.layerstatus[-1])

        if self.steps > 1:
            # Second pass: one observed value and the first prediction.
            self.layerstatus = self.model.forward(
                T.concatenate(
                    [gfs[1], gfs[2], gfs[3], pm25in[1], self.results[0]],
                    axis=0),
                self.layerstatus)
            self.results = T.concatenate(
                [self.results, T.shape_padright(self.layerstatus[-1])],
                axis=0)

            # Remaining steps - 2 passes: the two latest predictions.
            for i in range(2, self.steps):
                self.layerstatus = self.model.forward(
                    T.concatenate(
                        [gfs[i], gfs[i + 1], gfs[i + 2],
                         self.results[i - 2], self.results[i - 1]],
                        axis=0),
                    self.layerstatus)
                self.results = T.concatenate(
                    [self.results, T.shape_padright(self.layerstatus[-1])],
                    axis=0)
        return self.results

    def create_cost_fun(self):
        """L2 norm of the prediction error divided by ``steps``."""
        self.cost = (self.predictions - self.pm25target).norm(L=2) / self.steps

    def create_valid_error(self):
        """Element-wise absolute prediction error."""
        self.valid_error = T.abs_(self.predictions - self.pm25target)

    def create_predict_function(self):
        """Compile ``pred_fun(gfs, pm25in) -> predictions``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in],
            outputs=self.predictions,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(gfs, pm25in, pm25target) -> cost`` (adadelta)."""
        # Only the update list is needed; the other four values are unused.
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        # NOTE(review): profile=True is kept from the original; confirm that
        # profiling the training function is still wanted.
        self.update_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target],
            outputs=self.cost,
            updates=updates,
            name='update_fun',
            profile=True,
            allow_input_downcast=True)

    def create_validate_function(self):
        """Compile ``valid_fun(gfs, pm25in, pm25target) -> valid_error``."""
        self.valid_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target],
            outputs=self.valid_error,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in):
        """Shortcut for ``pred_fun(gfs, pm25in)``."""
        return self.pred_fun(gfs, pm25in)
class Model:
    """
    Sequence model over integer indices with epoch-numbered checkpoints.

    Layer layout: an ``Embedding`` (``vocab_size`` -> ``input_size``),
    ``stack_size`` cells of ``hidden_size`` units, and a softmax ``Layer``
    back onto ``vocab_size``.  Checkpoints are written as
    ``<stem>-<epochs><ext>`` next to the path passed to ``save``.
    """

    def __init__(self, hidden_size, input_size, vocab_size, stack_size=1,
                 celltype=LSTM):
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add an embedding
        self.model.layers.insert(0, Embedding(vocab_size, input_size))
        # add a classifier:
        self.model.layers.append(
            Layer(hidden_size, vocab_size, activation=softmax))

        # Index that terminates greedy generation; change it via stop_on().
        self._stop_word = theano.shared(np.int32(999999999), name="stop word")
        self.for_how_long = T.ivector()
        # Integer matrix; columns are consumed as timesteps by the scan.
        self.input_mat = T.imatrix()
        self.priming_word = T.iscalar()
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # create symbolic variables for prediction:
        self.predictions = self.create_prediction()
        # create symbolic variable for greedy search:
        self.greedy_predictions = self.create_prediction(greedy=True)

        # create gradient training functions:
        self.create_cost_fun()
        self.create_training_function()
        self.create_predict_function()

        # For saving state: used as the checkpoint number by save()/load().
        self.epochs = 0

    def stop_on(self, idx):
        """Set the index on which greedy generation stops."""
        self._stop_word.set_value(idx)

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def create_prediction(self, greedy=False):
        """
        Build the scan graph for prediction.

        greedy=False: scan over the columns of ``input_mat`` (last column
        excluded) and return the final layer's outputs, transposed (2, 0, 1).
        greedy=True: start from ``priming_word``, feed the argmax back in for
        at most 200 steps or until the stop word, and return the indices.
        """
        def step(idx, *states):
            # The embedding layer is stateless, so it gets None as its
            # previous state.
            new_hiddens = [None] + list(states)
            new_states = self.model.forward(idx, prev_hiddens=new_hiddens)
            if greedy:
                new_idx = new_states[-1].argmax()
                # provide a stopping condition for greedy search:
                return (
                    [new_idx.astype(self.priming_word.dtype)]
                    + new_states[1:-1]
                ), theano.scan_module.until(T.eq(new_idx, self._stop_word))
            return new_states[1:]

        if greedy:
            outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [
                initial_state_with_taps(layer)
                for layer in self.model.layers[1:-1]
            ]
            result, _ = theano.scan(fn=step, n_steps=200,
                                    outputs_info=outputs_info)
            return result[0]

        # Everything up to (not including) the last column is input; each
        # step predicts the following column.
        inputs = self.input_mat[:, 0:-1]
        num_examples = inputs.shape[0]
        outputs_info = [
            initial_state_with_taps(layer, num_examples)
            for layer in self.model.layers[1:]
        ]
        result, _ = theano.scan(fn=step, sequences=[inputs.T],
                                outputs_info=outputs_info)
        # The softmax layer is last in the result list.  Intended order per
        # the original author: (example, timestep, softmax) - not verified.
        return result[-1].transpose((2, 0, 1))

    def create_cost_fun(self):
        """Sum of ``masked_loss`` between predictions and the next column."""
        what_to_predict = self.input_mat[:, 1:]
        # for_how_long is shifted by one before masking; every row is
        # treated as starting at T=0.
        for_how_long = self.for_how_long - 1
        starting_when = T.zeros_like(self.for_how_long)
        self.cost = masked_loss(self.predictions, what_to_predict,
                                for_how_long, starting_when).sum()

    def create_predict_function(self):
        """Compile ``pred_fun`` (batch) and ``greedy_fun`` (generation)."""
        self.pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.predictions,
            allow_input_downcast=True)
        self.greedy_fun = theano.function(
            inputs=[self.priming_word],
            outputs=T.concatenate(
                [T.shape_padleft(self.priming_word),
                 self.greedy_predictions]),
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(input_mat, for_how_long) -> cost`` (adadelta)."""
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

    @staticmethod
    def _checkpoint_glob(path):
        """Return the glob pattern ``<stem>-*<ext>`` for ``path``."""
        stem, ext = os.path.splitext(path)
        return "%s-*%s" % (stem, ext)

    @staticmethod
    def _checkpoint_epoch(name):
        """Return the integer after the last ``-`` in the stem, or None."""
        try:
            return int(os.path.splitext(name)[0].split("-")[-1])
        except ValueError:
            return None

    @classmethod
    def _latest_epoch(cls, files):
        """Return the highest epoch among ``files``, or None if none parse."""
        epochs = [cls._checkpoint_epoch(name) for name in files]
        epochs = [epoch for epoch in epochs if epoch is not None]
        return max(epochs) if epochs else None

    def clean_up_files(self, load_path):
        """
        Delete every checkpoint of ``load_path`` except the newest one.

        Best effort: files whose name carries no epoch number are skipped and
        a failed removal is reported without aborting the clean-up.
        """
        files = glob.glob(self._checkpoint_glob(load_path))
        latest = self._latest_epoch(files)

        print("Cleaning up redundant files.")
        for name in files:
            age = self._checkpoint_epoch(name)
            if age is None or latest is None or age >= latest:
                continue
            print(".. removing %s" % name)
            try:
                os.remove(name)
            except OSError as err:
                print(".. could not remove %s: %s" % (name, err))

    def save(self, save_path, clean=False):
        """
        Pickle the model parameters to ``<stem>-<epochs><ext>``.

        save_path: base path; the current ``self.epochs`` is inserted before
                   the extension.
        clean:     when true, older checkpoints are removed afterwards.
        """
        stem, ext = os.path.splitext(save_path)
        path = "%s-%s%s" % (stem, self.epochs, ext)
        with open(path, 'wb') as f:
            pickle.dump(self.model.params, f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        print("Saved model to %s" % path)
        if clean:
            self.clean_up_files(save_path)

    def load(self, load_path):
        """
        Load the newest checkpoint of ``load_path`` and restore ``epochs``.

        Prints a message and leaves the model untouched when no numbered
        checkpoint exists.  A checkpoint saved at epoch 0 is loadable.
        """
        glob_path = self._checkpoint_glob(load_path)
        files = glob.glob(glob_path)
        print("%s %s" % (glob_path, files))

        latest = self._latest_epoch(files)
        if latest is None:
            print("Sorry, there is no file I can open with that name")
            return

        stem, ext = os.path.splitext(load_path)
        path = "%s-%s%s" % (stem, latest, ext)
        theano.config.reoptimize_unpickled_function = False
        theano.gof.compilelock.set_lock_status(False)
        with open(path, 'rb') as f:
            print("Loading model from  %s" % path)
            # NOTE(security): unpickling runs arbitrary code from the file;
            # only load checkpoints from a trusted source.
            self.model.params = pickle.load(f)
        self.epochs = latest

    def __call__(self, x):
        """Shortcut for ``pred_fun(x)``."""
        return self.pred_fun(x)
class Model(object):
    """
    Recurrent forecaster driven by ``theano.scan`` for a fixed 40 steps.

    A ``StackedCells`` stack (``stack_size`` layers of ``hidden_size`` units)
    is followed by a tanh ``Layer`` of width ``output_size``.  Each scan step
    sees three taps of ``gfs`` and the two previous outputs, seeded from
    ``pm25in``.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=RNN):
        cells = StackedCells(input_size, celltype=celltype,
                             layers=[hidden_size] * stack_size)
        # Output layer on top of the recurrent stack.
        cells.layers.append(Layer(hidden_size, output_size, activation=T.tanh))
        self.model = cells

        self.steps = 40
        self.gfs = T.matrix('gfs')                # gfs input data
        self.pm25in = T.matrix('pm25in')          # initial pm25 data
        self.pm25target = T.matrix('pm25target')  # prediction target

        # Symbolic expression for one complete pass over the whole sequence.
        self.predictions = self.create_prediction()

        # Cost, error and compiled functions.
        self.create_cost_fun()
        self.create_valid_error()
        self.create_training_function()
        self.create_predict_function()
        self.create_validate_function()
        # Everything above only builds/compiles the symbolic expressions.

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def create_prediction(self):
        """Build the scan graph and return the output-layer sequence."""
        def oneStep(gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1,
                    *prev_hiddens):
            joined = T.concatenate(
                [gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1], axis=0)
            states = self.model.forward(joined, prev_hiddens)
            # Rotate so the output layer comes first, matching outputs_info.
            return [states[-1]] + states[:-1]

        gfs_taps = dict(input=self.gfs, taps=[-2, -1, 0])
        seeds = [dict(initial=self.pm25in, taps=[-2, -1])]
        for layer in self.model.layers:
            if hasattr(layer, 'initial_hidden_state'):
                seeds.append(
                    dict(initial=layer.initial_hidden_state, taps=[-1]))

        result, updates = theano.scan(
            oneStep,
            n_steps=self.steps,
            sequences=[gfs_taps],
            outputs_info=seeds)
        # result[0] is the output layer (states[-1] of every step); the
        # remaining entries are the earlier layers.
        return result[0]

    def create_cost_fun(self):
        """L2 norm of the prediction error divided by ``steps``."""
        # The original author notes the cost function may still change.
        residual = self.predictions - self.pm25target
        self.cost = residual.norm(L=2) / self.steps

    def create_valid_error(self):
        """Element-wise absolute prediction error."""
        self.valid_error = T.abs_(self.predictions - self.pm25target)

    def create_predict_function(self):
        """Compile ``pred_fun(gfs, pm25in) -> predictions``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in],
            outputs=self.predictions,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(gfs, pm25in, pm25target) -> cost`` (adadelta)."""
        # This is the gradient-descent step; only the updates are used.
        updates, gsums, xsums, lr, max_norm = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target],
            outputs=self.cost,
            updates=updates,
            name='update_fun',
            profile=True,
            allow_input_downcast=True)

    def create_validate_function(self):
        """Compile ``valid_fun(gfs, pm25in, pm25target) -> valid_error``."""
        self.valid_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target],
            outputs=self.valid_error,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in):
        """Shortcut for ``pred_fun(gfs, pm25in)``."""
        return self.pred_fun(gfs, pm25in)
class Model:
    """
    Recurrent forecaster driven by ``theano.scan`` with a symbolic step count.

    A ``StackedCells`` stack (``stack_size`` layers of ``hidden_size`` units)
    is followed by a tanh ``Layer`` of width ``output_size``.  The number of
    scan steps is a runtime input (``self.steps``) of every compiled function.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=RNN):
        cells = StackedCells(input_size, celltype=celltype,
                             layers=[hidden_size] * stack_size)
        # Output layer on top of the recurrent stack.
        cells.layers.append(Layer(hidden_size, output_size, activation=T.tanh))
        self.model = cells

        self.steps = T.iscalar()
        self.gfs = T.matrix()         # gfs input data
        self.pm25in = T.matrix()      # initial pm25 data
        self.pm25target = T.matrix()  # prediction target
        self.srng = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))

        # Symbolic expression for one complete pass over the whole sequence.
        self.predictions = self.create_prediction()

        # Cost and compiled functions.
        self.create_cost_fun()
        self.create_training_function()
        self.create_predict_function()
        # Everything above only builds/compiles the symbolic expressions.

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def create_prediction(self):
        """Build the scan graph and return the output-layer sequence."""
        def oneStep(gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1,
                    *prev_hiddens):
            # NOTE(review): the five taps are added element-wise here rather
            # than concatenated - confirm this is intended.
            mixed = gfs_tm2 + gfs_tm1 + gfs_t + pm25_tm2 + pm25_tm1
            states = self.model.forward(mixed, prev_hiddens)
            # Rotate so the output layer comes first, matching outputs_info.
            return [states[-1]] + states[:-1]

        gfs_taps = dict(input=self.gfs, taps=[-2, -1, 0])
        seeds = [dict(initial=self.pm25in, taps=[-2, -1])]
        for layer in self.model.layers:
            if hasattr(layer, 'initial_hidden_state'):
                seeds.append(
                    dict(initial=layer.initial_hidden_state, taps=[-1]))

        result, updates = theano.scan(
            oneStep,
            n_steps=self.steps,
            sequences=[gfs_taps],
            outputs_info=seeds)
        return result[0]

    def create_cost_fun(self):
        """L2 norm of the prediction error divided by ``steps``."""
        residual = self.predictions - self.pm25target
        self.cost = residual.norm(L=2) / self.steps

    def create_predict_function(self):
        """Compile ``pred_fun(gfs, pm25in, steps) -> predictions``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.steps],
            outputs=self.predictions,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(gfs, pm25in, pm25target, steps) -> cost``."""
        # This is the gradient-descent step; only the updates are used.
        updates, gsums, xsums, lr, max_norm = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target, self.steps],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in, steps):
        """Shortcut for ``pred_fun(gfs, pm25in, steps)``."""
        return self.pred_fun(gfs, pm25in, steps)
class Model:
    """
    Per-frame network applied along the second axis of a 3-D input.

    A ``StackedCells`` stack (``stack_size`` layers of ``hidden_size`` units,
    ``celltype`` defaulting to ``Layer``) is followed by a tanh ``Layer`` of
    width ``output_size``.  Two prediction graphs are built: one scanning a
    fixed ``steps`` frames and one scanning a runtime count ``stepsin``.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=Layer, steps=40):
        """
        Build the network and compile the Theano functions.

        hidden_size/stack_size: width and count of the stacked layers.
        input_size:  width of the vector fed to the first layer.
        output_size: width of the tanh output layer.
        celltype:    cell class handed to ``StackedCells``.
        steps:       frame count used by the fixed-length prediction graph.
        """
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add the output layer
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=T.tanh))

        self.steps = steps
        self.stepsin = T.iscalar('stepsin')
        self.x = T.tensor3('x')  # input data
        # Target; its dimensionality was changed in this version.
        self.target = T.tensor3('target')
        self.layerstatus = None
        self.results = None

        # Symbolic expressions for one complete pass over the sequence.
        self.predictions = self.create_prediction()
        self.predictions2 = self.create_prediction2()

        # Cost, error and compiled functions.
        self.create_cost_fun()
        self.create_valid_error()
        self.create_training_function()
        self.create_predict_function()
        self.create_validate_function()
        # Everything above only builds/compiles the symbolic expressions.

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def _scan_forward(self, n_steps):
        """Scan ``model.forward`` over ``n_steps`` frames of ``self.x``."""
        def step(frame):
            # Every layer's activation is emitted, recurrent or not.  No
            # previous state is passed, so each frame is processed on its own.
            return self.model.forward(frame)

        # Move the frame axis to the front for scan, and back afterwards.
        frames_first = self.x.dimshuffle((1, 0, 2))
        result, _ = theano.scan(
            fn=step,
            n_steps=n_steps,
            sequences=dict(input=frames_first, taps=[0]))
        return result[-1].dimshuffle((1, 0, 2))

    def create_prediction(self):
        """Prediction graph over the fixed ``self.steps`` frames."""
        return self._scan_forward(self.steps)

    def create_prediction2(self):
        """Prediction graph over the runtime frame count ``self.stepsin``."""
        return self._scan_forward(self.stepsin)

    def create_cost_fun(self):
        """L2 norm of the error against the first target channel."""
        self.cost = (self.predictions - self.target[:, :, 0:1]).norm(L=2)

    def create_valid_error(self):
        """Mean absolute error over axis 0 against the first target channel."""
        self.valid_error = T.mean(
            T.abs_(self.predictions - self.target[:, :, 0:1]), axis=0)

    def create_predict_function(self):
        """Compile ``pred_fun(x)`` and ``pred_fun2(x, stepsin)``."""
        self.pred_fun = theano.function(
            inputs=[self.x],
            outputs=self.predictions,
            allow_input_downcast=True)
        self.pred_fun2 = theano.function(
            inputs=[self.x, self.stepsin],
            outputs=self.predictions2,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(x, target) -> cost`` (adadelta)."""
        # This is the gradient-descent step; only the updates are used.
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.x, self.target],
            outputs=self.cost,
            updates=updates,
            name='update_fun',
            profile=False,
            allow_input_downcast=True)

    def create_validate_function(self):
        """Compile ``valid_fun(x, target) -> valid_error``."""
        self.valid_fun = theano.function(
            inputs=[self.x, self.target],
            outputs=self.valid_error,
            allow_input_downcast=True)

    def __call__(self, x):
        """Shortcut for ``pred_fun(x)``."""
        return self.pred_fun(x)
class Model:
    """
    Recurrent forecaster unrolled over 46 passes fed entirely from inputs.

    A ``StackedCells`` stack (``stack_size`` layers of ``hidden_size`` units)
    is followed by a ``Layer`` with an identity activation.  Every pass reads
    its pm25 values from ``pm25in``; predictions are never fed back.  Cost and
    validation error are taken over prediction columns 6..45.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=RNN, steps=40):
        """
        Build the network and compile the Theano functions.

        hidden_size/stack_size: width and count of the stacked cells.
        input_size:  width of the vector fed to the first layer.
        output_size: width of the linear output layer.
        celltype:    cell class handed to ``StackedCells``.
        steps:       stored on the instance; the unrolled length and the
                     scored column range are hard-coded in this variant.
        """
        # declare model
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add the output layer (identity activation)
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=lambda x: x))

        self.steps = steps
        self.gfs = T.tensor3('gfs')        # gfs input data
        self.pm25in = T.tensor3('pm25in')  # initial pm25 data
        # Target; its dimensionality was changed in this version.
        self.pm25target = T.matrix('pm25target')
        self.layerstatus = None
        self.results = None
        self.cnt = T.tensor3('cnt')

        # Symbolic expression for one complete pass over the whole sequence.
        self.predictions = self.create_prediction()

        # Cost, error and compiled functions.
        self.create_cost_fun()
        self.create_valid_error()
        self.create_training_function()
        self.create_predict_function()
        self.create_validate_function()
        # Everything above only builds/compiles the symbolic expressions.

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def create_prediction(self):
        """
        Unroll 46 forward passes and return their outputs joined on axis 1.

        ``gfs`` and ``pm25in`` are indexed by frame on their second axis and
        ``cnt`` on its last axis.  Side effect: updates
        ``self.layerstatus``/``self.results``.
        """
        gfs = self.gfs
        pm25in = self.pm25in

        # First forward pass: three gfs frames and two pm25 frames.
        gfs_x = T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2]], axis=1)
        pm25in_x = T.concatenate([pm25in[:, 0], pm25in[:, 1]], axis=1)
        self.layerstatus = self.model.forward(
            T.concatenate([gfs_x, pm25in_x, self.cnt[:, :, 0]], axis=1))
        self.results = self.layerstatus[-1]

        # Passes 1..45.  Each pass slides the window: drop the oldest 9 gfs
        # columns (presumably one frame - TODO confirm) and the oldest pm25
        # column, then append the next frame.  The first 6 outputs are never
        # scored; column 6 is the first one used by the cost.
        for i in range(1, 46):
            gfs_x = T.concatenate([gfs_x[:, 9:], gfs[:, i + 2]], axis=1)
            pm25in_x = T.concatenate(
                [pm25in_x[:, 1:], pm25in[:, i + 1]], axis=1)
            self.layerstatus = self.model.forward(
                T.concatenate([gfs_x, pm25in_x, self.cnt[:, :, i]], axis=1),
                self.layerstatus)
            self.results = T.concatenate(
                [self.results, self.layerstatus[-1]], axis=1)
        return self.results

    def create_cost_fun(self):
        """L2 norm of the error over prediction columns 6..45."""
        self.cost = (self.predictions[:, 6:46]
                     - self.pm25target[:, 6:46]).norm(L=2)

    def create_valid_error(self):
        """Mean absolute error over axis 0 for prediction columns 6..45."""
        self.valid_error = T.mean(
            T.abs_(self.predictions[:, 6:46] - self.pm25target[:, 6:46]),
            axis=0)

    def create_predict_function(self):
        """Compile ``pred_fun(gfs, pm25in, cnt) -> predictions``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.cnt],
            outputs=self.predictions,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(gfs, pm25in, pm25target, cnt) -> cost``."""
        # This is the gradient-descent step; only the updates are used.
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target, self.cnt],
            outputs=self.cost,
            updates=updates,
            name='update_fun',
            profile=False,
            allow_input_downcast=True)

    def create_validate_function(self):
        """Compile ``valid_fun(gfs, pm25in, pm25target, cnt) -> valid_error``."""
        self.valid_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target, self.cnt],
            outputs=self.valid_error,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in, cnt):
        """
        Run the compiled prediction function.

        ``cnt`` is required: ``pred_fun`` is compiled with three inputs, so
        the previous two-argument call could never succeed.
        """
        return self.pred_fun(gfs, pm25in, cnt)
class Model:
    """
    Recurrent forecaster with explicitly created initial hidden states.

    A ``StackedCells`` stack (``stack_size`` layers of ``hidden_size`` units
    of type ``celltype``) is followed by a tanh ``Layer`` of width
    ``output_size``.  The network is unrolled over ``steps`` passes; from the
    second pass on, earlier predictions replace the observed pm25 values.
    """

    def __init__(self, hidden_size, input_size, output_size, stack_size=1,
                 celltype=RNN, steps=40):
        """
        Build the network and compile the Theano functions.

        hidden_size/stack_size: width and count of the stacked cells.
        input_size:  width of the vector fed to the first layer.
        output_size: width of the tanh output layer.
        celltype:    ``RNN`` or ``LSTM``; it decides the width of the initial
                     hidden states.
        steps:       number of unrolled forward passes.
        """
        # declare model
        self.celltype = celltype
        self.model = StackedCells(input_size, celltype=celltype,
                                  layers=[hidden_size] * stack_size)
        # add the output layer
        self.model.layers.append(
            Layer(hidden_size, output_size, activation=T.tanh))

        self.steps = steps
        self.gfs = T.tensor3('gfs')        # gfs input data
        self.pm25in = T.tensor3('pm25in')  # initial pm25 data
        # Target; its dimensionality was changed in this version.
        self.pm25target = T.matrix('pm25target')
        self.layerstatus = None
        self.results = None
        self.cnt = T.tensor3('cnt')

        # Symbolic expression for one complete pass over the whole sequence.
        self.predictions = self.create_prediction()

        # Cost, error and compiled functions.
        self.create_cost_fun()
        self.create_valid_error()
        self.create_training_function()
        self.create_predict_function()
        self.create_validate_function()
        # Everything above only builds/compiles the symbolic expressions.

    @property
    def params(self):
        """Parameters of the underlying ``StackedCells`` model."""
        return self.model.params

    def _initial_hiddens(self, x):
        """
        Create one initial hidden state per layer for the first pass.

        Layers without an ``initial_hidden_state`` attribute get ``None``.
        The state is ``hidden_size`` wide for ``RNN`` and twice that for
        ``LSTM``; for a batched ``x`` it is repeated ``x.shape[0]`` times.

        Raises ValueError for any other cell type (previously this surfaced
        as an unbound-local error further down).
        """
        if self.celltype == RNN:
            label, width_factor = "RNN.initial_hidden_state", 1
        elif self.celltype == LSTM:
            label, width_factor = "LSTM.initial_hidden_state", 2
        else:
            raise ValueError(
                "unsupported celltype %r: expected RNN or LSTM"
                % (self.celltype,))

        hiddens = []
        for layer in self.model.layers:
            if not hasattr(layer, 'initial_hidden_state'):
                hiddens.append(None)
                continue
            # NOTE(review): a fresh shared variable is created here instead of
            # reusing layer.initial_hidden_state - confirm this is intended.
            state = create_shared(layer.hidden_size * width_factor, name=label)
            if x.ndim > 1:
                state = T.repeat(T.shape_padleft(state), x.shape[0], axis=0)
            hiddens.append(state)
        return hiddens

    def create_prediction(self):
        """
        Unroll ``steps`` forward passes and return their outputs on axis 1.

        ``gfs`` and ``pm25in`` are indexed by frame on their second axis and
        ``cnt`` on its last axis.  Side effect: updates
        ``self.layerstatus``/``self.results``.
        """
        gfs = self.gfs
        pm25in = self.pm25in

        # First forward pass: both pm25 values come from pm25in.
        x = T.concatenate(
            [gfs[:, 0], gfs[:, 1], gfs[:, 2],
             pm25in[:, 0], pm25in[:, 1], self.cnt[:, :, 0]],
            axis=1)
        self.layerstatus = self.model.forward(x, self._initial_hiddens(x))
        self.results = self.layerstatus[-1]

        if self.steps > 1:
            # Second pass: one observed value and the first prediction.
            self.layerstatus = self.model.forward(
                T.concatenate(
                    [gfs[:, 1], gfs[:, 2], gfs[:, 3],
                     pm25in[:, 1], self.results, self.cnt[:, :, 1]],
                    axis=1),
                self.layerstatus)
            self.results = T.concatenate(
                [self.results, self.layerstatus[-1]], axis=1)

            # Remaining steps - 2 passes: the two latest predictions.
            for i in range(2, self.steps):
                self.layerstatus = self.model.forward(
                    T.concatenate(
                        [gfs[:, i], gfs[:, i + 1], gfs[:, i + 2],
                         T.shape_padright(self.results[:, i - 2]),
                         T.shape_padright(self.results[:, i - 1]),
                         self.cnt[:, :, i]],
                        axis=1),
                    self.layerstatus)
                self.results = T.concatenate(
                    [self.results, self.layerstatus[-1]], axis=1)
        return self.results

    def create_cost_fun(self):
        """L2 norm of the prediction error."""
        self.cost = (self.predictions - self.pm25target).norm(L=2)

    def create_valid_error(self):
        """Mean absolute prediction error over axis 0."""
        self.valid_error = T.mean(
            T.abs_(self.predictions - self.pm25target), axis=0)

    def create_predict_function(self):
        """Compile ``pred_fun(gfs, pm25in, cnt) -> predictions``."""
        self.pred_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.cnt],
            outputs=self.predictions,
            allow_input_downcast=True)

    def create_training_function(self):
        """Compile ``update_fun(gfs, pm25in, pm25target, cnt) -> cost``."""
        # This is the gradient-descent step; only the updates are used.
        updates, _, _, _, _ = create_optimization_updates(
            self.cost, self.params, method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target, self.cnt],
            outputs=self.cost,
            updates=updates,
            name='update_fun',
            profile=False,
            allow_input_downcast=True)

    def create_validate_function(self):
        """Compile ``valid_fun(gfs, pm25in, pm25target, cnt) -> valid_error``."""
        self.valid_fun = theano.function(
            inputs=[self.gfs, self.pm25in, self.pm25target, self.cnt],
            outputs=self.valid_error,
            allow_input_downcast=True)

    def __call__(self, gfs, pm25in, cnt):
        """
        Run the compiled prediction function.

        ``cnt`` is required: ``pred_fun`` is compiled with three inputs, so
        the previous two-argument call could never succeed.
        """
        return self.pred_fun(gfs, pm25in, cnt)
class RelativeShiftLSTMStack( object ):
    """
    Manages a stack of LSTM cells with potentially a relative shift applied
    to part of each layer's memory between time steps.
    """

    def __init__(self, input_parts, layer_sizes, output_size, window_size=0, dropout=0, mode="drop", unroll_batch_num=None):
        """
        Parameters:
            input_parts: A list of InputParts
            layer_sizes: A list of the form [ (indep, per_note), ... ] where
                    indep is the number of non-shifted cells to have, and
                    per_note is the number of cells to have per window note, which shift as the
                        network moves
                    Alternately can just be [ indep, ... ]
            output_size: An integer, the width of the desired output
            window_size: Number of window notes each per_note group is replicated over
            dropout: How much dropout to apply.
            mode: Either "drop" or "roll". If drop, discard memory that goes out of range. If roll, roll it instead
            unroll_batch_num: If not None, the shift is unrolled in Python over this many
                    batch entries instead of using theano.map
        """
        self.input_parts = input_parts
        self.window_size = window_size

        # Normalise plain ints to (indep, 0) so every entry is a pair.
        layer_sizes = [x if isinstance(x, tuple) else (x, 0) for x in layer_sizes]
        self.layer_sizes = layer_sizes
        self.tot_layer_sizes = [(indep + per_note * self.window_size) for indep, per_note in layer_sizes]

        self.output_size = output_size
        self.dropout = dropout
        self.input_size = sum(part.PART_WIDTH for part in input_parts)

        self.cells = StackedCells(self.input_size, celltype=LSTM, activation=T.tanh, layers=self.tot_layer_sizes)
        # Linear (identity-activation) output layer on top of the LSTM stack.
        self.cells.layers.append(Layer(self.tot_layer_sizes[-1], self.output_size, activation=lambda x: x))

        assert mode in ("drop", "roll"), "Must specify either drop or roll mode"
        self.mode = mode
        self.unroll_batch_num = unroll_batch_num

    @property
    def params(self):
        """Parameters of the cells followed by the initial hidden states of layers that have one."""
        return self.cells.params + [layer.initial_hidden_state for layer in self.cells.layers if has_hidden(layer)]

    @params.setter
    def params(self, paramlist):
        self.cells.params = paramlist[:len(self.cells.params)]
        hidden_layers = (layer for layer in self.cells.layers if has_hidden(layer))
        for layer, val in zip(hidden_layers, paramlist[len(self.cells.params):]):
            layer.initial_hidden_state.set_value(val.get_value())

    def perform_step(self, in_data, shifts, hiddens, dropout_masks=None):
        """
        Perform a step through the LSTM network.

        in_data: A theano tensor (float32) of shape (batch, input_size)
        shifts: A theano tensor (int32) of shape (batch), giving the relative
            shifts to apply to the last hiddens
        hiddens: A list of hiddens [layer](batch, hidden_idx)
        dropout_masks: If None or empty, no masks are passed to the cells.
            Otherwise, should be the raw list of per-layer masks (one per
            LSTM layer, WITHOUT a leading None), generally passed through
            a scan as a non-sequence. This method prepends the None entry
            for the first layer's input itself.

        Returns: the list returned by the cells' forward pass (one entry per layer).
        """
        # hiddens is of shape [layer](batch, hidden_idx)
        # We want to permute the hidden_idx values according to shifts,
        # which are ints of shape (batch)
        n_batch = in_data.shape[0]
        window = self.window_size

        def _clamp_w(x):
            # Clamp a (symbolic) shift amount into [0, window].
            return T.maximum(0, T.minimum(x, window))

        new_hiddens = []
        for layer_i, (indep, per_note) in enumerate(self.layer_sizes):
            if per_note == 0:
                # Nothing to shift in this layer
                new_hiddens.append(hiddens[layer_i])
                continue

            # The theano_lstm code puts [memory_cells... , old_activations...]
            # We want to slide the memory cells only.
            lstm_hsplit = self.cells.layers[layer_i].hidden_size
            layer_hidden = hiddens[layer_i]
            indep_mem = layer_hidden[:, :indep]
            per_note_mem = layer_hidden[:, indep:lstm_hsplit]
            remaining_values = layer_hidden[:, lstm_hsplit:]

            # per_note_mem is (batch, window*per_note); separated_mem is (batch, note, mem)
            separated_mem = per_note_mem.reshape((n_batch, window, per_note))

            # [a b c ... x y z] shifted up 1   (+1) goes to  [b c ... x y z 0]
            # [a b c ... x y z] shifted down 1 (-1) goes to [0 a b c ... x y]
            def _shift_step(c_mem, c_shift):
                # c_mem is (note, mem); c_shift is an int
                if self.mode == "drop":
                    ins_at_front = T.zeros((_clamp_w(-c_shift), per_note))
                    ins_at_back = T.zeros((_clamp_w(c_shift), per_note))
                    take_part = c_mem[_clamp_w(c_shift):window - _clamp_w(-c_shift), :]
                    return T.concatenate([ins_at_front, take_part, ins_at_back], 0)
                elif self.mode == "roll":
                    # NOTE(review): the modulus 12 is hard-coded rather than taken
                    # from window_size; presumably roll mode is only used with a
                    # 12-note window -- confirm before using another size.
                    return T.roll(c_mem, (-c_shift) % 12, axis=0)

            if self.unroll_batch_num is None:
                shifted_mem, _ = theano.map(_shift_step, [separated_mem, shifts])
            else:
                shifted_mem = T.stack([_shift_step(separated_mem[i], shifts[i])
                                       for i in range(self.unroll_batch_num)])

            new_per_note_mem = shifted_mem.reshape((n_batch, window * per_note))
            new_hiddens.append(T.concatenate([indep_mem, new_per_note_mem, remaining_values], 1))

        if not dropout_masks or not self.dropout:
            masks = []
        else:
            # The first layer's input is never dropped out.
            masks = [None] + list(dropout_masks)
        return self.cells.forward(in_data, prev_hiddens=new_hiddens, dropout=masks)

    def do_preprocess_scan(self, deterministic_dropout=False, **kwargs):
        """
        Run a scan using this LSTM, preprocessing all inputs before the scan.

        Parameters:
            kwargs[k]: should be a theano tensor of shape (n_batch, n_time, ... )
                Note that "relative_position" should be a keyword argument given here if there are relative
                shifts.
            deterministic_dropout: If True, no dropout masks are sampled. If False (and dropout is
                nonzero), masks are sampled once and shared across time steps.

        Returns:
            A theano tensor of shape (n_batch, n_time, output_size) of activations
        """
        assert len(kwargs) > 0, "Need at least one input argument!"
        n_batch, n_time = list(kwargs.values())[0].shape[:2]

        # Merge batch and time so every input part can be generated in one shot.
        squashed_kwargs = {
            k: v.reshape([n_batch * n_time] + [x for x in v.shape[2:]])
            for k, v in kwargs.items()
        }
        full_input = T.concatenate([part.generate(**squashed_kwargs) for part in self.input_parts], 1)
        # Back to (n_batch, n_time, input_size), then time-major for scan.
        adjusted_input = full_input.reshape([n_batch, n_time, self.input_size]).dimshuffle((1, 0, 2))

        if "relative_position" in kwargs:
            relative_position = kwargs["relative_position"]
            diff_shifts = T.extra_ops.diff(relative_position, axis=1)
            # No shift is applied at the first time step.
            cat_shifts = T.concatenate([T.zeros((n_batch, 1), 'int32'), diff_shifts], 1)
            shifts = cat_shifts.dimshuffle((1, 0))
        else:
            # The shape must be passed as a single tuple; the original call
            # spread it over separate positional arguments.
            shifts = T.zeros((n_time, n_batch), 'int32')

        use_masks = bool(self.dropout) and not deterministic_dropout

        def _scan_fn(in_data, shifts, *other):
            other = list(other)
            if use_masks:
                split = -len(self.tot_layer_sizes)
                hiddens = other[:split]
                # Pass the raw masks: perform_step prepends the leading None
                # itself, so adding one here would misalign masks and layers.
                masks = other[split:]
            else:
                masks = []
                hiddens = other
            return self.perform_step(in_data, shifts, hiddens, dropout_masks=masks)

        if use_masks:
            dropout_masks = UpscaleMultiDropout([(n_batch, shape) for shape in self.tot_layer_sizes], self.dropout)
        else:
            dropout_masks = []

        outputs_info = [initial_state_with_taps(layer, n_batch) for layer in self.cells.layers]
        result, _ = theano.scan(fn=_scan_fn, sequences=[adjusted_input, shifts], non_sequences=dropout_masks, outputs_info=outputs_info)

        final_out = get_last_layer(result).transpose((1, 0, 2))
        return final_out

    def prepare_sample_scan(self, start_pos, start_out, deterministic_dropout=False, **kwargs):
        """
        Prepare a sample scan

        Parameters:
            kwargs[k]: should be a theano tensor of shape (n_batch, n_time, ... )
                Note that "relative_position" should be a keyword argument given here if there are relative
                shifts, as should "timestep"
            start_pos: a theano tensor of shape (n_batch) giving the initial position passed to the
                out_to_in function
            start_out: a theano tensor of shape (n_batch, X) giving the initial "output" passed
                to the out_to_in_fn
            deterministic_dropout: If True, no dropout masks are sampled. If False (and dropout is
                nonzero), masks are sampled and passed as non_sequences.

        Returns:
            A SampleScanSpec, where
                sequences: a list of sequences to input into scan
                non_sequences: a list of non_sequences into scan
                outputs_info: a list of outputs_info for scan
                num_taps: the number of outputs with taps for this
                (other values): for internal use
        """
        assert len(kwargs) > 0, "Need at least one input argument!"
        n_batch, _ = list(kwargs.values())[0].shape[:2]

        # Swap batch and time so scan iterates over time.
        transp_kwargs = {
            k: v.dimshuffle((1, 0) + tuple(range(2, v.ndim)))
            for k, v in kwargs.items()
        }

        if self.dropout and not deterministic_dropout:
            dropout_masks = UpscaleMultiDropout([(n_batch, shape) for shape in self.tot_layer_sizes], self.dropout)
        else:
            dropout_masks = []

        outputs_info = [{"initial": start_pos, "taps": [-1]}, {"initial": start_out, "taps": [-1]}]
        outputs_info += [initial_state_with_taps(layer, n_batch) for layer in self.cells.layers]
        num_taps = sum(1 for info in outputs_info if info is not None)

        return SampleScanSpec(sequences=list(transp_kwargs.values()),
                              non_sequences=dropout_masks,
                              outputs_info=outputs_info,
                              num_taps=num_taps,
                              kwargs_keys=list(transp_kwargs.keys()),
                              deterministic_dropout=deterministic_dropout,
                              start_pos=start_pos)

    def sample_scan_routine(self, spec, *inputs):
        """
        Start a scan routine. This is implemented as a generator, since we may need to interrupt the state in the
        middle of iteration. How to use:

        scan_rout = x.sample_scan_routine(spec, *inputs)
                - spec: The SampleScanSpec returned by prepare_sample_scan
                - *inputs: The scan inputs, in [ sequences..., taps..., non_sequences... ] order

        last_rel_pos, last_out, cur_kwargs = scan_rout.send(None)
                - last_rel_pos is a theano tensor of shape (n_batch)
                - last_out will be a theano tensor of shape (n_batch, output_size)
                - cur_kwargs[k] is a theano tensor of shape (n_batch, ...), from kwargs

        out_activations = scan_rout.send((new_pos, addtl_kwargs))
                - new_pos is a theano tensor of shape (n_batch), giving the new relative position
                - addtl_kwargs[k] is a theano tensor of shape (n_batch, ...) to be added to cur kwargs
                    Note that "relative_position" will be added automatically.
                    The merged kwargs must contain "timestep".

        scan_outputs = scan_rout.send(new_out)
                - new_out is a tensor of shape (n_batch, X) to be output

        scan_rout.close()

        -> scan_outputs should be returned back to scan
        """
        stuff = list(inputs)
        n_seq = len(spec.kwargs_keys)
        cur_kwargs = dict(zip(spec.kwargs_keys, stuff[:n_seq]))
        last_pos, last_out = stuff[n_seq:n_seq + 2]
        other = stuff[n_seq + 2:]

        if self.dropout and not spec.deterministic_dropout:
            split = -len(self.tot_layer_sizes)
            hiddens = other[:split]
            # Raw masks only: perform_step prepends the leading None itself.
            masks = other[split:]
        else:
            masks = []
            hiddens = other

        cur_pos, addtl_kwargs = yield (last_pos, last_out, cur_kwargs)

        all_kwargs = {"relative_position": cur_pos}
        all_kwargs.update(cur_kwargs)
        all_kwargs.update(addtl_kwargs)

        # No shift at the very first timestep.
        shift = T.switch(T.eq(all_kwargs["timestep"], 0), 0, cur_pos - last_pos)
        full_input = T.concatenate([part.generate(**all_kwargs) for part in self.input_parts], 1)

        step_stuff = self.perform_step(full_input, shift, hiddens, dropout_masks=masks)
        raw_output = step_stuff[-1]
        sampled_output = yield raw_output

        # Scan outputs: position, sampled output, then every layer's output
        # (the last of which is the raw output).
        yield [cur_pos, sampled_output] + step_stuff

    def extract_sample_scan_results(self, spec, outputs):
        """
        Extract outputs from the scan results.

        Parameters:
            spec: The SampleScanSpec returned by prepare_sample_scan
            outputs: The outputs from the scan associated with this stack, laid out as
                produced by sample_scan_routine: [positions, sampled_output, layer outputs..., raw_output]

        Returns:
            positions, raw_output, sampled_output
        """
        # Positions are reported one step late by the scan, so prepend the
        # start position and drop the final one.
        positions = T.concatenate([T.shape_padright(spec.start_pos), outputs[0].transpose((1, 0))[:, :-1]], 1)
        # Index 1 is the sampled output; index 2 would be the first layer's
        # hidden state.
        sampled_output = outputs[1].transpose((1, 0, 2))
        raw_output = outputs[-1].transpose((1, 0, 2))
        return positions, raw_output, sampled_output

    def do_sample_scan(self, start_pos, start_out, sample_fn, out_to_in_fn, deterministic_dropout=True, **kwargs):
        """
        Run a scan using this LSTM, sampling and processing as we go.

        Not implemented. Build the scan manually with prepare_sample_scan,
        sample_scan_routine and extract_sample_scan_results instead.

        Parameters:
            kwargs[k]: should be a theano tensor of shape (n_batch, n_time, ... )
                Note that "relative_position" should be a keyword argument given here if there are relative
                shifts.
            start_pos: a theano tensor of shape (n_batch) giving the initial position passed to the
                out_to_in function
            start_out: a theano tensor of shape (n_batch, X) giving the initial "output" passed
                to the out_to_in_fn
            sample_fn: a function with signature
                sample_fn(out_activations, rel_pos) -> new_out, new_rel_pos
            out_to_in_fn: a function with signature
                out_to_in_fn(rel_pos, last_out, **cur_kwargs) -> addtl_kwargs
            deterministic_dropout: If True, no dropout masks are sampled.

        Raises:
            NotImplementedError: always.
        """
        # The unreachable draft that used to follow this raise did not match
        # the sample_scan_routine protocol (it passed sample_fn to
        # prepare_sample_scan and sent the wrong values into the generator),
        # so it was removed rather than left as dead code.
        raise NotImplementedError(
            "do_sample_scan is not implemented; use prepare_sample_scan, "
            "sample_scan_routine and extract_sample_scan_results")