def build_theano_functions(self): x = T.fmatrix('time_sequence') x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim)) y = x[:,1:self.sequence_dim,:] x = x[:,:self.sequence_dim-1,:] # if we try to include the spectrogram features spec_dims = 0 if self.image_size is not None : print "Convolution activated" self.init_conv() spec = T.ftensor4('spectrogram') spec_features, spec_dims = self.conv.build_conv_layers(spec) print "Conv final dims =", spec_dims spec_dims = np.prod(spec_dims) spec_features = spec_features.reshape( (self.batch_dim, self.sequence_dim-1, spec_dims)) x = T.concatenate([x, spec_features], axis=2) layers_input = [x] dims =np.array([self.time_dim + spec_dims]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X sequence X time lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = 
output_transform.apply(T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))), (self.batch_dim, (self.sequence_dim-1), self.gmm_dim)) sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6 mus = y_hat[:,:,self.gmm_dim*2:] pis = pis[:,:,:,np.newaxis] mus = mus[:,:,:,np.newaxis] sig = sig[:,:,:,np.newaxis] y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5*((y-mus)**2)/sig**2 coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean() LL.name = "summed_likelihood" model = Model(LL) self.model = model parameters = model.parameters algorithm = GradientDescent( cost=LL, parameters=model.parameters, step_rule=Adam()) f = theano.function([x],[pis, sig, mus]) return algorithm, f
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor3('y') z = T.ftensor3('z') layers_input = [x] dims = np.array([self.input_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, #weights_init=Uniform(mean=data_mean, std=1), weights_init=IsotropicGaussian(mean=1., std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value(Orthogonal().generate( np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=IsotropicGaussian(mean=0., std=1), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05 mus = y_hat[:, :, self.output_dim / 2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5 * ((y - mus)**2) / sig**2 expo = T.exp(inside_expo) coeff = 1. / (T.sqrt(2. 
* np.pi) * sig) inside_log = T.log(coeff * expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log( T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() #zinside_expo = -0.5*((z-mus)**2)/sig**2 #zexpo = T.exp(zinside_expo) #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig)) #zinside_log = (zcoeff*zexpo).sum(axis=2) #zLL = -(T.log(zinside_log)).sum() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [sig, mus]) return gradf, f
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor4('y') layers_input = [x] dims = np.array([self.time_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale * Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=0., std=1), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor #pis = T.reshape( # T.nnet.softmax( # T.nnet.sigmoid( # T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))), # (self.batch_dim, self.time_dim, self.gmm_dim)) pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:, :, :self.gmm_dim], (self.sequence_dim * self.batch_dim, 
self.gmm_dim))), (self.batch_dim, self.sequence_dim, self.gmm_dim)) sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6 #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1 #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:]) mus = y_hat[:, :, self.gmm_dim * 2:] pis = pis[:, :, :, np.newaxis] mus = mus[:, :, :, np.newaxis] sig = sig[:, :, :, np.newaxis] #y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5 * ((y - mus)**2) / sig**2 coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum( sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum( T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True)) ).mean() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, pis, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [pis, sig, mus]) return gradf, f
def build_theano_functions(self): x = T.fmatrix('time_sequence') x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim)) y = x[:, 1:self.sequence_dim, :] x = x[:, :self.sequence_dim - 1, :] # if we try to include the spectrogram features spec_dims = 0 if self.image_size is not None: print "Convolution activated" self.init_conv() spec = T.ftensor4('spectrogram') spec_features, spec_dims = self.conv.build_conv_layers(spec) print "Conv final dims =", spec_dims spec_dims = np.prod(spec_dims) spec_features = spec_features.reshape( (self.batch_dim, self.sequence_dim - 1, spec_dims)) x = T.concatenate([x, spec_features], axis=2) layers_input = [x] dims = np.array([self.time_dim + spec_dims]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer + 1] * 4, weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X sequence X time lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale * Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = 
output_transform.apply( T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor pis = T.reshape( T.nnet.softmax( T.reshape( y_hat[:, :, :self.gmm_dim], ((self.sequence_dim - 1) * self.batch_dim, self.gmm_dim))), (self.batch_dim, (self.sequence_dim - 1), self.gmm_dim)) sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6 mus = y_hat[:, :, self.gmm_dim * 2:] pis = pis[:, :, :, np.newaxis] mus = mus[:, :, :, np.newaxis] sig = sig[:, :, :, np.newaxis] y = y[:, :, np.newaxis, :] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5 * ((y - mus)**2) / sig**2 coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum( sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum( T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True)) ).mean() LL.name = "summed_likelihood" model = Model(LL) self.model = model parameters = model.parameters algorithm = GradientDescent(cost=LL, parameters=model.parameters, step_rule=Adam()) f = theano.function([x], [pis, sig, mus]) return algorithm, f
# Smoke-test fragment for CoreNetwork.
# NOTE(review): x, y, n_steps and batch_size are not defined in this
# fragment; they must come from code outside the visible source.
input_dim = 6
output_dim = 8
n_classes = 10

# Random inputs of shape (n_steps, batch_size, input_dim) and random integer
# labels in [0, n_classes) of shape (batch_size,), keyed by the symbolic
# variables they are meant to feed.
test_data = {x: np.random.normal(size=(n_steps, batch_size, input_dim)
                                 ).astype(np.float32),
             y: np.random.randint(n_classes, size=(batch_size, )
                                  ).astype(np.int32)}

# Initialisation shared by every brick below.
inits = {
    'weights_init': IsotropicGaussian(0.1),
    'biases_init': Constant(0.),
}

core = CoreNetwork(input_dim=input_dim, dim=output_dim, **inits)
core.initialize()
# The projection consumes state and cell concatenated, hence output_dim*2.
proj = Linear(input_dim=output_dim*2, output_dim=n_classes, **inits)
proj.initialize()
out = Softmax()

state, cell = core.apply(x)
a = T.concatenate([state, cell], axis=2)
# Merge the two leading axes so the projection sees a matrix.
a = a.reshape((a.shape[0]*a.shape[1], a.shape[2]))
a = proj.apply(a)
prop = out.apply(a).reshape((n_steps, batch_size, n_classes))
# Output at the last index of the leading (n_steps) axis.
pred = prop[-1]
prop = prop.reshape((n_steps * batch_size, n_classes))
# Evaluates the graph on the random inputs and prints the result.
print prop.eval({x: test_data[x]})
# Labels repeated once per step and laid out as a column.
y_reat = T.repeat(y[None, :], n_steps, axis=0).reshape(
    (n_steps * batch_size, 1))
def build_theano_functions(self) : x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor4('y') layers_input = [x] dims =np.array([self.time_dim]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=0., std=1), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor #pis = T.reshape( # T.nnet.softmax( # T.nnet.sigmoid( # T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))), # (self.batch_dim, self.time_dim, self.gmm_dim)) pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:,:,:self.gmm_dim], (self.sequence_dim*self.batch_dim, self.gmm_dim))), 
(self.batch_dim, self.sequence_dim, self.gmm_dim)) sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6 #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1 #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:]) mus = y_hat[:,:,self.gmm_dim*2:] pis = pis[:,:,:,np.newaxis] mus = mus[:,:,:,np.newaxis] sig = sig[:,:,:,np.newaxis] #y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5*((y-mus)**2)/sig**2 coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)) : #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug : gradf = theano.function([x, y, lr],[LL, pis, mus, sig],updates=updates) else : #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr],[LL],updates=updates) f = theano.function([x],[pis, sig, mus]) return gradf, f
def build_theano_functions(self) : x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor3('y') z = T.ftensor3('z') layers_input = [x] dims =np.array([self.input_dim]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, #weights_init=Uniform(mean=data_mean, std=1), weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value(Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=IsotropicGaussian(mean=0., std=1), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05 mus = y_hat[:,:,self.output_dim/2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5*((y-mus)**2)/sig**2 expo = T.exp(inside_expo) coeff = 1./(T.sqrt(2.*np.pi)*sig) inside_log = T.log(coeff*expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + 
T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() #zinside_expo = -0.5*((z-mus)**2)/sig**2 #zexpo = T.exp(zinside_expo) #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig)) #zinside_log = (zcoeff*zexpo).sum(axis=2) #zLL = -(T.log(zinside_log)).sum() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)) : #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug : gradf = theano.function([x, y, lr],[LL, mus, sig],updates=updates) else : #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr],[LL],updates=updates) f = theano.function([x],[sig, mus]) return gradf, f
def build_theano_functions(self, data_mean, data_std) :
    """Build a single-layer LSTM + GMM graph (currently in a debug state).

    NOTE(review): as written, the function returns right after compiling
    ``test_expo``; every statement after ``return test_expo`` (cost,
    gradients, ``gradf`` and ``f``) is unreachable and ``self.model`` is
    never set.  Confirm whether the early return is a debugging leftover.

    Args:
        data_mean: passed as ``mean`` to the ``Uniform`` weight initialiser
            of the four input transforms and used as their constant bias.
        data_std: passed as ``std`` to the same ``Uniform`` initialiser.

    Returns:
        A compiled function mapping ``(x, y)`` to ``[expo, mus, sig]``.
    """
    x = T.ftensor3('x') # shape of input : batch X time X value
    y = T.ftensor3('y')

    # before the cell, input, forget and output gates, x needs to
    # be transformed
    # One Linear brick per LSTM input ('c', 'i', 'f', 'o'), all mapping
    # input_dim -> lstm_dim.
    linear_transforms = []
    for transform in ['c','i','f','o'] :
        linear_transforms.append(
            Linear(self.input_dim,
                   self.lstm_dim,
                   weights_init=Uniform(mean=data_mean, std=data_std),
                   #weights_init=IsotropicGaussian(mean=1.,std=1),
                   biases_init=Constant(data_mean),
                   name=transform+"_transform")
        )

    for transform in linear_transforms :
        transform.initialize()

    linear_applications = []
    for transform in linear_transforms :
        linear_applications.append(
            transform.apply(x))

    # The four transformed inputs are stacked along axis 2 in the order
    # c, i, f, o.
    lstm_input = T.concatenate(linear_applications, axis=2)

    # the lstm wants batch X time X value
    lstm = LSTM(
        dim=self.lstm_dim,
        weights_init=IsotropicGaussian(mean=0.5,std=1),
        biases_init=Constant(1))
    lstm.initialize()
    h, _dummy = lstm.apply(lstm_input)

    # this is where Alex Graves' paper starts
    output_transform = Linear(self.lstm_dim,
                              self.output_dim,
                              #weights_init=Uniform(mean=data_mean, std=data_std),
                              weights_init=IsotropicGaussian(mean=0., std=1),
                              biases_init=Constant(1),
                              name="output_transform")
    output_transform.initialize()
    y_hat = output_transform.apply(h)

    # transforms to find each gmm params (mu, pi, sig)
    #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
    # small hack to softmax a 3D tensor
    # (flatten the two leading axes, softmax the rows, reshape back;
    #  assumes y_hat's leading axes are batch_dim and time_dim - TODO confirm)
    pis = T.reshape(
                T.nnet.softmax(
                    T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))
                , (self.batch_dim, self.time_dim, self.gmm_dim))
    #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
    # relu plus an offset keeps the standard deviations at or above 0.1
    sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
    mus = y_hat[:,:,self.gmm_dim*2:]

    # Add a trailing axis to the gmm params and a third axis to y so the
    # difference y - mus broadcasts to four axes.
    pis = pis[:,:,:,np.newaxis]
    mus = mus[:,:,:,np.newaxis]
    sig = sig[:,:,:,np.newaxis]
    y = y[:,:,np.newaxis,:]

    #sig=theano.printing.Print()(sig)

    # sum likelihood with targets
    # sum inside log accross mixtures, sum outside log accross time
    #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()
    expo = T.exp(-0.5*((y-mus)**2)/sig**2)
    # Debug helper exposing the exponential term and the gmm params.
    test_expo = theano.function([x,y],[expo, mus, sig])
    # NOTE(review): early return - nothing below this line is executed.
    return test_expo
    coeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
    inside_log = (coeff*expo).sum(axis=2)
    LL = -(T.log(inside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters

    # Plain gradient descent with the fixed learning rate self.lr.
    grads = T.grad(LL, parameters)
    updates = []
    for i in range(len(grads)) :
        updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    gradf = theano.function([x, y],[LL],updates=updates)
    f = theano.function([x],[pis, sig, mus])

    return gradf, f
def build_theano_functions(self, data_mean, data_std):
    """Build a single-layer LSTM + GMM graph (currently in a debug state).

    NOTE(review): as written, the function returns right after compiling
    ``test_expo``; every statement after ``return test_expo`` (cost,
    gradients, ``gradf`` and ``f``) is unreachable and ``self.model`` is
    never set.  Confirm whether the early return is a debugging leftover.

    Args:
        data_mean: passed as ``mean`` to the ``Uniform`` weight initialiser
            of the four input transforms and used as their constant bias.
        data_std: passed as ``std`` to the same ``Uniform`` initialiser.

    Returns:
        A compiled function mapping ``(x, y)`` to ``[expo, mus, sig]``.
    """
    x = T.ftensor3('x')  # shape of input : batch X time X value
    y = T.ftensor3('y')

    # before the cell, input, forget and output gates, x needs to
    # be transformed
    # One Linear brick per LSTM input ('c', 'i', 'f', 'o'), all mapping
    # input_dim -> lstm_dim.
    linear_transforms = []
    for transform in ['c', 'i', 'f', 'o']:
        linear_transforms.append(
            Linear(
                self.input_dim,
                self.lstm_dim,
                weights_init=Uniform(mean=data_mean, std=data_std),
                #weights_init=IsotropicGaussian(mean=1.,std=1),
                biases_init=Constant(data_mean),
                name=transform + "_transform"))

    for transform in linear_transforms:
        transform.initialize()

    linear_applications = []
    for transform in linear_transforms:
        linear_applications.append(transform.apply(x))

    # The four transformed inputs are stacked along axis 2 in the order
    # c, i, f, o.
    lstm_input = T.concatenate(linear_applications, axis=2)

    # the lstm wants batch X time X value
    lstm = LSTM(dim=self.lstm_dim,
                weights_init=IsotropicGaussian(mean=0.5, std=1),
                biases_init=Constant(1))
    lstm.initialize()
    h, _dummy = lstm.apply(lstm_input)

    # this is where Alex Graves' paper starts
    output_transform = Linear(
        self.lstm_dim,
        self.output_dim,
        #weights_init=Uniform(mean=data_mean, std=data_std),
        weights_init=IsotropicGaussian(mean=0., std=1),
        biases_init=Constant(1),
        name="output_transform")
    output_transform.initialize()
    y_hat = output_transform.apply(h)

    # transforms to find each gmm params (mu, pi, sig)
    #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
    # small hack to softmax a 3D tensor
    # (flatten the two leading axes, softmax the rows, reshape back;
    #  assumes y_hat's leading axes are batch_dim and time_dim - TODO confirm)
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:, :, 0:self.gmm_dim],
                      (self.time_dim * self.batch_dim, self.gmm_dim))),
        (self.batch_dim, self.time_dim, self.gmm_dim))
    #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
    # relu plus an offset keeps the standard deviations at or above 0.1
    sig = T.nnet.relu(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 0.1
    mus = y_hat[:, :, self.gmm_dim * 2:]

    # Add a trailing axis to the gmm params and a third axis to y so the
    # difference y - mus broadcasts to four axes.
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    y = y[:, :, np.newaxis, :]

    #sig=theano.printing.Print()(sig)

    # sum likelihood with targets
    # sum inside log accross mixtures, sum outside log accross time
    #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()
    expo = T.exp(-0.5 * ((y - mus)**2) / sig**2)
    # Debug helper exposing the exponential term and the gmm params.
    test_expo = theano.function([x, y], [expo, mus, sig])
    # NOTE(review): early return - nothing below this line is executed.
    return test_expo
    coeff = pis * (1. / (T.sqrt(2. * np.pi) * sig))
    inside_log = (coeff * expo).sum(axis=2)
    LL = -(T.log(inside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters

    # Plain gradient descent with the fixed learning rate self.lr.
    grads = T.grad(LL, parameters)
    updates = []
    for i in range(len(grads)):
        updates.append(
            tuple([parameters[i], parameters[i] - self.lr * grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    gradf = theano.function([x, y], [LL], updates=updates)
    f = theano.function([x], [pis, sig, mus])

    return gradf, f
def build_theano_functions(self) : # shape of theano inpu is time+1 X features x = T.fmatrix('frequency_sequence') x = x.reshape((self.batch_dim, self.time_dim+1, self.input_dim)) y = x[:,1:self.time_dim+1,:] x = x[:,:self.time_dim,:] layers_input = [x] dims =np.array([self.input_dim]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, #weights_init=IsotropicGaussian(mean=0., std=1), weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05 mus = y_hat[:,:,self.output_dim/2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5*((y-mus)**2)/sig**2 expo = 
T.exp(inside_expo) coeff = 1./(T.sqrt(2.*np.pi)*sig) inside_log = T.log(coeff*expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() LL.name = "summed_likelihood" model = Model(LL) self.model = model algorithm = GradientDescent( cost=LL, parameters=model.parameters, step_rule=AdaGrad()) f = theano.function([x],[sig, mus]) return algorithm, f