def get_output_for(self, input, deterministic=False, **kwargs):
    """
    Apply dropout to ``input``; both dense and sparse inputs are handled.

    Parameters
    ----------
    input : tensor
        output from the previous layer
    deterministic : bool
        If true, dropout and scaling are disabled and ``input`` is
        returned unchanged.
    **kwargs
        Accepted for interface compatibility; not used here.

    Returns
    -------
    ``input`` itself when ``deterministic`` is true or ``self.p == 0``.
    Otherwise ``input`` (divided by the retain probability ``1 - self.p``
    when ``self.rescale`` is set) multiplied by a binomial mask drawn
    from ``self._srng``.
    """
    if deterministic or self.p == 0:
        return input

    # Using theano constant to prevent upcasting
    one = T.constant(1)
    retain_prob = one - self.p

    if self.rescale:
        # According to pull-request 595 from eduardo4jesus: a sparse input
        # needs an explicit sparse multiplication instead of a division.
        # isinstance (rather than an exact type comparison) also catches
        # sparse constants, sparse shared variables and subclasses.
        sparse_types = (S.SparseVariable, S.SparseConstant,
                        S.sharedvar.SparseTensorSharedVariable)
        if isinstance(input, sparse_types):
            # Imported here because this is the only branch that needs it.
            from .. import utils
            input = S.mul(input, utils.floatX(1.) / retain_prob)
        else:
            # Rebind instead of using an augmented assignment so that the
            # caller's object is never modified in place.
            input = input / retain_prob

    # use nonsymbolic shape for dropout mask if possible
    input_shape = self.input_shape
    if any(s is None for s in input_shape):
        input_shape = input.shape

    return input * self._srng.binomial(input_shape, p=retain_prob,
                                       dtype=input.dtype)
def _get_diagonal_term(self, X_left, X_right, diag_init):
    """Build the diagonal term ``dot(mul(X_left, X_right), diag)``.

    A new shared variable named ``'diag'`` is created from ``diag_init``.
    When either input is sparse (as reported by ``_tn_is_sparse``) the
    sparse ``mul``/``dot`` implementations are used, otherwise the dense
    ones.

    Returns a tuple ``(y_pred, [diag])``: the symbolic prediction and the
    list holding the newly created shared variable.
    """
    diagonal = tn.shared(value=diag_init, name='diag')

    # Choose the op namespace once; both expose ``mul`` and ``dot``.
    any_sparse = _tn_is_sparse(X_left) or _tn_is_sparse(X_right)
    ops = tsp if any_sparse else T

    product = ops.mul(X_left, X_right)
    return ops.dot(product, diagonal), [diagonal]
def _comiple_message_node_(self, _node, _factor): ''' Pseudocode: (treat _node as X and _factor as L ) if X is the input variable (global) then return u_c , the input else generate a new variable name v_x collect neighbouring L_i of X excluding L for [L_1, L_2 .. L_i ], do v_i = compile_message(L_i -> X) emit(v_x = v1 dot v2 ... dot vi) return v_x ''' if _node == self.head_predicate.i: #This is the input variable. if _node.u is None: print _node # raw_input("Node has nothing") return _node.u #This is NOT the input variable. neighbors = self._get_neighbours( _node, _exclude=_factor) #Will be a list of factors. #Send the neighbour + current node to compilemessage_factor and collect what they have to say. neighboring_values = [ self._compile_message_factor_(_factor=factor, _node=_node) for factor in neighbors ] if len(neighboring_values) > 0: v_x = neighboring_values[0] for remaining_values in neighboring_values[1:]: v_x = sparse.mul(v_x, remaining_values) # v_x = v_x * remaining_values else: #In this case, since there are no neighbors, there's literally nothing to return. #@TODO: What do we do here # print "belief_propagation:Graph:compile_message: Part where there are no neighbours!" pass return v_x
def get_output_for(self, input, deterministic=False, **kwargs):
    """Apply dropout to a sparse ``input``.

    Raises ValueError when ``input`` is not a sparse variable, sparse
    constant or sparse shared variable.

    Returns ``input`` unchanged when ``deterministic`` is true or
    ``self.p == 0``. Otherwise returns ``input`` (scaled by
    ``1 / (1 - self.p)`` when ``self.rescale`` is set) multiplied by a
    binomial mask drawn from ``self._srng``.
    """
    accepted_types = (
        S.SparseVariable,
        S.SparseConstant,
        S.sharedvar.SparseTensorSharedVariable,
    )
    if not isinstance(input, accepted_types):
        raise ValueError("Input for this layer must be sparse")

    if deterministic or self.p == 0:
        return input

    # Using Theano constant to prevent upcasting
    one = T.constant(1, name='one')
    keep_prob = one - self.p

    scaled = S.mul(input, one / keep_prob) if self.rescale else input

    # Fall back to the symbolic shape only if a static dimension is unknown.
    mask_shape = self.input_shape
    fully_known = all(dim is not None for dim in mask_shape)
    if not fully_known:
        mask_shape = scaled.shape

    mask = self._srng.binomial(mask_shape, p=keep_prob, dtype=scaled.dtype)
    return scaled * mask
def test_upcast(self):
    """Check which dtype combinations sparse ``add``/``mul`` reject or upcast.

    Three copies of the same matrix (float32, int32, int8) are combined
    pairwise, first sparse/sparse and then sparse/dense.
    """
    rows = [[1, 0], [3, 0], [0, 6]]
    arrays = [numpy.array(rows, dtype=name)
              for name in ("float32", "int32", "int8")]

    # Ordered operand pairings, as indices into (float32, int32, int8).
    pairings = [(0, 1), (1, 0), (1, 2), (2, 1), (0, 2), (2, 0)]

    # AddSS and MulSS
    for mtype in _mtypes:
        sparse_vars = [as_sparse_variable(mtype(arr)) for arr in arrays]

        # Ops that do not upcast
        for op in (add, mul):
            for left, right in pairings:
                self.assertRaises(NotImplementedError, op,
                                  sparse_vars[left], sparse_vars[right])

    # AddSD and MulSD
    for mtype in _mtypes:
        sparse_vars = []
        dense_vars = []
        for arr in arrays:
            sparse_vars.append(as_sparse_variable(mtype(arr)))
            dense_vars.append(tensor.as_tensor_variable(arr))
        a_sv, b_sv, c_sv = sparse_vars
        a_dv, b_dv, c_dv = dense_vars

        # add does not upcast
        for left, right in pairings:
            self.assertRaises(NotImplementedError, add,
                              sparse_vars[left], dense_vars[right])

        # mul may upcast the dense input if needed
        policy = config.cast_policy
        wants_float64 = policy in ("custom", "numpy") or (
            policy == "numpy+floatX" and config.floatX == "float64")
        if wants_float64:
            # The result should be a float64 (not implemented).
            self.assertRaises(NotImplementedError, mul, a_sv, b_dv)
        elif policy == "numpy+floatX" and config.floatX == "float32":
            # The result should be a float32.
            assert mul(a_sv, b_dv).dtype == "float32"
        else:
            raise NotImplementedError()

        self.assertRaises(NotImplementedError, mul, b_sv, a_dv)
        assert mul(b_sv, c_dv).dtype == "int32"
        self.assertRaises(NotImplementedError, mul, c_sv, b_dv)
        assert mul(a_sv, c_dv).dtype == "float32"
        self.assertRaises(NotImplementedError, mul, c_sv, a_dv)
def _get_gradients_adagrad(self, J):
    """Get the AdaGrad gradients and squared gradients updates.

    The returned gradients still need to be multiplied with the general
    learning rate.

    Parameters
    ----------
    J : theano variable
        cost

    Returns
    -------
    list of theano variables
        gradients that are adapted by the AdaGrad algorithm, one per entry
        of ``self.updatable_parameters``
    dict
        updated sum of squares for all previous steps, keyed by parameter
        name
    """
    param_names = self.updatable_parameters
    grads = T.grad(J, [self.__dict__[name] for name in param_names])

    for idx, name in enumerate(param_names):
        grads[idx] = debug_print(grads[idx], 'grads_' + name)

    updated_squares = dict()

    # Add squared gradient to the squared gradient matrix for AdaGrad and
    # recalculate the gradient.
    for idx, name in enumerate(param_names):
        grad = grads[idx]

        # Sparse gradient variables need the structured sparse ops.
        if isinstance(grad, sparse.SparseVariable):
            # Square the gradient entries.
            squared = debug_print(sparse.structured_pow(grad, 2.),
                                  'pow_' + name)
            # Remove zeros (might happen when squaring near zero values)
            squared = sparse.remove0(squared)
            updated_squares[name] = (
                self.__dict__['adagrad_matrix_' + name] + squared)

            # Keep only the squares that have a gradient; all other
            # entries do not need to be considered at all.
            touched = sparse.sp_ones_like(squared)
            subset = debug_print(updated_squares[name] * touched,
                                 'adagrad_squares_subset_' + name)
            # Square root of that subset.
            root = debug_print(sparse.sqrt(subset), 'adagrad_sqrt_' + name)
            # 1. / the square root.
            inverse_root = debug_print(sparse.structured_pow(root, -1.),
                                       'adagrad_pow-1_' + name)
            grads[idx] = sparse.mul(grad, inverse_root)
        else:
            squared = debug_print(T.pow(grad, 2.), 'pow_' + name)
            updated_squares[name] = (
                self.__dict__['adagrad_matrix_' + name] + squared)
            total = updated_squares[name]

            # Use the square root only where the sum is non-zero; divide
            # by one everywhere else.
            denominator = T.switch(T.neq(total, 0.0),
                                   T.sqrt(total),
                                   T.ones_like(total, dtype=floatX))
            grads[idx] = T.mul(grad, 1. / denominator)

        updated_squares[name] = debug_print(updated_squares[name],
                                            'upd_squares_' + name)

    for idx, name in enumerate(param_names):
        grads[idx] = debug_print(grads[idx], 'grads_updated_' + name)

    return grads, updated_squares
def __init__(self, feature_count, classifier=False, k=8, stdev=0.1, sparse=False):
    """Build the factorization machine graph and compile its functions.

    Parameters
    ----------
    feature_count : int
        Number of input features; length of ``w1`` and number of columns
        of ``v``.
    classifier : bool
        When true, a sigmoid is applied to the prediction and binary
        cross-entropy is used as the error; otherwise squared error.
    k : int
        Number of rows of the interaction factor matrix ``v``.
    stdev : float
        Scale applied to the standard-normal initial values of ``v``.
    sparse : bool
        When true, the input is a float32 CSR sparse matrix and the sparse
        ops are used; otherwise a dense matrix.

    Sets ``self.w0``, ``self.w1``, ``self.v`` (shared parameters) and the
    compiled ``self.theano_train``, ``self.theano_cost`` and
    ``self.theano_predict`` functions.
    """
    self.classifier = classifier
    d = feature_count

    # *** Symbolic variables ***
    X = S.csr_matrix(name='inputs', dtype='float32') if sparse else T.matrix()
    y = T.vector()
    beta_w1 = T.scalar()
    beta_v = T.scalar()

    # *** Model parameters ***
    # bias term (intercept)
    self.w0 = theano.shared(np.zeros(1), allow_downcast=True)
    # first order coefficients
    self.w1 = theano.shared(np.zeros(d), allow_downcast=True)
    # interaction factors
    self.v = theano.shared(stdev * np.random.randn(k, d), allow_downcast=True)

    # *** The Model ***
    # The formula for pairwise interactions is from the bottom left
    # of page 997 of Rendle 2010, "Factorization Machines."
    # This version scales linearly in k and d, as opposed to O(d^2).
    dot = S.dot if sparse else T.dot
    square_of_sums = dot(X, T.transpose(self.v)) ** 2
    X_squared = S.mul(X, X) if sparse else X ** 2
    sum_of_squares = dot(X_squared, T.transpose(self.v ** 2))
    interactions = 0.5 * T.sum(square_of_sums - sum_of_squares, axis=1)
    y_hat = T.addbroadcast(self.w0, 0) + dot(X, self.w1) + interactions

    # *** Loss Function ***
    if self.classifier:
        y_hat = T.nnet.sigmoid(y_hat)
        error = T.mean(T.nnet.binary_crossentropy(y_hat, y))
    else:
        error = T.mean((y - y_hat)**2)

    # regularization
    L2 = beta_w1 * T.mean(self.w1**2) + beta_v * T.mean(self.v**2)
    loss = error + L2

    # *** Learning ***
    params = [self.w0, self.w1, self.v]
    grads = T.grad(cost=loss, wrt=params)

    # RMSProp
    lr, rho, epsilon = 0.001, 0.9, 1e-6
    updates = []
    for param, grad in zip(params, grads):
        # Running average of squared gradients, starting at zero.
        acc = theano.shared(param.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * grad**2
        scaled_grad = grad / T.sqrt(acc_new + epsilon)
        updates.extend([(acc, acc_new), (param, param - lr * scaled_grad)])

    training_inputs = [X, y, beta_w1, beta_v]
    self.theano_train = theano.function(
        inputs=list(training_inputs), outputs=loss,
        updates=updates, allow_input_downcast=True)

    self.theano_cost = theano.function(
        inputs=list(training_inputs), outputs=loss,
        allow_input_downcast=True)

    # *** Prediction ***
    self.theano_predict = theano.function(
        inputs=[X], outputs=y_hat, allow_input_downcast=True)
def propagate_thy_beliefs(self):
    '''
    Compile the belief-propagation expression of this graph into a Theano
    function.

    Pseudocode:
        -> Call the node-message compiler from the 'o' node of the head
           predicate and let the node/factor compilers recursively call
           each other.
        -> Apply a softmax over the resulting expression.
        -> Build a cross-entropy cost against the label matrix 'y' and
           take one gradient step (fixed step size 0.1) on every relation
           matrix.

    Returns a tuple ``(function, symbols)``:
        function : compiled Theano function. Its inputs are the head
            predicate's input variable, the sparse label matrix ``y`` and
            then every relation matrix; its outputs are the softmaxed BP
            expression followed by the updated relation matrices.
        symbols : whatever ``_comiple_message_symbols_node_`` returned; the
            ``.M`` attribute of each entry is used as a parameter.
    '''
    equation = self._comiple_message_node_(self.head_predicate.o,
                                           "Fictional Label")
    symbols = self._comiple_message_symbols_node_(self.head_predicate.o,
                                                  "Fictional Label")

    # Sparse matrix used as the 'y' label (which will later contain n-hot
    # information about desired entities).
    y = sparse.csr_dmatrix('y')

    # Do a softmax over the final BP equation.
    equation = sparse.structured_exp(equation)
    equation = sparse.row_scale(equation,
                                1.0 / sparse.sp_sum(equation, axis=1))

    # Collect all the parameters (shared vars) found in the factors of
    # this graph: a list of (relation) matrices.
    parameters = [x.M for x in symbols]

    # Unregularized cross-entropy loss:
    #   loss = - y * log(equation) + (y - 1) * log(1 - equation)
    a = sparse.mul(y, sparse.structured_log(equation))
    # The complement must be 1 - equation. The previous equation - 1 is
    # never positive for softmax outputs, so its log was NaN or -inf.
    complement = sparse.structured_add(-equation, 1.0)
    b = sparse.mul(sparse.structured_add(y, -1.0),
                   sparse.structured_log(complement))
    loss = sparse.sub(b, a)
    loss_dense = sparse.dense_from_sparse(loss)
    cost = loss_dense.mean()

    gradients = theano.grad(cost, parameters)

    # One plain gradient-descent step per relation matrix.
    updated_matrices = [
        sparse.sub(parameter, 0.1 * gradient)
        for parameter, gradient in zip(parameters, gradients)
    ]

    function = theano.function(
        # The head predicate's symbolic variable, the label matrix and
        # the current relation matrices.
        inputs=[self.head_predicate.i.u, y] + parameters,
        # The BP output expression followed by the updated matrices.
        outputs=[equation] + updated_matrices
    )

    return function, symbols
def _get_gradients_adagrad(self, J):
    """Compute AdaGrad-adapted gradients and the new squared-gradient sums.

    The gradients returned here are not yet scaled by the general learning
    rate; the caller has to do that.

    Parameters
    ----------
    J : theano variable
        cost

    Returns
    -------
    list of theano variables
        gradients of ``J`` with respect to every entry of
        ``self.updatable_parameters``, adapted by the AdaGrad algorithm
    dict
        parameter name -> updated sum of squared gradients
    """
    names = self.updatable_parameters
    wrt = [self.__dict__[name] for name in names]
    grads = T.grad(J, wrt)

    for position, name in enumerate(names):
        grads[position] = debug_print(grads[position], 'grads_' + name)

    updated_squares = dict()

    def history_of(name):
        # Accumulated squared gradients stored on the instance.
        return self.__dict__['adagrad_matrix_' + name]

    for position, name in enumerate(names):
        gradient = grads[position]
        is_sparse = isinstance(gradient, sparse.SparseVariable)

        if not is_sparse:
            power = debug_print(T.pow(gradient, 2.), 'pow_' + name)
            updated_squares[name] = history_of(name) + power
            # Take the square root only for non-zero items; zero items
            # get a denominator of one.
            nonzero = T.neq(updated_squares[name], 0.0)
            denominator = T.switch(
                nonzero,
                T.sqrt(updated_squares[name]),
                T.ones_like(updated_squares[name], dtype=floatX))
            grads[position] = T.mul(gradient, 1. / denominator)
        else:
            # Sparse gradients go through the structured sparse ops.
            power = debug_print(sparse.structured_pow(gradient, 2.),
                                'pow_' + name)
            # Remove zeros (might happen when squaring near zero values)
            power = sparse.remove0(power)
            updated_squares[name] = history_of(name) + power

            # Restrict the sums to entries that actually have a gradient,
            # then compute 1 / sqrt(.) on that subset.
            selector = sparse.sp_ones_like(power)
            scale = debug_print(updated_squares[name] * selector,
                                'adagrad_squares_subset_' + name)
            scale = debug_print(sparse.sqrt(scale), 'adagrad_sqrt_' + name)
            scale = debug_print(sparse.structured_pow(scale, -1.),
                                'adagrad_pow-1_' + name)
            grads[position] = sparse.mul(gradient, scale)

        updated_squares[name] = debug_print(updated_squares[name],
                                            'upd_squares_' + name)

    for position, name in enumerate(names):
        grads[position] = debug_print(grads[position],
                                      'grads_updated_' + name)

    return grads, updated_squares
def test_upcast(self):
    """Verify dtype handling of sparse ``add`` and ``mul``.

    The same matrix is built as float32, int32 and int8 and combined in
    sparse/sparse and sparse/dense pairs with mismatching dtypes.
    """
    def rejects(op, lhs, rhs):
        # The combination must be refused with NotImplementedError.
        self.assertRaises(NotImplementedError, op, lhs, rhs)

    values = [[1, 0], [3, 0], [0, 6]]
    as_float32 = numpy.array(values, dtype='float32')
    as_int32 = numpy.array(values, dtype='int32')
    as_int8 = numpy.array(values, dtype='int8')

    # AddSS and MulSS
    for mtype in _mtypes:
        f32 = as_sparse_variable(mtype(as_float32))
        i32 = as_sparse_variable(mtype(as_int32))
        i8 = as_sparse_variable(mtype(as_int8))

        # Ops that do not upcast
        for op in (add, mul):
            rejects(op, f32, i32)
            rejects(op, i32, f32)
            rejects(op, i32, i8)
            rejects(op, i8, i32)
            rejects(op, f32, i8)
            rejects(op, i8, f32)

    # AddSD and MulSD
    for mtype in _mtypes:
        f32_sparse = as_sparse_variable(mtype(as_float32))
        f32_dense = tensor.as_tensor_variable(as_float32)
        i32_sparse = as_sparse_variable(mtype(as_int32))
        i32_dense = tensor.as_tensor_variable(as_int32)
        i8_sparse = as_sparse_variable(mtype(as_int8))
        i8_dense = tensor.as_tensor_variable(as_int8)

        # add does not upcast
        rejects(add, f32_sparse, i32_dense)
        rejects(add, i32_sparse, f32_dense)
        rejects(add, i32_sparse, i8_dense)
        rejects(add, i8_sparse, i32_dense)
        rejects(add, f32_sparse, i8_dense)
        rejects(add, i8_sparse, f32_dense)

        # mul may upcast the dense input if needed
        floatx_policy = config.cast_policy == 'numpy+floatX'
        if (config.cast_policy in ('custom', 'numpy') or
                (floatx_policy and config.floatX == 'float64')):
            # The result should be a float64 (not implemented).
            rejects(mul, f32_sparse, i32_dense)
        elif floatx_policy and config.floatX == 'float32':
            # The result should be a float32.
            assert mul(f32_sparse, i32_dense).dtype == 'float32'
        else:
            raise NotImplementedError()

        rejects(mul, i32_sparse, f32_dense)
        assert mul(i32_sparse, i8_dense).dtype == 'int32'
        rejects(mul, i8_sparse, i32_dense)
        assert mul(f32_sparse, i8_dense).dtype == 'float32'
        rejects(mul, i8_sparse, f32_dense)