def tensor_to_cuda(x):
    if isinstance(x.type, tensor.TensorType) and x.type.dtype == "float32":
        y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
        if x.name:
            y.name = x.name + "[cuda]"
        return y
    else:
        return x

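# A minimal usage sketch, assuming tensor_to_cuda is importable alongside
# theano.tensor and theano.sandbox.cuda (the imports below are assumptions, not
# part of the original module). float32 TensorType variables are wrapped in an
# equivalent CudaNdarrayType variable; everything else is returned unchanged.
from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType

x = tensor.fmatrix('x')   # float32 matrix -> mapped to a CUDA variable
i = tensor.ivector('i')   # int32 vector  -> returned as-is

x_gpu = tensor_to_cuda(x)
assert isinstance(x_gpu.type, CudaNdarrayType)
assert x_gpu.name == 'x[cuda]'
assert tensor_to_cuda(i) is i
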
def make_node(self,
              # model parameters and bookkeeping variables
              V, UT, Uinv, QT, omega, w_bar,
              # minibatch value inputs
              HT, KindexesT):
    """outputs will be: AT, q, s, work_d, work_m"""

    # The following are supposed to reside on the GPU
    V = as_cuda_ndarray_variable(V)
    UT = as_cuda_ndarray_variable(UT)
    Uinv = as_cuda_ndarray_variable(Uinv)
    QT = as_cuda_ndarray_variable(QT)
    omega = as_cuda_ndarray_variable(omega)
    w_bar = as_cuda_ndarray_variable(w_bar)
    HT = as_cuda_ndarray_variable(HT)

    # This one stays on the CPU
    KindexesT = as_tensor_variable(KindexesT)

    # List of op parameters
    params = [V, UT, Uinv, QT, omega, w_bar, HT, KindexesT]

    # make sure parameters are either all of dtype float32 or all of dtype
    # float64 (except for Kindexes which are integers)
    elem_type = V.dtype
    if elem_type != "float32" and elem_type != "float64":
        raise TypeError(
            "GpuFactoredSphericalOp parameter V must have dtype of float32 or float64")

    check_tensor_variables_ndim_and_dtype(2, elem_type,
                                          ["V", "UT", "Uinv", "QT", "HT"], locals())
    check_tensor_variables_ndim_and_dtype(1, elem_type, ["omega", "w_bar"], locals())
    check_tensor_variables_ndim_and_dtype(2, "int32", ["KindexesT"], locals())

    # Now properly set up outputs to compute: AT, q, s
    outputs = [
        CudaNdarrayType(broadcastable=(False, False))(),  # AT
        CudaNdarrayType(broadcastable=(False,))(),        # q
        CudaNdarrayType(broadcastable=(False,))(),        # s
        CudaNdarrayType(broadcastable=(False, False))(),  # work_d
        CudaNdarrayType(broadcastable=(False, False))(),  # work_m
    ]

    return Apply(self, params, outputs)

def cuda_shared_constructor(value, name=None, strict=False,
                            allow_downcast=None, borrow=False,
                            broadcastable=None):
    """SharedVariable Constructor for CudaNdarrayType."""

    # THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD.
    # SO INT shared vars, float64 shared vars, etc. all end up on the card.
    # THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
    # SEE float32_shared_constructor

    # TODO: what should strict mean in this context, since we always have to make a copy?
    if strict:
        _value = value
    else:
        _value = theano._asarray(value, dtype='float32')

    if not isinstance(_value, numpy.ndarray):
        raise TypeError('ndarray required')
    if _value.dtype.num != CudaNdarrayType.typenum:
        raise TypeError('float32 ndarray required')

    if broadcastable is None:
        broadcastable = (False,) * len(value.shape)
    type = CudaNdarrayType(broadcastable=broadcastable)
    print("trying to return?")
    try:
        rval = CudaNdarraySharedVariable(type=type, value=_value,
                                         name=name, strict=strict)
    except Exception as e:
        print("ERROR", e)
        raise
    return rval

def may_share_memory(a, b, raise_other_type=True):
    a_ndarray = isinstance(a, np.ndarray)
    b_ndarray = isinstance(b, np.ndarray)
    if a_ndarray and b_ndarray:
        return TensorType.may_share_memory(a, b)

    a_cuda = _is_cuda(a)
    b_cuda = _is_cuda(b)
    if a_cuda and b_cuda:
        return CudaNdarrayType.may_share_memory(a, b)

    a_gpua = _is_gpua(a)
    b_gpua = _is_gpua(b)
    if a_gpua and b_gpua:
        return gpuarray.pygpu.gpuarray.may_share_memory(a, b)

    a_sparse = _is_sparse(a)
    b_sparse = _is_sparse(b)
    if (not (a_ndarray or a_sparse or a_cuda or a_gpua) or
            not (b_ndarray or b_sparse or b_cuda or b_gpua)):
        if raise_other_type:
            raise TypeError("may_share_memory support only ndarray"
                            " and scipy.sparse, CudaNdarray or GpuArray type")
        return False

    if a_cuda or b_cuda or a_gpua or b_gpua:
        return False
    return SparseType.may_share_memory(a, b)

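# A minimal usage sketch for the CPU branches above, assuming numpy and
# scipy.sparse are available; the CUDA/GpuArray branches behave the same way
# for pairs of device arrays.
import numpy as np
import scipy.sparse as sp

a = np.zeros((4, 4), dtype='float32')
b = a[1:, :]                                   # a view on the same buffer
assert may_share_memory(a, b)
assert not may_share_memory(a, np.zeros(3))    # distinct buffers

s = sp.csr_matrix((3, 3), dtype='float32')
assert not may_share_memory(a, s)              # unrelated buffers, so False
may_share_memory(a, "not an array", raise_other_type=False)   # returns False
# with raise_other_type=True (the default) the same call raises TypeError
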
def __init__(self, computeGradient=True):
    super(GpuCtc, self).__init__()
    self.computeGradient = computeGradient
    self.costs = T.fvector(name="ctc_cost")
    if self.computeGradient:
        self.gradients = CudaNdarrayVariable(
            name="ctc_grad",
            type=CudaNdarrayType(broadcastable=[False, False, False]))

def make_node(self, output_spike, H_out, weights):
    if output_spike.type.ndim != 4:
        raise TypeError('output_spike must be 4D tensor')
    if H_out.type.ndim != 4:
        raise TypeError('H_out must be 4D tensor')
    if weights.type.ndim != 4:
        raise TypeError('weights must be 4D tensor')
    # if LR.type.ndim != 1:
    #     raise TypeError('LR must be 1D tensor')
    # if weight_update.type.ndim != 4:
    #     raise TypeError('weight_update must be 4D tensor')

    output_spike = as_cuda_ndarray_variable(output_spike)
    H_out = as_cuda_ndarray_variable(H_out)
    weights = as_cuda_ndarray_variable(weights)
    # LR = as_cuda_ndarray_variable(LR)
    # weight_update = as_cuda_ndarray_variable(weight_update)

    print('MAKENODE: ', output_spike.shape, H_out.shape, weights.shape)

    # broadcastable = [output_spike.type.broadcastable[0], H_out.type.broadcastable[0],
    #                  weights.type.broadcastable[0], weight_update,
    #                  False, False, False, False]
    # otype = CudaNdarrayType(broadcastable=[False] * 4)
    broadcastable = [False, False, False, False, False]
    return Apply(self, [output_spike, H_out, weights],
                 [CudaNdarrayType(broadcastable)()])

def make_node(self, initial_state, inp_state, inp_update, inp_reset,
              state_to_state, state_to_update, state_to_reset):
    weights = [state_to_state, state_to_update, state_to_reset]

    batch_size = inp_state.shape[1]
    assert initial_state.dtype == "float32"
    assert initial_state.ndim == 1
    initial_state = as_cuda_ndarray_variable(
        tensor.repeat(initial_state[None, :], batch_size, 0))

    for i, w in enumerate(weights):
        weights[i] = as_cuda_ndarray_variable(w)
    inputs = [inp_state, inp_update, inp_reset]
    for i, b in enumerate(inputs):
        inputs[i] = as_cuda_ndarray_variable(b)

    for w in weights:
        assert w.dtype == "float32"
        assert w.ndim == 2
    for i in inputs:
        assert i.dtype == "float32"
        assert i.ndim == 3

    out_type = CudaNdarrayType((False, False))
    return theano.Apply(self, [initial_state] + inputs + weights, [out_type()])

def make_node(self, x, y):
    if x.type.ndim != 2:
        raise TypeError(x)
    if y.type.ndim != 2:
        raise TypeError(y)
    otype = CudaNdarrayType(
        (x.type.broadcastable[0], y.type.broadcastable[1]))
    return Apply(self, [x, y], [otype()])

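# The output type follows matrix-product shape rules: the first dimension comes
# from x, the second from y, so a (1, k) x (k, n) product keeps its broadcastable
# row dimension. A sketch, assuming `op` is an instance of the Op this make_node
# belongs to (the name is hypothetical):
x = CudaNdarrayType(broadcastable=(True, False))()   # e.g. shape (1, k)
y = CudaNdarrayType(broadcastable=(False, False))()  # e.g. shape (k, n)
node = op.make_node(x, y)
assert node.outputs[0].type.broadcastable == (True, False)
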
def float32_shared_constructor(value, name=None, strict=False,
                               allow_downcast=None, borrow=False,
                               broadcastable=None, target='gpu'):
    """
    SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or
    CudaNdarray.
    """
    if target != 'gpu':
        raise TypeError('not for gpu')

    if theano.sandbox.cuda.use.device_number is None:
        theano.sandbox.cuda.use("gpu",
                                force=True,
                                default_to_move_computation_to_gpu=False,
                                move_shared_float32_to_gpu=False,
                                enable_cuda=False)

    # if value isn't a float32 ndarray, or a CudaNdarray then raise
    if not isinstance(value, (numpy.ndarray, theano.sandbox.cuda.CudaNdarray)):
        raise TypeError('ndarray or CudaNdarray required')
    if isinstance(value, numpy.ndarray) and value.dtype.num != CudaNdarrayType.typenum:
        raise TypeError('float32 ndarray required')

    if broadcastable is None:
        broadcastable = (False,) * len(value.shape)
    type = CudaNdarrayType(broadcastable=broadcastable)
    get_value_return_ndarray = True
    if isinstance(value, theano.sandbox.cuda.CudaNdarray):
        get_value_return_ndarray = False
        if borrow:
            deviceval = value
        else:
            deviceval = value.copy()
    else:
        # type.broadcastable is guaranteed to be a tuple, which this next
        # function requires
        deviceval = type_support_filter(value, type.broadcastable, False, None)

    try:
        rval = CudaNdarraySharedVariable(type=type, value=deviceval,
                                         name=name, strict=strict)
    except Exception as e:
        print("ERROR", e)
        raise

    rval.get_value_return_ndarray = get_value_return_ndarray
    return rval

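# A minimal sketch of how this constructor is normally reached, assuming the old
# theano.sandbox.cuda backend is active (device=gpu, floatX=float32): theano.shared
# dispatches float32 ndarrays to float32_shared_constructor, which filters the value
# onto the card and returns a CudaNdarraySharedVariable.
import numpy
import theano

w0 = numpy.zeros((3, 4), dtype='float32')
w = theano.shared(w0, name='w', borrow=False)    # stored on the GPU as a CudaNdarray
print(type(w))                                   # CudaNdarraySharedVariable
print(w.get_value().shape, w.get_value().dtype)  # (3, 4) float32, copied back to numpy
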
def make_node(self, img, kern):
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                     False, False]
    return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

def make_node(self, x, y):
    # we suppose type checking has been done, but make sure.
    assert (x.type.ndim == 1 and y.type.ndim == 1 and
            x.type.dtype == 'float32' and y.type.dtype == 'float32')
    bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
    outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
    return Apply(self, [x, y], outputs)

def make_node(self, inp1, inp2):
    inp1 = as_cuda_ndarray_variable(inp1)
    inp2 = as_cuda_ndarray_variable(inp2)

    assert inp1.ndim == 2
    assert inp2.ndim == 2

    return theano.Apply(
        self, [inp1, inp2],
        [CudaNdarrayType(broadcastable=[False] * inp1.type.ndim)()])

def make_node(self, x, y, a):
    if x.type.ndim != 2:
        raise TypeError(x)
    if y.type.ndim != 2:
        raise TypeError(y)
    if not tensor.blas._as_scalar(a):
        raise TypeError(a)
    otype = CudaNdarrayType(
        (x.type.broadcastable[0], y.type.broadcastable[1]))
    return Apply(self, [x, y, a], [otype()])

def test_int_pow():
    a = CudaNdarrayType([False])()

    f = theano.function([a], (a * 4).sum(), mode=mode_with_gpu)
    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
    assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']

    f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
    assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']

def make_node(self, img, kern, desc):
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')

    if not isinstance(desc.type, CDataType) \
            or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')

    broadcastable = (img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False)
    return Apply(self, [img, kern, desc],
                 [CudaNdarrayType(broadcastable)()])

def test_dump_load(self):
    if not cuda_ndarray.cuda_enabled:
        raise SkipTest('Optional package cuda disabled')

    x = CudaNdarraySharedVariable('x', CudaNdarrayType((1, 1), name='x'),
                                  [[1]], False)

    with open('test', 'wb') as f:
        dump(x, f)
    with open('test', 'rb') as f:
        x = load(f)

    assert x.name == 'x'
    assert_allclose(x.get_value(), [[1]])

def make_node(self, kern, topgrad, desc):
    kern = as_cuda_ndarray_variable(kern)
    topgrad = as_cuda_ndarray_variable(topgrad)
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')

    if not isinstance(desc.type, CDataType) \
            or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')

    broadcastable = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1],
                     False, False]
    return Apply(self, [kern, topgrad, desc],
                 [CudaNdarrayType(broadcastable)()])

def make_node(self, acts, input_lengths, flat_labels, label_lengths):
    if not isinstance(acts.type, CudaNdarrayType):
        raise Exception("Activations should be CudaNdarrayType, not %s" %
                        (acts.type,))

    acts_ = acts
    input_lengths_ = T.as_tensor_variable(input_lengths)
    flat_labels_ = T.as_tensor_variable(flat_labels)
    label_lengths_ = T.as_tensor_variable(label_lengths)

    if acts_.dtype != "float32":
        raise Exception("acts must be float32 instead of %s" % acts.dtype)
    if input_lengths.dtype != "int32":
        raise Exception("input_lengths must be int32 instead of %s" %
                        input_lengths.dtype)
    if flat_labels.dtype != "int32":
        raise Exception("flat_labels must be int32 instead of %s" %
                        flat_labels.dtype)
    if label_lengths.dtype != "int32":
        raise Exception("label_lengths must be int32 instead of %s" %
                        label_lengths.dtype)

    # Normally a singleton Op instance is created, and different Apply nodes
    # are created for different inputs.
    # Here, we create an Op instance specifically for this application,
    # and store the gradient variable in it so that it can be used by grad().
    op = GpuCtc()
    op.costs = T.fvector(name="ctc_cost")
    op.gradients = CudaNdarrayVariable(
        name="gpu_ctc_grad",
        type=CudaNdarrayType(broadcastable=[False, False, False]))

    # Don't compute gradient unless needed
    op.computeGradient = theano.shared(np.asarray([1], dtype=np.int32))

    applyNode = theano.Apply(
        op,
        inputs=[acts_, input_lengths_, flat_labels_, label_lengths_,
                op.computeGradient],
        outputs=[op.costs, op.gradients])

    # Return only the cost. Gradient will be returned by grad()
    self.default_output = 0
    return applyNode

def make_node(self, dCdy, x, a, b, l, s):
    for input, ndim in ((dCdy, 2 + len(self.patch_shape)),
                        (x, 2 + len(self.patch_shape)),
                        (a, 2), (b, 2), (l, 2), (s, 2)):
        if not input.type.ndim == ndim:
            raise TypeError()

    dCdy, x, a, b, l, s = tuple(map(gpu_contiguous, (dCdy, x, a, b, l, s)))
    inputs = list(map(as_cuda_ndarray_variable, (dCdy, x, a, b, l, s)))

    # we could return the much smaller dCdl, dCds but that
    # gives us very little room to parallelize (e.g. with batch
    # size 100 and 3 spatial dimensions we have only 600
    # independently computable output elements).
    output_type = CudaNdarrayType(
        broadcastable=list(inputs[0].type.broadcastable) + [False],
        dtype=inputs[0].type.dtype)
    dydl = output_type()
    dyds = output_type()
    return Apply(self, inputs, [dydl, dyds])

def make_node(self, kern, topgrad, shape=None):
    kern = as_cuda_ndarray_variable(kern)
    topgrad = as_cuda_ndarray_variable(topgrad)
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')
    if shape is None:
        if self.subsample != (1, 1):
            raise ValueError('shape must be given if subsample != (1, 1)')
        height_width = []
    else:
        height_width = [shape[0], shape[1]]
        assert shape[0].ndim == 0
        assert shape[1].ndim == 0

    broadcastable = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1],
                     False, False]
    return Apply(self, [kern, topgrad] + height_width,
                 [CudaNdarrayType(broadcastable)()])

def make_node(self, cond, ift, iff):
    if any(ift.broadcastable) or any(iff.broadcastable):
        raise ValueError(
            "GPURowSwitch cannot operate on broadcastable "
            "output arguments (ift %s, iff %s)."
            % (ift.broadcastable, iff.broadcastable))

    out_type = ift.dtype

    cond = as_cuda_ndarray_variable(T.cast(cond.flatten(), "float32"))
    ift = as_cuda_ndarray_variable(ift)
    iff = as_cuda_ndarray_variable(iff)

    assert ift.type.dtype == iff.type.dtype
    assert cond.ndim == 1, cond.ndim
    assert ift.ndim == iff.ndim

    return theano.gof.Apply(
        self, [cond, ift, iff],
        [CudaNdarrayType(broadcastable=ift.broadcastable,
                         dtype=out_type)()])

def make_node(self, img, topgrad, shape=None):
    img = as_cuda_ndarray_variable(img)
    topgrad = as_cuda_ndarray_variable(topgrad)
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')
    if shape is None:
        if self.subsample != (1, 1) or self.border_mode == "half":
            raise ValueError('shape must be given if subsample != (1, 1)'
                             ' or border_mode == "half"')
        height_width = []
    else:
        height_width = [shape[0], shape[1]]
        assert shape[0].ndim == 0
        assert shape[1].ndim == 0

    broadcastable = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1],
                     False, False]
    return Apply(self, [img, topgrad] + height_width,
                 [CudaNdarrayType(broadcastable)()])

def make_node(self, x, ilist):
    x_ = as_cuda_ndarray_variable(x)
    ilist_ = gpu_contiguous(T.cast(ilist, dtype=config.floatX))
    # ilist_ = T.as_tensor_variable(ilist)
    # if ilist_.type.dtype[:3] not in ('int', 'uin'):
    #     raise TypeError('index must be integers')
    if ilist_.type.ndim != 1:
        raise TypeError('index must be vector')
    if x_.type.ndim == 0:
        raise TypeError('cannot index into a scalar')

    # # c code suppose it is int64
    # if x.ndim in [1, 2, 3] and ilist_.dtype in [
    #         'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']:
    #     ilist_ = tensor.cast(ilist_, 'int64')

    bcast = (ilist_.broadcastable[0],) + x_.broadcastable[1:]
    return theano.gof.Apply(
        self, [x_, ilist_],
        [CudaNdarrayType(dtype=x.dtype, broadcastable=bcast)()])

def make_node(self, img, topgrad, desc, h, w):
    img = as_cuda_ndarray_variable(img)
    topgrad = as_cuda_ndarray_variable(topgrad)
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')

    if not isinstance(desc.type, CDataType) \
            or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')

    h = as_scalar(h)
    w = as_scalar(w)

    broadcastable = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1],
                     False, False]
    return Apply(self, [img, topgrad, desc, h, w],
                 [CudaNdarrayType(broadcastable)()])

def make_node(self, cond, ift, iff):
    if any(ift.broadcastable) or any(iff.broadcastable):
        raise ValueError(
            "GpuMaskedCAReduce cannot operate on "
            "broadcastable output arguments (ift %s, iff %s)."
            % (ift.broadcastable, iff.broadcastable))

    out_type = ift.dtype

    cond = as_cuda_ndarray_variable(T.cast(cond.flatten(), "float32"))
    ift = as_cuda_ndarray_variable(ift)
    iff = as_cuda_ndarray_variable(iff)

    # TODO check contiguous?

    assert ift.type.dtype == iff.type.dtype
    assert cond.ndim == 1, cond.ndim
    assert ift.ndim == iff.ndim

    out_bcast = ift.broadcastable[1:]

    return theano.gof.Apply(
        self, [cond, ift, iff],
        [CudaNdarrayType(broadcastable=out_bcast, dtype=out_type)()])

def may_share_memory(a, b, raise_other_type=True):
    a_ndarray = isinstance(a, numpy.ndarray)
    b_ndarray = isinstance(b, numpy.ndarray)
    a_sparse = _is_sparse(a)
    b_sparse = _is_sparse(b)
    a_cuda = _is_cuda(a)
    b_cuda = _is_cuda(b)

    if (not (a_ndarray or a_sparse or a_cuda) or
            not (b_ndarray or b_sparse or b_cuda)):
        if raise_other_type:
            raise TypeError("may_share_memory support only ndarray"
                            " and scipy.sparse and CudaNdarray type")
        return False

    if a_ndarray and b_ndarray:
        return TensorType.may_share_memory(a, b)
    if a_cuda and b_cuda:
        from theano.sandbox.cuda.type import CudaNdarrayType
        return CudaNdarrayType.may_share_memory(a, b)
    if a_cuda or b_cuda:
        return False
    return SparseType.may_share_memory(a, b)

def float32_shared_constructor(value, name=None, strict=False,
                               allow_downcast=None, borrow=False,
                               broadcastable=None):
    """SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray."""

    # if value isn't a float32 ndarray, or a CudaNdarray then raise
    if not isinstance(value, (numpy.ndarray, theano.sandbox.cuda.CudaNdarray)):
        raise TypeError('ndarray or CudaNdarray required')
    if isinstance(value, numpy.ndarray) and value.dtype.num != CudaNdarrayType.typenum:
        raise TypeError('float32 ndarray required')

    if broadcastable is None:
        broadcastable = (False,) * len(value.shape)
    type = CudaNdarrayType(broadcastable=broadcastable)
    get_value_return_ndarray = True
    if isinstance(value, theano.sandbox.cuda.CudaNdarray):
        get_value_return_ndarray = False
        if borrow:
            deviceval = value
        else:
            deviceval = value.copy()
    else:
        # type.broadcastable is guaranteed to be a tuple, which this next
        # function requires
        deviceval = type_support_filter(value, type.broadcastable, False, None)

    try:
        rval = CudaNdarraySharedVariable(type=type, value=deviceval,
                                         name=name, strict=strict)
    except Exception as e:
        print("ERROR", e)
        raise

    rval.get_value_return_ndarray = get_value_return_ndarray
    return rval

def output_type(self, inp):
    return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

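# A sketch of how an output_type helper like this is typically used inside an
# Op's make_node: the output variable gets the same rank as the input, with no
# broadcastable dimensions. The make_node below is hypothetical, not part of
# the original class.
def make_node(self, inp):
    inp = as_cuda_ndarray_variable(inp)
    return theano.Apply(self, [inp], [self.output_type(inp)()])
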
def make_node(self,
              # model parameters and bookkeeping variables
              V, UT, Uinv, QT, omega, w_bar,
              # minibatch value inputs
              HT, KindexesT,
              # workspace
              work_d, work_m,
              # minibatch gradient inputs
              grad_AT, grad_q, grad_s,
              # learning rate
              eta):
    """output will be: grad_HT"""

    # The following are supposed to reside on the GPU
    V = as_cuda_ndarray_variable(V)
    UT = as_cuda_ndarray_variable(UT)
    Uinv = as_cuda_ndarray_variable(Uinv)
    QT = as_cuda_ndarray_variable(QT)
    omega = as_cuda_ndarray_variable(omega)
    w_bar = as_cuda_ndarray_variable(w_bar)
    HT = as_cuda_ndarray_variable(HT)

    # This is on CPU
    KindexesT = as_tensor_variable(KindexesT)

    # The following are supposed to reside on the GPU
    work_d = as_cuda_ndarray_variable(work_d)
    work_m = as_cuda_ndarray_variable(work_m)
    grad_AT = as_cuda_ndarray_variable(grad_AT)
    grad_q = as_cuda_ndarray_variable(grad_q)
    grad_s = as_cuda_ndarray_variable(grad_s)

    # This is on CPU
    eta = as_tensor_variable(eta)

    # parameter list
    params = [V, UT, Uinv, QT, omega, w_bar,
              HT, KindexesT,
              work_d, work_m,
              grad_AT, grad_q, grad_s,
              eta]

    # make sure parameters are either all of dtype float32 or all of dtype
    # float64 (except for Kindexes which are integers)
    elem_type = V.dtype
    if elem_type != "float32" and elem_type != "float64":
        raise TypeError(
            "GpuFactoredSphericalOp parameter V must have dtype of float32 or float64")

    check_tensor_variables_ndim_and_dtype(0, elem_type, ["eta"], locals())
    check_tensor_variables_ndim_and_dtype(
        2, elem_type,
        ["V", "UT", "Uinv", "QT", "HT", "grad_AT", "work_d", "work_m"],
        locals())
    check_tensor_variables_ndim_and_dtype(
        1, elem_type, ["omega", "w_bar", "grad_q", "grad_s"], locals())
    check_tensor_variables_ndim_and_dtype(2, "int32", ["KindexesT"], locals())

    # Now properly set up outputs to compute: grad_HT
    outputs = [CudaNdarrayType(broadcastable=(False, False))()]

    return Apply(self, params, outputs)

def make_node(self, V, U, UinvT, Q, H, Y_indexes, Y_values, learning_rate,
              use_qtilde=0, use_lower=1, invup_mode=1,
              stabilize_period=10, unfactorize_period=100, debug_print=0):

    # The following are supposed to reside on the GPU
    V = as_cuda_ndarray_variable(V)
    U = as_cuda_ndarray_variable(U)
    UinvT = as_cuda_ndarray_variable(UinvT)
    Q = as_cuda_ndarray_variable(Q)
    H = as_cuda_ndarray_variable(H)

    # The following are on the CPU
    Y_indexes = as_tensor_variable(Y_indexes)
    Y_values = as_tensor_variable(Y_values)
    learning_rate = as_tensor_variable(learning_rate)
    use_qtilde = as_tensor_variable(use_qtilde)
    use_lower = as_tensor_variable(use_lower)
    invup_mode = as_tensor_variable(invup_mode)
    stabilize_period = as_tensor_variable(stabilize_period)
    unfactorize_period = as_tensor_variable(unfactorize_period)
    debug_print = as_tensor_variable(debug_print)

    # print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
    # for k, v in locals().items():
    #     print k, ':', type(v)
    # print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"

    params = [V, U, UinvT, Q, H, Y_indexes, Y_values, learning_rate,
              use_qtilde, use_lower, invup_mode,
              stabilize_period, unfactorize_period, debug_print]

    # make sure parameters are either all of dtype float32 or all of dtype
    # float64 (except for Y_indexes which are integers)
    elem_type = V.dtype
    if elem_type != "float32" and elem_type != "float64":
        raise TypeError(
            "LargeSparseTargets parameter V must have dtype of float32 or float64")

    check_tensor_variables_ndim_and_dtype(0, elem_type, ["learning_rate"], locals())
    check_tensor_variables_ndim_and_dtype(
        2, elem_type, ["V", "U", "UinvT", "Q", "H", "Y_values"], locals())
    check_tensor_variables_ndim_and_dtype(2, "int32", ["Y_indexes"], locals())
    # T.matrix(elem_type)

    # Now properly set up outputs to compute
    if self.what_to_output == 0:    # output scalar cost
        outputs = [T.scalar(elem_type)]
    elif self.what_to_output == 1:  # output grad_H
        outputs = [CudaNdarrayType(broadcastable=(False, False))()]
    elif self.what_to_output == 2:  # output cost and grad_H
        outputs = [T.scalar(elem_type),
                   CudaNdarrayType(broadcastable=(False, False))()]
    else:
        raise ValueError("Invalid value for what_to_output: must be 0, 1, or 2")

    return Apply(self, params, outputs)