def testMultiThreaded(self):
    """
    Test that name/device scope are properly local to the thread
    and don't interfere.

    Four ``thread_runner`` threads are started, then the calling thread
    enters a "master" name scope and checks that its own scopes are
    unaffected before and after the workers are joined. Finally the shared
    ``SUCCESS_COUNT`` is checked to confirm every worker ran to completion.
    """
    global SUCCESS_COUNT
    # ``assertEquals`` is a deprecated alias that no longer exists on
    # Python 3.12+, so the supported assertion names are used throughout.
    self.assertEqual(scope.CurrentNameScope(), "")
    self.assertIsNone(scope.CurrentDeviceScope())

    threads = [
        threading.Thread(target=thread_runner, args=(i, self))
        for i in range(4)
    ]
    for t in threads:
        t.start()

    with scope.NameScope("master"):
        self.assertIsNone(scope.CurrentDeviceScope())
        self.assertEqual(scope.CurrentNameScope(), "master/")
        for t in threads:
            t.join()

        # The workers entered their own scopes; ours must be untouched.
        self.assertEqual(scope.CurrentNameScope(), "master/")
        self.assertIsNone(scope.CurrentDeviceScope())

    # Ensure all threads succeeded
    self.assertEqual(SUCCESS_COUNT, 4)
def testDevicescopeBasic(self):
    """Check that DeviceScope installs its option and clears it on exit."""
    # ``assertEquals`` is a deprecated alias removed in Python 3.12.
    self.assertIsNone(scope.CurrentDeviceScope())

    dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
    with scope.DeviceScope(dsc):
        self.assertEqual(scope.CurrentDeviceScope(), dsc)

    self.assertIsNone(scope.CurrentDeviceScope())
def testDevicescopeBasic(self):
    """Check that DeviceScope installs its option and clears it on exit."""
    # ``assertEquals`` is a deprecated alias removed in Python 3.12.
    self.assertIsNone(scope.CurrentDeviceScope())

    dsc = core.DeviceOption(_gpu_device_type(), 9)
    with scope.DeviceScope(dsc):
        self.assertEqual(scope.CurrentDeviceScope(), dsc)

    self.assertIsNone(scope.CurrentDeviceScope())
def testDevicescopeBasic(self):
    """Check that DeviceScope installs its option and clears it on exit."""
    # ``assertEquals`` is a deprecated alias removed in Python 3.12.
    self.assertIsNone(scope.CurrentDeviceScope())

    dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
    with scope.DeviceScope(dsc):
        self.assertEqual(scope.CurrentDeviceScope(), dsc)

    self.assertIsNone(scope.CurrentDeviceScope())
def testDevicescopeAssertion(self):
    """Check that the device scope is cleared when the scoped body raises."""

    class _ScopeUnwind(Exception):
        """Marker raised only to leave the device scope via an exception."""

    self.assertIsNone(scope.CurrentDeviceScope())

    dsc = core.DeviceOption(_gpu_device_type(), 9)
    # Catch only the marker exception. A blanket ``except Exception`` would
    # also swallow the AssertionError raised by the check inside the scope,
    # so that check could never fail the test.
    with self.assertRaises(_ScopeUnwind):
        with scope.DeviceScope(dsc):
            self.assertEqual(scope.CurrentDeviceScope(), dsc)
            raise _ScopeUnwind()

    self.assertIsNone(scope.CurrentDeviceScope())
def testDevicescopeAssertion(self):
    """Check that the device scope is cleared when the scoped body raises."""

    class _ScopeUnwind(Exception):
        """Marker raised only to leave the device scope via an exception."""

    self.assertIsNone(scope.CurrentDeviceScope())

    dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
    # Catch only the marker exception. A blanket ``except Exception`` would
    # also swallow the AssertionError raised by the check inside the scope,
    # so that check could never fail the test.
    with self.assertRaises(_ScopeUnwind):
        with scope.DeviceScope(dsc):
            self.assertEqual(scope.CurrentDeviceScope(), dsc)
            raise _ScopeUnwind()

    self.assertIsNone(scope.CurrentDeviceScope())
def testDevicescopeAssertion(self):
    """Check that the device scope is cleared when the scoped body raises."""

    class _ScopeUnwind(Exception):
        """Marker raised only to leave the device scope via an exception."""

    self.assertIsNone(scope.CurrentDeviceScope())

    dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
    # Catch only the marker exception. A blanket ``except Exception`` would
    # also swallow the AssertionError raised by the check inside the scope,
    # so that check could never fail the test.
    with self.assertRaises(_ScopeUnwind):
        with scope.DeviceScope(dsc):
            self.assertEqual(scope.CurrentDeviceScope(), dsc)
            raise _ScopeUnwind()

    self.assertIsNone(scope.CurrentDeviceScope())
def add_init_params(self, init_net):
    """
    Append the initializer op of every entry in ``self.params`` to
    ``init_net``.

    Params without an initializer are skipped. When an initializer carries
    no ``device_option`` and a device scope is active, a copy of the op
    tagged with that scope is used instead. Ops that already appear in the
    net (compared with ``utils.OpAlmostEqual``, passing 'debug_info') are
    not added again.
    """
    for param in self.params:
        # TODO(amalevich): Either return back to lambdas, that add
        # all params (looks a bit safer and breaking less
        # abstractions) or extend Net interface to this type of
        # operations better
        # TODO(xlwang) init_net._net.op has type google.protobuf.\
        # internal.containers.RepeatedCompositeFieldContainer, but
        # the version of protobuf in fbcode does not support append
        # so extend is used
        op_to_add = param.initializer
        active_device = scope.CurrentDeviceScope()
        if not op_to_add:
            continue

        lacks_device = not op_to_add.HasField('device_option')
        if lacks_device and active_device:
            # Work on a copy so the param's own initializer is left alone.
            op_to_add = caffe2_pb2.OperatorDef()
            op_to_add.CopyFrom(param.initializer)
            op_to_add.device_option.CopyFrom(active_device)

        # do not add duplicated init ops
        already_present = any(
            utils.OpAlmostEqual(existing, op_to_add, 'debug_info')
            for existing in init_net._net.op
        )
        if not already_present:
            init_net._net.op.extend([op_to_add])
def _run(self, net, param_init_net, param_info):
    """
    Add the SGD update ops for one parameter.

    Does nothing when ``self.base_learning_rate`` is zero. Otherwise a
    learning-rate blob is built via ``self.build_lr``, a per-device "ONE"
    constant (and, with positive momentum, a momentum buffer) is created in
    ``param_init_net``, and the update op matching the gradient kind
    (``core.GradientSlice`` or dense) and momentum setting is added to
    ``net``.
    """
    weights = param_info.blob
    gradient = param_info.grad
    if self.base_learning_rate == 0:
        return
    assert self.base_learning_rate > 0

    # We need negative sign for LR when used directly with WeightedSum
    # below.
    sign = -1 if self.momentum else 1
    learning_rate, _ = self.build_lr(
        net, param_init_net,
        base_learning_rate=self.base_learning_rate * sign,
        policy=self.policy,
        **(self.init_kwargs)
    )

    device = scope.CurrentDeviceScope()
    if device is None:
        device = core.DeviceOption(caffe2_pb2.CPU)

    # Each GPU/CPU must have its own ONE blob, thus modify the name
    # to include device information.
    one_name = "ONE_{}_{}{}".format(
        device.device_type, device.cuda_gpu_id, device.node_name)
    one = param_init_net.ConstantFill([], one_name, shape=[1], value=1.0)
    self._aux_params.shared.append(one)

    if self.momentum > 0:
        momentum_data = param_init_net.ConstantFill(
            weights, str(weights) + "_momentum", value=0.)
        self._aux_params.local.append(momentum_data)

    if not isinstance(gradient, core.GradientSlice):
        # Dense gradient.
        if self.momentum > 0.:
            net.MomentumSGDUpdate(
                [gradient, momentum_data, learning_rate, weights],
                [gradient, momentum_data, weights],
                momentum=self.momentum,
                nesterov=self.nesterov)
        else:
            net.WeightedSum(
                [weights, one, gradient, learning_rate], weights)
        return

    # Sparse gradient: run it through self.dedup before the update.
    gradient = self.dedup(net, self.sparse_dedup_aggregator, gradient)
    if self.momentum > 0.:
        net.SparseMomentumSGDUpdate(
            [gradient.values, momentum_data, learning_rate, weights,
             gradient.indices],
            [gradient.values, momentum_data, weights],
            momentum=self.momentum,
            nesterov=self.nesterov)
    else:
        net.ScatterWeightedSum(
            [weights, one, gradient.indices, gradient.values,
             learning_rate],
            weights)
def FeedBlob(name, arr, device_option=None):
    """Feeds a blob into the workspace.

    Inputs:
      name: the name of the blob.
      arr: either a TensorProto object or a numpy array object to be fed into
          the workspace.
      device_option (optional): the device option to feed the data with.
          When omitted, the current device scope (if any) is used.
    Returns:
      True or False, stating whether the feed is successful.
    """
    if type(arr) is caffe2_pb2.TensorProto:
        arr = utils.Caffe2TensorToNumpyArray(arr)
    if type(arr) is np.ndarray and arr.dtype.kind in 'SU':
        # Plain NumPy strings are weird, let's use objects instead.
        # ``np.object`` was only an alias of the builtin and has been removed
        # from recent NumPy releases, so the builtin is used directly.
        arr = arr.astype(object)

    if device_option is None:
        device_option = scope.CurrentDeviceScope()

    if device_option and device_option.device_type == caffe2_pb2.CUDA:
        # The warning is advisory only; skip it for inputs without a dtype
        # instead of failing with AttributeError before the feed.
        if hasattr(arr, 'dtype') and arr.dtype == np.dtype('float64'):
            logger.warning(
                "CUDA operators do not support 64-bit doubles, "
                "please use arr.astype(np.float32) or np.int32 for ints."
                " Blob: {} type: {}".format(name, str(arr.dtype)))

    name = StringifyBlobName(name)
    if device_option is not None:
        return C.feed_blob(name, arr, StringifyProto(device_option))
    return C.feed_blob(name, arr)
def load_from_db(filename, db_type, device_option=None, *args, **kwargs):
    """
    Open a predictor db and return the MetaNetDef stored in it.

    Args:
        filename: path passed to the 'CreateDB' operator as ``db``.
        db_type: db backend name passed to 'CreateDB' as ``db_type``.
        device_option: optional device option copied onto every op of every
            net in the loaded MetaNetDef. Defaults to the current device
            scope; when that is also None the ops are left untouched.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The deserialized ``metanet_pb2.MetaNetDef``.

    Raises:
        AssertionError: if creating the db reader or loading the meta net
            def fails.
    """
    # global_init_net in meta_net_def will load parameters from
    # predictor_constants.PREDICTOR_DBREADER
    create_db = core.CreateOperator(
        'CreateDB', [],
        [core.BlobReference(predictor_constants.PREDICTOR_DBREADER)],
        db=filename, db_type=db_type)
    # The operator must run even under ``python -O``, which strips
    # ``assert`` statements, so the call is made outside of an assert and
    # the same exception type is raised explicitly.
    if not workspace.RunOperatorOnce(create_db):
        raise AssertionError('Failed to create db {}'.format(filename))

    # predictor_constants.META_NET_DEF is always stored before the parameters
    load_meta_net_def = core.CreateOperator(
        'Load',
        [core.BlobReference(predictor_constants.PREDICTOR_DBREADER)],
        [core.BlobReference(predictor_constants.META_NET_DEF)])
    if not workspace.RunOperatorOnce(load_meta_net_def):
        raise AssertionError(
            'Failed to load meta net def from db {}'.format(filename))

    blob = workspace.FetchBlob(predictor_constants.META_NET_DEF)
    meta_net_def = serde.deserialize_protobuf_struct(
        blob if isinstance(blob, bytes) else str(blob).encode('utf-8'),
        metanet_pb2.MetaNetDef)

    if device_option is None:
        device_option = scope.CurrentDeviceScope()

    if device_option is not None:
        # Set the device options of all loaded blobs
        for kv in meta_net_def.nets:
            for op in kv.value.op:
                op.device_option.CopyFrom(device_option)

    return meta_net_def
def _run(self, net, param_init_net, param_info):
    """
    Add a weight-decay term to the gradient of one parameter.

    Two per-device constants are created in ``param_init_net`` (a "ONE"
    blob and a "wd" blob holding ``self.weight_decay``); ``net`` then
    overwrites the gradient with a WeightedSum of the gradient and the
    parameter blob using those weights.

    Raises:
        ValueError: if the gradient is a ``core.GradientSlice``.
    """
    device = scope.CurrentDeviceScope()
    if device is None:
        device = core.DeviceOption(caffe2_pb2.CPU)
    device_key = (device.device_type, device.cuda_gpu_id)

    # Both constants are created before the sparse check, as before.
    one = param_init_net.ConstantFill(
        [], "ONE_{}_{}".format(*device_key), shape=[1], value=1.0)
    decay = param_init_net.ConstantFill(
        [], "wd_{}_{}".format(*device_key),
        shape=[1], value=self.weight_decay)

    if isinstance(param_info.grad, core.GradientSlice):
        raise ValueError(
            "Weight decay does not yet support sparse gradients")

    net.WeightedSum(
        [param_info.grad, one, param_info.blob, decay],
        param_info.grad,
    )
def init_data_input_workers(
    net,
    input_blob_names,
    fetch_fun,
    batch_size,
    num_worker_threads=2,
    input_source_name="train",
    max_buffered_batches=800,
    init_fun=None,
    external_loggers=None,
    dont_rebatch=False,
    batch_columns=None,
    timeout=600
):
    """
    Build the fetcher and enqueuer threads for one input source and register
    them with the module-level ``global_coordinator``.

    A ``BatchFeeder`` is created for the current device scope (CPU when no
    scope is active) and name scope, wrapped in a ``WorkerCoordinator``, and
    given ``num_worker_threads`` fetch threads plus one enqueuer thread. The
    threads are only constructed here, not started.

    Returns:
        The module-level ``global_coordinator``.
    """
    global global_coordinator

    device_option = scope.CurrentDeviceScope()
    if device_option is None:
        device_option = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU)

    metrics = Metrics(external_loggers)
    name_scope = scope.CurrentNameScope()
    batch_queue = global_coordinator.get_queue(
        input_source_name, max_buffered_batches)
    batch_feeder = BatchFeeder(
        net,
        input_blob_names,
        batch_size,
        device_option,
        name_scope,
        input_source_name,
        batch_queue,
        metrics,
        dont_rebatch,
        batch_columns
    )

    # Create coordinator object
    coordinator = WorkerCoordinator(
        input_source_name, init_fun, batch_feeder)

    # Reserve every worker id first, then build the fetch worker threads.
    worker_ids = []
    for _ in range(num_worker_threads):
        worker_ids.append(global_coordinator.get_new_worker_id())

    workers = []
    for worker_id in worker_ids:
        thread_name = "data_workers fetcher id {}".format(worker_id)
        data_worker = DataWorker(
            coordinator, worker_id, fetch_fun, metrics,
            batch_size, batch_feeder)
        workers.append(threading.Thread(
            target=run_worker,
            name=thread_name,
            args=[coordinator, data_worker],
        ))

    enqueuer_name = "Enqueuer {} {}".format(
        input_source_name, scope.CurrentNameScope())
    workers.append(threading.Thread(
        target=enqueuer,
        name=enqueuer_name,
        args=[coordinator, batch_feeder]))

    coordinator._workers = workers
    global_coordinator.add(coordinator)

    return global_coordinator
def _run(self, net, param_init_net, param_info):
    """
    Add a sign-based penalty term to the gradient of one parameter.

    Two per-device constants are created in ``param_init_net`` (a "ONE"
    blob and an "SS" blob holding ``self.sparse_scale``). ``net`` then takes
    the Sign of the parameter blob and overwrites the gradient with a
    WeightedSum of the gradient and that sign blob.

    Raises:
        ValueError: if the gradient is a ``core.GradientSlice``.
    """
    device = scope.CurrentDeviceScope()
    if device is None:
        device = core.DeviceOption(caffe2_pb2.CPU)
    device_key = (device.device_type, device.cuda_gpu_id)

    # Both constants are created before the sparse check, as before.
    one = param_init_net.ConstantFill(
        [], "ONE_{}_{}".format(*device_key), shape=[1], value=1.0)
    scale = param_init_net.ConstantFill(
        [], "SS_{}_{}".format(*device_key),
        shape=[1], value=self.sparse_scale)

    if isinstance(param_info.grad, core.GradientSlice):
        raise ValueError(
            "Weight decay does not yet support sparse gradients")

    sign_blob = net.Sign(
        [param_info.blob],
        ['{}_sign'.format(param_info.blob)],
    )
    net.WeightedSum(
        [param_info.grad, one, sign_blob, scale],
        param_info.grad,
    )
def _run(self, net, param_init_net, param_info):
    """
    Add the RmsProp update ops for one dense parameter.

    Creates a per-device "ONE" constant and three param-shaped state blobs
    (``_grad_o``, ``_mean_squares``, ``_momentum``) in ``param_init_net``,
    builds the learning rate via ``self.build_lr`` with a negated
    ``self.alpha``, and then adds an RmsProp op followed by a
    MomentumSGDUpdate op to ``net``.

    Asserts that ``self.alpha`` is positive and that the gradient is not a
    ``core.GradientSlice``.
    """
    param = param_info.blob
    grad = param_info.grad

    assert self.alpha > 0
    assert not isinstance(grad, core.GradientSlice), \
        "RmsPropOptimizer doesn't support sparse gradients"

    dev = scope.CurrentDeviceScope()
    if dev is None:
        dev = core.DeviceOption(caffe2_pb2.CPU)

    ONE = param_init_net.ConstantFill(
        [],
        "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
        shape=[1],
        value=1.0
    )

    lr, _ = self.build_lr(
        net,
        param_init_net,
        base_learning_rate=-self.alpha,
        policy=self.policy,
        **(self.init_kwargs)
    )

    # The fill argument is spelled ``value``, matching every other
    # ConstantFill call in this code base; the previous ``values=`` keyword
    # was a misspelling.
    # NOTE(review): presumably the op ignored the unknown ``values`` arg and
    # fell back to a zero default, so results should be unchanged - confirm.
    grad_o = param_init_net.ConstantFill(
        [param],
        str(param) + "_grad_o",
        value=0.0,
    )

    ms = param_init_net.ConstantFill(
        [param],
        str(param) + "_mean_squares",
        value=0.0,
    )

    mom = param_init_net.ConstantFill(
        [param],
        str(param) + "_momentum",
        value=0.0,
    )

    # Only the mean-squares and momentum blobs are registered as local
    # auxiliary params; grad_o is not.
    self._aux_params.local.append(ms)
    self._aux_params.local.append(mom)

    net.RmsProp(
        [grad, ms, mom, ONE],
        [grad_o, ms, mom],
        decay=self.decay,
        momentum=self.momentum,
        epsilon=self.epsilon,
        engine=self.engine,
    )

    net.MomentumSGDUpdate(
        [grad_o, mom, lr, param],
        [grad_o, mom, param],
    )
def testTags(self):
    """
    Check that ``extra_info`` tags accumulate across nested device scopes
    and are dropped again as each scope exits.
    """
    # ``assertEquals`` is a deprecated alias removed in Python 3.12.
    self.assertIsNone(scope.CurrentDeviceScope())

    extra_info1 = ["key1:value1"]
    extra_info2 = ["key2:value2"]
    extra_info3 = ["key3:value3"]

    extra_info_1_2 = ["key1:value1", "key2:value2"]
    extra_info_1_2_3 = ["key1:value1", "key2:value2", "key3:value3"]

    with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info1)):
        self.assertEqual(
            scope.CurrentDeviceScope().extra_info, extra_info1)

        with scope.DeviceScope(
                core.DeviceOption(0, extra_info=extra_info2)):
            self.assertEqual(
                scope.CurrentDeviceScope().extra_info, extra_info_1_2)

            with scope.DeviceScope(
                    core.DeviceOption(0, extra_info=extra_info3)):
                self.assertEqual(
                    scope.CurrentDeviceScope().extra_info,
                    extra_info_1_2_3)

            # Leaving the innermost scope drops only its own tag.
            self.assertEqual(
                scope.CurrentDeviceScope().extra_info, extra_info_1_2)

        self.assertEqual(
            scope.CurrentDeviceScope().extra_info, extra_info1)

    self.assertIsNone(scope.CurrentDeviceScope())
def build_lr(self, net, param_init_net, base_learning_rate,
             learning_rate_blob=None, policy="fixed",
             iter_val=0, **kwargs):
    """
    Create (or look up) the iteration counter and learning-rate blob.

    The iteration blob named ``_OPTIMIZER_ITERATION_NAME`` is created on CPU
    in ``param_init_net``, together with a mutex and an AtomicIter op in
    ``net``, unless it is already defined. The learning-rate blob is created
    with a LearningRate op unless ``net`` already defines it. When
    ``self._lr_multiplier`` is set, the rate is multiplied by it.

    Returns:
        A ``(lr, iteration)`` tuple of blobs.
    """
    if learning_rate_blob is None:
        learning_rate_blob = self.make_unique_blob_name('lr')

    iter_blob_name = _OPTIMIZER_ITERATION_NAME
    if param_init_net.BlobIsDefined(iter_blob_name):
        iteration = param_init_net.GetBlobRef(iter_blob_name)
    else:
        # Add training operators.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            iteration = param_init_net.ConstantFill(
                [], iter_blob_name, shape=[1],
                value=iter_val,
                dtype=core.DataType.INT64)
            iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
            net.AtomicIter([iter_mutex, iteration], [iteration])

    if net.BlobIsDefined(learning_rate_blob):
        lr = net.GetBlobRef(learning_rate_blob)
    else:
        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iteration],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )

    if self._lr_multiplier is None:
        return lr, iteration

    active_scope = scope.CurrentDeviceScope()
    needs_copy = (
        active_scope is not None
        and active_scope.device_type == caffe2_pb2.CUDA
        and not self._lr_multiplier_on_gpu
    )
    if needs_copy:
        # In a CUDA scope with a multiplier not flagged as on-GPU: copy it in.
        multiplier = net.CopyFromCPUInput(
            self._lr_multiplier,
            self.make_unique_blob_name('lr_multiplier')
        )
    else:
        multiplier = self._lr_multiplier

    lr = net.Mul(
        [lr, multiplier],
        self.make_unique_blob_name('scaled_lr'),
        broadcast=1,
    )
    return lr, iteration
def thread_runner(idx, testobj):
    """
    Thread body used by the multi-threaded scope test.

    Enters a device scope and a name scope unique to ``idx``, checks them
    before and after a short sleep, checks that both are cleared on exit,
    and finally increments the module-level ``SUCCESS_COUNT``.

    Args:
        idx: worker index; used for the name scope, the device id and the
            sleep duration.
        testobj: the test case whose assertion methods are used.
    """
    global SUCCESS_COUNT
    # ``assertEquals`` is a deprecated alias removed in Python 3.12.
    testobj.assertEqual(scope.CurrentNameScope(), "")
    testobj.assertIsNone(scope.CurrentDeviceScope())

    namescope = "namescope_{}".format(idx)
    dsc = core.DeviceOption(caffe2_pb2.CUDA, idx)
    with scope.DeviceScope(dsc):
        with scope.NameScope(namescope):
            testobj.assertEqual(scope.CurrentNameScope(), namescope + "/")
            testobj.assertEqual(scope.CurrentDeviceScope(), dsc)

            # Sleep so the scopes of the different workers overlap in time.
            time.sleep(0.01 + idx * 0.01)
            testobj.assertEqual(scope.CurrentNameScope(), namescope + "/")
            testobj.assertEqual(scope.CurrentDeviceScope(), dsc)

    testobj.assertEqual(scope.CurrentNameScope(), "")
    testobj.assertIsNone(scope.CurrentDeviceScope())

    # Only reached when every assertion above passed.
    # NOTE(review): this increment is not protected by a lock.
    SUCCESS_COUNT += 1
def build_lr(self, net, param_init_net, base_learning_rate,
             learning_rate_blob=None, policy="fixed",
             iter_val=0, **kwargs):
    """
    Create (or look up) the learning-rate blob and its iteration counter.

    The iteration blob comes from ``utils.BuildUniqueMutexIter``. The
    learning-rate blob is created with a LearningRate op unless ``net``
    already defines it. The rate is then multiplied by
    ``self._lr_multiplier`` and by ``self._local_lr_multiplier`` when those
    are set.

    Returns:
        A ``(lr, iteration)`` tuple of blobs.
    """
    if learning_rate_blob is None:
        learning_rate_blob = self.make_unique_blob_name('lr')

    iteration = utils.BuildUniqueMutexIter(
        param_init_net,
        net,
        iter_val=iter_val
    )

    if net.BlobIsDefined(learning_rate_blob):
        lr = net.GetBlobRef(learning_rate_blob)
    else:
        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iteration],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )

    if self._lr_multiplier is not None:
        # This multiplier is always passed through CopyFromCPUInput.
        global_multiplier = net.CopyFromCPUInput(
            self._lr_multiplier,
            self.make_unique_blob_name('lr_multiplier')
        )
        lr = net.Mul(
            [lr, global_multiplier],
            self.make_unique_blob_name('scaled_lr'),
            broadcast=1,
        )

    if self._local_lr_multiplier is not None:
        active_scope = scope.CurrentDeviceScope()
        needs_copy = (
            active_scope is not None
            and active_scope.device_type == caffe2_pb2.CUDA
            and not self._local_lr_multiplier_on_gpu
        )
        if needs_copy:
            local_multiplier = net.CopyFromCPUInput(
                self._local_lr_multiplier,
                self.make_unique_blob_name('local_lr_multiplier')
            )
        else:
            local_multiplier = self._local_lr_multiplier

        lr = net.Mul(
            [lr, local_multiplier],
            self.make_unique_blob_name('local_scaled_lr'),
            broadcast=1,
        )

    return lr, iteration
def get_lr_blob_name(self):
    """Returns an LR blob name.
    The name will be unique to the current device and optimizer instance.

    When no device scope is active (``scope.CurrentDeviceScope()`` returns
    None) the CPU name is returned, matching how the rest of this code
    treats a missing scope.
    """
    classname = self.__class__.__name__
    current_scope = scope.CurrentDeviceScope()
    # Guard against a missing scope: dereferencing None here used to raise
    # AttributeError whenever this was called outside of a DeviceScope.
    if (current_scope is not None
            and current_scope.device_type == caffe2_pb2.CUDA):
        return '%s_%d_lr_gpu%d' % (
            classname, self._instance_num, current_scope.cuda_gpu_id)
    return '%s_%d_lr_cpu' % (classname, self._instance_num)
def init_data_input_workers(
    net,
    input_blob_names,
    fetch_fun,
    batch_size,
    num_worker_threads=2,
    input_source_name="train",
    max_buffered_batches=800,
    init_fun=None,
    external_loggers=None,
):
    """
    Build the fetcher and enqueuer threads for one input source and register
    them with the module-level ``global_coordinator``.

    A ``DataInputCoordinator`` is created for the current device scope (CPU
    when no scope is active) and name scope, and given
    ``num_worker_threads`` fetcher threads plus one enqueuer thread. The
    threads are only constructed here, not started.

    Returns:
        The module-level ``global_coordinator``.
    """
    global global_coordinator

    device_option = scope.CurrentDeviceScope()
    if device_option is None:
        device_option = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU)

    # Create coordinator object
    name_scope = scope.CurrentNameScope()
    batch_queue = global_coordinator.get_queue(
        input_source_name, max_buffered_batches)
    coordinator = DataInputCoordinator(
        net,
        input_blob_names,
        batch_size,
        device_option,
        name_scope,
        input_source_name,
        batch_queue,
        init_fun=init_fun,
        external_loggers=external_loggers,
    )

    # Reserve every worker id first, then build the fetch worker threads.
    worker_ids = []
    for _ in range(num_worker_threads):
        worker_ids.append(global_coordinator.get_new_worker_id())

    workers = []
    for worker_id in worker_ids:
        workers.append(threading.Thread(
            target=fetcher,
            name="data_workers fetcher id {}".format(worker_id),
            args=[
                coordinator,
                worker_id,
                fetch_fun,
                batch_size,
                input_blob_names,
            ],
        ))

    enqueuer_name = "Enqueuer {} {}".format(
        input_source_name, scope.CurrentNameScope())
    workers.append(threading.Thread(
        target=enqueuer,
        name=enqueuer_name,
        args=[coordinator]))

    coordinator._workers = workers
    global_coordinator.add(coordinator)

    return global_coordinator
def get_lr_blob_name(self):
    """Return the LR blob name for the current device scope.

    The GPU-specific name (keyed by ``cuda_gpu_id``) is used inside a CUDA
    device scope; the CPU name is used otherwise, including when no device
    scope is active.
    """
    active_scope = scope.CurrentDeviceScope()
    on_gpu = (
        active_scope is not None
        and active_scope.device_type == caffe2_pb2.CUDA
    )
    if on_gpu:
        return self.get_gpu_lr_blob_name(active_scope.cuda_gpu_id)
    return self.get_cpu_lr_blob_name()
def make_unique_blob_name(self, base_str):
    """
    Return a blob name derived from ``base_str`` for the current device
    scope.

    The GPU-specific name (keyed by ``cuda_gpu_id``) is used inside a CUDA
    device scope; the CPU name is used otherwise, including when no device
    scope is active.
    """
    active_scope = scope.CurrentDeviceScope()
    on_gpu = (
        active_scope is not None
        and active_scope.device_type == caffe2_pb2.CUDA
    )
    if on_gpu:
        return self.get_gpu_blob_name(base_str, active_scope.cuda_gpu_id)
    return self.get_cpu_blob_name(base_str)
def Accuracy(self, blob_in, blob_out, **kwargs):
    """
    Add an Accuracy op that always runs with a CPU device option.

    Args:
        blob_in: the two input blobs; index 0 and 1 are copied to host
            (named ``pred_host`` / ``label_host`` below) when the effective
            device is not CPU.
        blob_out: output blob for the Accuracy op.
        **kwargs: forwarded to the Accuracy op. An optional
            ``device_option`` entry selects the device the inputs live on;
            it defaults to the current device scope.
    """
    # Take ``device_option`` out of kwargs: the final call sets its own
    # device_option explicitly, and forwarding the caller's one as well
    # raised "got multiple values for keyword argument".
    if 'device_option' in kwargs:
        dev = kwargs.pop('device_option')
    else:
        dev = scope.CurrentDeviceScope()

    # if device_option is CPU (or None, so assumed to be CPU), nothing needs
    # to be done
    if dev is None or dev.device_type == caffe2_pb2.CPU:
        blobs_in_dev = blob_in
    else:
        # Otherwise insert copy operators
        pred_host = self.net.CopyGPUToCPU(blob_in[0], blob_in[0] + "_host")
        label_host = self.net.CopyGPUToCPU(blob_in[1], blob_in[1] + "_host")
        blobs_in_dev = [pred_host, label_host]

    # Now use the Host version of the accuracy op
    self.net.Accuracy(
        blobs_in_dev,
        blob_out,
        device_option=core.DeviceOption(caffe2_pb2.CPU, 0),
        **kwargs
    )
def lrn(model, blob_in, blob_out, order="NCHW", use_cudnn=False, **kwargs):
    """LRN

    Adds an LRN op to ``model.net`` and returns its first output. With
    ``use_cudnn`` set and a non-CPU device in effect, the op is created with
    the 'CUDNN' engine and a single output; otherwise an extra
    "_<blob_out>_scale" output is requested and only the first output is
    returned. The device comes from ``kwargs['device_option']`` when given,
    else from the current device scope.
    """
    if 'device_option' in kwargs:
        dev = kwargs['device_option']
    else:
        dev = scope.CurrentDeviceScope()
    is_cpu = dev is None or dev.device_type == caffe2_pb2.CPU

    if use_cudnn and (not is_cpu):
        kwargs['engine'] = 'CUDNN'
        return model.net.LRN(blob_in, blob_out, order=order, **kwargs)

    outputs = model.net.LRN(
        blob_in,
        [blob_out, "_" + blob_out + "_scale"],
        order=order,
        **kwargs
    )
    return outputs[0]
def Accuracy(model, blob_in, blob_out, **kwargs):
    """
    Add an Accuracy op to ``model.net``.

    When the effective device is not CPU and ``top_k`` > 1 is requested, the
    two inputs are first copied to host and the Accuracy op is created with
    a CPU device option. In every other case a plain Accuracy op is added.

    Args:
        model: object exposing the target net as ``model.net``.
        blob_in: the two input blobs (index 0 and 1).
        blob_out: output blob for the Accuracy op.
        **kwargs: may contain ``device_option`` (defaults to the current
            device scope) and ``top_k``.
    """
    if 'device_option' in kwargs:
        dev = kwargs['device_option']
    else:
        dev = scope.CurrentDeviceScope()
    is_cpu = dev is None or dev.device_type == caffe2_pb2.CPU

    # We support top_k > 1 only on CPU
    if not is_cpu and 'top_k' in kwargs and kwargs['top_k'] > 1:
        pred_host = model.net.CopyGPUToCPU(blob_in[0], blob_in[0] + "_host")
        label_host = model.net.CopyGPUToCPU(blob_in[1], blob_in[1] + "_host")

        # The host op gets its own CPU device option, so the caller's
        # ``device_option`` must not be forwarded too: passing both raised
        # "got multiple values for keyword argument".
        host_kwargs = dict(kwargs)
        host_kwargs.pop('device_option', None)

        # Now use the Host version of the accuracy op
        model.net.Accuracy(
            [pred_host, label_host],
            blob_out,
            device_option=core.DeviceOption(caffe2_pb2.CPU, 0),
            **host_kwargs
        )
    else:
        # NOTE(review): kwargs (including any top_k) are not forwarded on
        # this path; left as-is to preserve behavior - confirm it is
        # intended.
        model.net.Accuracy(blob_in, blob_out)
def init_data_input_workers(
    net,
    input_blob_names,
    fetch_fun,
    batch_size,
    num_worker_threads=2,
    input_source_name="train",
    max_buffered_batches=100,
):
    """
    Build the fetcher and enqueuer threads for one input source and register
    them with the module-level ``global_coordinator``.

    A ``DataInputCoordinator`` is created for the current device scope (CPU
    when no scope is active) and name scope. Fetcher ids are taken from
    ``global_coordinator._fetcher_id_seq``, which is advanced by
    ``num_worker_threads``. The threads are only constructed here, not
    started.

    Returns:
        The module-level ``global_coordinator``.
    """
    global global_coordinator

    device_option = scope.CurrentDeviceScope()
    if device_option is None:
        device_option = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU)

    # Create coordinator object
    coordinator = DataInputCoordinator(
        net,
        input_blob_names,
        batch_size,
        device_option,
        scope.CurrentNameScope(),
        input_source_name,
        max_buffered_batches,
    )

    # Launch fetch worker threads
    workers = []
    for offset in range(num_worker_threads):
        fetcher_id = global_coordinator._fetcher_id_seq + offset
        workers.append(threading.Thread(
            target=fetcher,
            args=[
                coordinator,
                fetcher_id,
                fetch_fun,
                batch_size,
                input_blob_names,
            ],
        ))
    # Advance the shared sequence past the ids handed out above.
    global_coordinator._fetcher_id_seq += num_worker_threads

    workers.append(threading.Thread(target=enqueuer, args=[coordinator]))
    coordinator._workers = workers
    global_coordinator.add(coordinator)

    return global_coordinator
def _Workspace_feed_blob(ws, name, arr, device_option=None):
    """
    Feed ``arr`` into blob ``name`` of workspace ``ws``.

    Args:
        ws: workspace object providing ``create_blob``.
        name: blob name; normalized with ``StringifyBlobName``.
        arr: a TensorProto (converted to a numpy array first) or a numpy
            array.
        device_option: optional device option; defaults to the current
            device scope. It is only used for the float64-on-CUDA warning.

    Returns:
        Whatever ``ws.create_blob(name).feed(arr)`` returns.
    """
    if type(arr) is caffe2_pb2.TensorProto:
        arr = utils.Caffe2TensorToNumpyArray(arr)
    if type(arr) is np.ndarray and arr.dtype.kind in 'SU':
        # Plain NumPy strings are weird, let's use objects instead.
        # ``np.object`` was only an alias of the builtin and has been removed
        # from recent NumPy releases, so the builtin is used directly.
        arr = arr.astype(object)

    if device_option is None:
        device_option = scope.CurrentDeviceScope()

    if device_option and device_option.device_type == caffe2_pb2.CUDA:
        # The warning is advisory only; skip it for inputs without a dtype
        # instead of failing with AttributeError before the feed.
        if hasattr(arr, 'dtype') and arr.dtype == np.dtype('float64'):
            logger.warning(
                "CUDA operators do not support 64-bit doubles, "
                "please use arr.astype(np.float32) or np.int32 for ints."
                " Blob: {} type: {}".format(name, str(arr.dtype)))

    name = StringifyBlobName(name)
    # Formerly two unconditional debug print() calls to stdout; routed
    # through the module logger so library users can silence it.
    logger.debug("device option is: %s", device_option)
    # NOTE(review): device_option is resolved above but never handed to
    # feed(); confirm whether feed() should receive it.
    return ws.create_blob(name).feed(arr)
def _run(self, net, param_init_net, param_info):
    """
    Add the SGD update ops (optionally with LARS) for one parameter.

    Does nothing when ``self.base_learning_rate`` is zero. For dense
    gradients with ``self.lars`` set, a Lars op is added and its output is
    registered as a local learning-rate multiplier. A learning-rate blob is
    then built via ``self.build_lr``, a per-device "ONE" constant (and, with
    positive momentum, a momentum buffer) is created in ``param_init_net``,
    and the update op matching the gradient kind (``core.GradientSlice`` or
    dense) and momentum setting is added to ``net``.
    """
    weights = param_info.blob
    gradient = param_info.grad
    if self.base_learning_rate == 0:
        return
    assert self.base_learning_rate > 0, (
        "Expect positive base learning rate, got {}".format(
            self.base_learning_rate))

    self._clear_local_lr_multiplier()

    # TODO(zqq): support LARS for sparse parameters
    if self.lars is not None and not isinstance(gradient, core.GradientSlice):
        assert self.lars >= 0, (
            'Lars offset must be nonnegative, got {}'.format(self.lars))
        lars_multiplier = net.Lars(
            [weights, gradient],
            self.make_unique_blob_name(str(weights) + "_lars"),
            offset=self.lars)
        lars_scope = scope.CurrentDeviceScope()
        lars_on_gpu = (
            lars_scope is not None
            and lars_scope.device_type == caffe2_pb2.CUDA
        )
        self._add_local_lr_multiplier(
            lars_multiplier,
            is_gpu_blob=lars_on_gpu,
        )

    # We need negative sign for LR when used directly with WeightedSum
    # below.
    sign = -1 if self.momentum else 1
    learning_rate, _ = self.build_lr(
        net, param_init_net,
        base_learning_rate=self.base_learning_rate * sign,
        policy=self.policy,
        **(self.init_kwargs)
    )

    device = scope.CurrentDeviceScope()
    if device is None:
        device = core.DeviceOption(caffe2_pb2.CPU)

    # Each GPU/CPU must have its own ONE blob, thus modify the name
    # to include device information.
    one_name = "ONE_{}_{}{}".format(
        device.device_type, device.cuda_gpu_id, device.node_name)
    one = param_init_net.ConstantFill([], one_name, shape=[1], value=1.0)
    self._aux_params.shared.append(one)

    if self.momentum > 0:
        momentum_data = param_init_net.ConstantFill(
            weights, str(weights) + "_momentum", value=0.)
        self._aux_params.local.append(momentum_data)

    if not isinstance(gradient, core.GradientSlice):
        # Dense gradient.
        if self.momentum > 0.:
            net.MomentumSGDUpdate(
                [gradient, momentum_data, learning_rate, weights],
                [gradient, momentum_data, weights],
                momentum=self.momentum,
                nesterov=self.nesterov)
        else:
            net.WeightedSum(
                [weights, one, gradient, learning_rate], weights)
        return

    # Sparse gradient: run it through self.dedup before the update.
    gradient = self.dedup(net, self.sparse_dedup_aggregator, gradient)
    if self.momentum > 0.:
        net.SparseMomentumSGDUpdate(
            [gradient.values, momentum_data, learning_rate, weights,
             gradient.indices],
            [gradient.values, momentum_data, weights],
            momentum=self.momentum,
            nesterov=self.nesterov)
    else:
        net.ScatterWeightedSum(
            [weights, one, gradient.indices, gradient.values,
             learning_rate],
            weights)
def _run(self, net, param_init_net, param_info):
    """
    Add the Adagrad update ops (optionally with LARS) for one parameter.

    Does nothing when ``self.alpha`` is not positive. For dense gradients
    with ``self.lars`` set, a Lars op is added and its output is registered
    as a local learning-rate multiplier. After building the learning rate,
    a squared-sum accumulator is created in ``param_init_net`` (one value
    per row when ``self.rowWise`` is set, otherwise shaped like the
    parameter) and the sparse or dense Adagrad op is added to ``net``.

    Asserts that row-wise mode is only used with a ``core.GradientSlice``
    gradient and that ``self.decay`` is 1 for sparse gradients.
    """
    weights = param_info.blob
    gradient = param_info.grad

    if self.alpha <= 0:
        return

    self._clear_local_lr_multiplier()

    if self.lars is not None and not isinstance(gradient, core.GradientSlice):
        assert self.lars >= 0, (
            'Lars offset must be nonnegative, got {}'.format(self.lars))
        lars_multiplier = net.Lars(
            [weights, gradient],
            self.make_unique_blob_name(str(weights) + "_lars"),
            offset=self.lars)
        lars_scope = scope.CurrentDeviceScope()
        lars_on_gpu = (
            lars_scope is not None
            and lars_scope.device_type == caffe2_pb2.CUDA
        )
        self._add_local_lr_multiplier(
            lars_multiplier,
            is_gpu_blob=lars_on_gpu,
        )

    learning_rate, _ = self.build_lr(
        net, param_init_net,
        base_learning_rate=self.alpha,
        policy=self.policy,
        **(self.init_kwargs)
    )

    if not self.rowWise:
        squared_sum = param_init_net.ConstantFill(
            [weights],
            str(weights) + "_squared_sum",
            value=0.0
        )
    else:
        shapes, _ = workspace.InferShapesAndTypes([param_init_net])
        if str(weights) in shapes:
            squared_sum = param_init_net.ConstantFill(
                [],
                str(weights) + "_avg_squared_sum",
                shape=[shapes[str(weights)][0]],
                value=0.0
            )
        else:
            # Type/shape inference is not available for this param, fallback
            # on Shape/Slice logic
            shape_blob = param_init_net.Shape(
                weights, str(weights) + "_shape")
            num_rows = param_init_net.Slice(
                [shape_blob],
                str(shape_blob) + "_numrows",
                starts=[0], ends=[1]
            )
            squared_sum = param_init_net.ConstantFill(
                num_rows,
                str(weights) + "_avg_squared_sum",
                input_as_shape=1,
                value=0.0
            )

    self._aux_params.local.append(squared_sum)

    if self.rowWise:
        assert isinstance(gradient, core.GradientSlice),\
            'If SparseAdagrad with rowWise=True, gradient must be '\
            'a gradientslice. PLease ensure that rowWise is not enabled '\
            'for the dense Adagrad optimizer, as it is not supported.'

    if not isinstance(gradient, core.GradientSlice):
        # Dense path: optionally request the extra diagnostic outputs.
        outputs = [weights, squared_sum]
        if self.output_effective_lr_and_update:
            outputs.append(str(weights) + '_effective_lr')
            outputs.append(str(weights) + '_update')
        elif self.output_effective_lr:
            outputs.append(str(weights) + '_effective_lr')

        net.Adagrad(
            [weights, squared_sum, gradient, learning_rate],
            outputs,
            epsilon=self.epsilon,
            decay=float(self.decay),
            engine=self.engine
        )
        return

    assert self.decay == 1.,\
        'Decay is not implemented for SparseAdagrad and must be set to 1'
    gradient = self.dedup(net, self.sparse_dedup_aggregator, gradient)
    op_name = 'RowWiseSparseAdagrad' if self.rowWise else 'SparseAdagrad'
    # The op builder is looked up by name on the net, as in the original.
    net.__getattr__(op_name)(
        [weights, squared_sum, gradient.indices, gradient.values,
         learning_rate],
        [weights, squared_sum],
        epsilon=self.epsilon,
        engine=self.engine
    )