def ExtractFeatures(args):
    """Run a trained video model in test mode and dump selected blobs.

    The function builds the network (on the requested GPUs, or on CPU under
    a "gpu_0" name scope), loads the weights from ``args.load_model_path``,
    runs the net for a number of iterations, collects the blobs named in
    ``args.features`` from every device, writes them to disk (pickle or
    HDF5) and optionally logs a sanity-check metric.

    Args:
        args: parsed command-line namespace. Attributes read here include
            ``gpus``/``num_gpus``, the video-input options, the model
            options, ``db_type``, ``load_model_path``, ``features``,
            ``num_iterations``, ``output_path``, ``save_h5`` and
            ``sanity_check``.

    Returns:
        None. Results are written to ``args.output_path`` or, when that is
        empty, to ``features.pickle`` next to ``args.test_data``.
    """
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(",")]
        num_gpus = len(gpus)
    else:
        # Materialize so the ids log readably and can be iterated repeatedly.
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    my_arg_scope = {
        "order": "NCHW",
        "use_cudnn": True,
        "cudnn_exhaustive_search": True
    }

    model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope)

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=args.decode_type,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=args.input_type == 0,
        get_optical_flow=args.input_type == 1,
        get_video_id=args.get_video_id,
        get_start_frame=args.get_start_frame,
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + "_reader",
        input_data=args.test_data,
    )

    reader, num_examples = reader_utils.create_data_reader(
        model, **reader_args)

    def input_fn(model):
        model_helper.AddVideoInput(model, reader, **video_input_args)

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of if args.input_type == 1
                         else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the
            # naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == "pickle":
        model_loader.LoadModelFromPickleFile(model, args.load_model_path)
    elif args.db_type == "minidb":
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    # Blobs live under "gpu_<device id>/"; on CPU the net was built under
    # "gpu_0". Using the real ids (not 0..n-1) keeps `--gpus 2,3` working.
    device_ids = gpus if num_gpus > 0 else [0]

    def fetchActivations(model, outputs, num_iterations):
        """Run the net `num_iterations` times and gather `outputs` blobs."""
        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            for g in device_ids:
                for output_name in outputs:
                    blob_name = "gpu_{}/{}".format(g, output_name)
                    activations = workspace.FetchBlob(blob_name)
                    all_activations.setdefault(
                        output_name, []).append(activations)

            if counter % 20 == 0:
                log.info("{}/{} iterations".format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    # Drop empty names ("a,,b" or "") so the check below is meaningful;
    # str.split always returns at least one element.
    outputs = [
        name.strip() for name in args.features.split(",") if name.strip()
    ]
    assert len(outputs) > 0

    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for output_name in outputs:
        log.info("Read '{}' with shape {}".format(
            output_name, activations[output_name].shape))

    if args.output_path:
        output_path = args.output_path
    else:
        # os.path.join avoids writing to "/features.pickle" (filesystem
        # root) when test_data has no directory component.
        output_path = os.path.join(
            os.path.dirname(args.test_data), "features.pickle")

    log.info("Writing to {}".format(output_path))
    if args.save_h5:
        with h5py.File(output_path, "w") as handle:
            for name, activation in activations.items():
                handle.create_dataset(name, data=activation)
    else:
        with open(output_path, "wb") as handle:
            pickle.dump(activations, handle)

    # perform sanity check
    if args.sanity_check == 1:
        # check clip accuracy
        assert args.multi_label == 0
        clip_acc = 0
        softmax = activations["softmax"]
        label = activations["label"]
        for i in range(len(softmax)):
            # Last element of the ascending argsort is the top-1 class.
            top1 = np.argsort(softmax[i])[-1]
            if top1 == label[i]:
                clip_acc += 1
        # float() keeps this a true division on Python 2 as well.
        log.info("Sanity check --- clip accuracy: {}".format(
            float(clip_acc) / len(softmax)))
    elif args.sanity_check == 2:
        # check auc
        assert args.multi_label == 1
        prob = activations["prob"]
        label = activations["label"]
        mean_auc, mean_ap, mean_wap, _ = metric.mean_ap_metric(prob, label)
        log.info("Sanity check --- AUC: {}, mAP: {}, mWAP: {}".format(
            mean_auc, mean_ap, mean_wap))
def test_forward_optim_tree_harder(self, input_dim, output_dim, batch_size):
    """Check DAG inference memory optimization on a branching FC network.

    The network has a trunk (fc1..fc5), a side branch (fc3b..fc5b) merged
    by a sum, and two loss heads. The optimized proto must use fewer blobs
    than the original and must yield the same two loss values.
    """
    m = model_helper.ModelHelper()
    m.net.Proto().type = "dag"
    m.net.Proto().num_workers = 4
    m.net.AddExternalInput("label")
    m.net.AddExternalInput("data")

    def square_fc(src, name):
        # All layers after the first map output_dim -> output_dim.
        return brew.fc(m, src, name, dim_in=output_dim, dim_out=output_dim)

    with core.NameScope("name_x"):
        # Trunk
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = square_fc(fc1, "fc2")
        fc3 = square_fc(fc2, "fc3")
        fc4 = square_fc(fc3, "fc4")
        fc5 = square_fc(fc4, "fc5")

        # Branch
        fc3b = square_fc(fc2, "fc3b")
        fc4b = square_fc(fc3b, "fc4b")
        fc5b = square_fc(fc4b, "fc5b")

        # First loss head, fed by the merged trunk and branch.
        fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")
        relu1 = fc5sum.Relu([], "relu1")
        pred1 = relu1.Softmax([], "pred1")
        xent1 = pred1.LabelCrossEntropy(["label"], ["xent1"])
        xent1.AveragedLoss([], "loss1")

        # Second loss head, fed by the trunk only (in-place Relu on fc6).
        fc6 = square_fc(fc5, "fc6")
        relu2 = fc6.Relu([], fc6)
        pred2 = relu2.Softmax([], "pred2")
        xent2 = pred2.LabelCrossEntropy(["label"], ["xent2"])
        xent2.AveragedLoss([], "loss2")

    blobs_before = count_blobs(m.net.Proto())

    optim_proto = memonger.optimize_inference_for_dag(
        m.net, ["name_x/data"], "name_x/")
    blobs_after = count_blobs(optim_proto)

    # Extra test with when one of the parameters is also an input.
    # This caused a bug before.
    optim_proto_extra_input = memonger.optimize_inference_for_dag(
        m.net, ["name_x/data", "name_x/fc1_w"], "name_x/")
    blobs_after_extra_input = count_blobs(optim_proto_extra_input)
    self.assertEqual(blobs_after, blobs_after_extra_input)

    self.assertLess(blobs_after, blobs_before)

    # Test networks produce exactly same results
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    loss_names = ("name_x/loss1", "name_x/loss2")

    workspace.RunNetOnce(m.net)
    reference = [workspace.FetchBlob(name) for name in loss_names]

    workspace.RunNetOnce(optim_proto)
    optimized = [workspace.FetchBlob(name) for name in loss_names]

    for expected, actual in zip(reference, optimized):
        np.testing.assert_almost_equal(expected, actual)
def build_embedding_encoder(
    model,
    encoder_params,
    inputs,
    input_lengths,
    vocab_size,
    embeddings,
    embedding_size,
    use_attention,
    num_gpus=0,
    scope=None,
):
    """Add an embedding lookup followed by a single-layer RNN encoder.

    The lookup is a Gather over `embeddings`; when `num_gpus` is non-zero
    the Gather is placed on CPU and its result is copied to GPU. Exactly one
    entry is required in ``encoder_params['encoder_layer_configs']``.
    `vocab_size` is accepted but not used in this function.

    Returns:
        Tuple ``(encoder_outputs, weighted_encoder_outputs,
        final_encoder_hidden_state, final_encoder_cell_state,
        encoder_output_dim)``; ``weighted_encoder_outputs`` is always None.
    """
    name_prefix = scope or ''

    with core.NameScope(name_prefix):
        if num_gpus != 0:
            # Keep the lookup on CPU, then move the result to GPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )
        else:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )

    assert len(encoder_params['encoder_layer_configs']) == 1
    layer_config = encoder_params['encoder_layer_configs'][0]
    encoder_num_units = layer_config['num_units']

    zero_state_kwargs = dict(shape=[encoder_num_units], value=0.0)
    with core.NameScope(name_prefix):
        encoder_initial_cell_state = model.param_init_net.ConstantFill(
            [], ['encoder_initial_cell_state'], **zero_state_kwargs
        )
        encoder_initial_hidden_state = model.param_init_net.ConstantFill(
            [], 'encoder_initial_hidden_state', **zero_state_kwargs
        )

    # Choose corresponding rnn encoder function
    if not encoder_params['use_bidirectional_encoder']:
        rnn_encoder_func = rnn_unidirectional_encoder
        encoder_output_dim = encoder_num_units
    else:
        rnn_encoder_func = rnn_bidirectional_encoder
        encoder_output_dim = 2 * encoder_num_units

    # NOTE(review): the encoder function is called with a `scope` keyword,
    # so whichever implementation is selected must accept it.
    encoder_results = rnn_encoder_func(
        model,
        embedded_encoder_inputs,
        input_lengths,
        encoder_initial_hidden_state,
        encoder_initial_cell_state,
        embedding_size,
        encoder_num_units,
        use_attention,
        scope=scope,
    )
    (
        encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
    ) = encoder_results

    weighted_encoder_outputs = None

    return (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
        encoder_output_dim,
    )
def GpuNameScope(gpu_id):
    """Yield once while the name scope ``gpu_<gpu_id>`` is active.

    This is a generator; it is presumably wrapped by a context-manager
    decorator outside this view -- TODO confirm.
    """
    scope_name = 'gpu_{:d}'.format(gpu_id)
    with core.NameScope(scope_name):
        yield
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper.
                        It must not have any parameters yet.
      input_builder_fun:
                        Function that adds the input operators
                        Note: Remember to instantiate reader outside of this
                        function so all GPUs share same reader object.
                        Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient.
                        Signature: forward_pass_builder_fun(model)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying. Function is also passed the learning
                        rate scaling factor. You should multiply the learning
                        rate by the factor to maintain invariant of same
                        results with same total batch size, regardless of
                        number of gpus.
                        Signature: param_update_builder_fun(model, lr_scale)
                        Pass None to build a forward-only (test) net; the
                        function then returns right after the parameter map
                        is created.
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. To create rendezvous,
                        use <TBD>.
      net_type:         Network type
      broadcast_computed_params:
                        If True, computed params are broadcast before the
                        gradient all-reduce.
      optimize_gradient_memory:
                        If True, run the gradient memory optimization at the
                        end. It is only reached when a param update function
                        is given.

    Returns None; the model is modified in place.
    '''
    # Validate before touching the model so a bad argument leaves it intact.
    assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
    assert model_helper_obj.params == [], "Model needs to be empty"

    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params)
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    # Snapshot the keys with list(): on Python 3, dict.keys() is a live view,
    # so _param_names would otherwise also contain the gradient names added
    # to _device_grouped_blobs further down.
    model_helper_obj._param_names = list(
        model_helper_obj._device_grouped_blobs.keys())
    model_helper_obj._computed_param_names = list(
        computed_params_grouped.keys())

    if param_update_builder_fun is None:
        log.info("Parameter update function not defined --> only forward")
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [
        param_to_grad[p] for p in model_helper_obj.params
        if p in param_to_grad
    ]
    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = list(gradients_grouped.keys())

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)

    _AllReduceGradients(devices, model_helper_obj, rendezvous)

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # Scale so results depend on the total batch size, not the device count.
    lr_scale = 1.0 / (len(devices) * num_shards)
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj, lr_scale)

    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if rendezvous is not None:
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)

    if optimize_gradient_memory:
        _OptimizeGradientMemory(model_helper_obj, losses_by_gpu, devices)
def test_dataset_ops(self):
    """
    End-to-end walkthrough of the dataset operators: schema definition,
    writing, sequential reading, in-plan processing, slicing, random access,
    sort-and-shuffle and trimming.

    1. Defining the schema of our dataset.

    This example schema could represent, for example, a search query log.
    """
    schema = Struct(
        # fixed size vector, which will be stored as a matrix when batched
        ('dense', Scalar((np.float32, 3))),
        # could represent a feature map from feature ID to float value
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
        # could represent a multi-valued categorical feature map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # could represent a multi-valued, weighted categorical feature map
        ('id_score_pairs', Map(
            Scalar(np.int32),
            Map(Scalar(np.int64), Scalar(np.float32),
                keys_name='ids', values_name='scores'),
        )),
        # additional scalar information
        ('metadata', Struct(
            ('user_id', Scalar(np.int64)),
            ('user_embed', Scalar((np.float32, 2))),
            ('query', Scalar(str)),
        )),
    )

    """
    This is what the flattened fields for this schema look like, along
    with its type. Each one of these fields will be stored, read and
    written as a tensor.
    """
    expected_fields = [
        ('dense', (np.float32, 3)),
        ('floats:lengths', np.int32),
        ('floats:values:keys', np.int32),
        ('floats:values:values', np.float32),
        ('int_lists:lengths', np.int32),
        ('int_lists:values:keys', np.int32),
        ('int_lists:values:values:lengths', np.int32),
        ('int_lists:values:values:values', np.int64),
        ('id_score_pairs:lengths', np.int32),
        ('id_score_pairs:values:keys', np.int32),
        ('id_score_pairs:values:values:lengths', np.int32),
        ('id_score_pairs:values:values:values:ids', np.int64),
        ('id_score_pairs:values:values:values:scores', np.float32),
        ('metadata:user_id', np.int64),
        ('metadata:user_embed', (np.float32, 2)),
        ('metadata:query', str),
    ]
    # zip() stops at the shortest input, so check the count explicitly;
    # otherwise missing or extra schema fields would go unnoticed.
    self.assertEqual(len(expected_fields), len(schema.field_names()))
    zipped = zip(
        expected_fields, schema.field_names(), schema.field_types())
    for (ref_name, ref_type), name, dtype in zipped:
        self.assertEqual(ref_name, name)
        self.assertEqual(np.dtype(ref_type), dtype)

    """
    2. The contents of our dataset.

    Contents as defined below could represent, for example, a log of
    search queries along with dense, sparse features and metadata.
    The dataset below has 3 top-level entries.
    """
    contents_raw = [
        # dense
        [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
        # floats
        [1, 2, 3],  # len
        [11, 21, 22, 31, 32, 33],  # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        # int lists
        [2, 0, 1],  # len
        [11, 12, 31],  # key
        [2, 4, 3],  # value:len
        [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
        # id score pairs
        [1, 2, 2],  # len
        [11, 21, 22, 31, 32],  # key
        [1, 1, 2, 2, 3],  # value:len
        [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
        [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
        # metadata
        [123, 234, 456],  # user_id
        [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
        ['dog posts', 'friends who like to', 'posts about ca'],  # query
    ]
    # convert the above content to ndarrays, checking against the schema
    contents = from_blob_list(schema, contents_raw)

    """
    3. Creating and appending to the dataset.
    We first create an empty dataset with the given schema.
    Then, a Writer is used to append these entries to the dataset.
    """
    ds = dataset.Dataset(schema)
    net = core.Net('init')
    # NOTE(review): the extent of this name scope was reconstructed from
    # collapsed source text -- confirm against the original file.
    with core.NameScope('init'):
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
    workspace.RunNetOnce(net)

    """
    4. Iterating through the dataset contents.

    If we were to iterate through the top level entries of our dataset,
    this is what we should expect to see:
    """
    entries_raw = [
        (
            [[1.1, 1.2, 1.3]],  # dense
            [1], [11], [1.1],  # floats
            [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124],  # intlst
            [1], [11], [1], [111], [11.1],  # id score pairs
            [123], [[0.2, 0.8]], ['dog posts'],  # metadata
        ),
        (
            [[2.1, 2.2, 2.3]],  # dense
            [2], [21, 22], [2.1, 2.2],  # floats
            [0], [], [], [],  # int list
            [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2],
            [234], [[0.5, 0.5]], ['friends who like to'],  # metadata
        ),
        (
            [[3.1, 3.2, 3.3]],  # dense
            [3], [31, 32, 33], [3.1, 3.2, 3.3],  # floats
            [1], [31], [3], [311, 312, 313],  # int lst
            [2], [31, 32], [2, 3], [311, 312, 321, 322, 323],
            [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
            [456], [[0.7, 0.3]], ['posts about ca'],  # metadata
        ),
        # after the end of the dataset, we will keep getting empty vectors
        ([], ) * 16,
        ([], ) * 16,
    ]
    entries = [from_blob_list(schema, e) for e in entries_raw]

    """
    Let's go ahead and create the reading nets.
    We will run `read` net multiple times and assert that we are reading the
    entries the way we stated above.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')
    reader = ds.reader(read_init_net)
    should_continue, batch = reader.read_record(read_next_net)

    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for entry in entries:
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)

    """
    5. Reading/writing in a single plan

    If all of operations on the data are expressible as Caffe2 operators,
    we don't need to load the data to python, iterating through the dataset
    in a single Plan.

    Where we will process the dataset a little and store it in a second
    dataset. We can reuse the same Reader since it supports reset.
    """
    reset_net = core.Net('reset_net')
    reader.reset(reset_net)
    read_step, batch = reader.execution_step()

    """ We will add the line number * 1000 to the feature ids. """
    process_net = core.Net('process')
    line_no = Const(process_net, 0, dtype=np.int32)
    const_one = Const(process_net, 1000, dtype=np.int32)
    process_net.Add([line_no, const_one], [line_no])
    field = batch.floats.keys.get()
    process_net.Print(field, [])
    process_net.Add([field, line_no], field, broadcast=1, axis=0)

    """ Lets create a second dataset and append to it. """
    ds2 = dataset.Dataset(schema, name='dataset2')
    ds2.init_empty(reset_net)
    writer = ds2.writer(reset_net)
    writer.write_record(process_net, batch)
    # commit is not necessary for DatasetWriter but will add it for
    # generality of the example
    commit_net = core.Net('commit')
    writer.commit(commit_net)

    """ Time to create and run a plan which will do the processing """
    plan = core.Plan('process')
    plan.AddStep(core.execution_step('reset', reset_net))
    plan.AddStep(read_step.AddNet(process_net))
    plan.AddStep(core.execution_step('commit', commit_net))
    workspace.RunPlan(plan)

    """
    Now we should have dataset2 populated.
    """
    ds2_data = FetchRecord(ds2.content())
    field = ds2_data.floats.keys
    # Undo the per-line offset added by process_net before comparing.
    field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
    _assert_records_equal(contents, ds2_data)

    """
    6. Slicing a dataset

    You can create a new schema from pieces of another schema and reuse
    the same data.
    """
    subschema = Struct(('top_level', schema.int_lists.values))
    int_list_contents = contents.int_lists.values.field_names()
    self.assertEqual(len(subschema.field_names()), len(int_list_contents))

    """
    7. Random Access a dataset

    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net, True)

    for i in range(len(entries)):
        # The first three reads follow `idx`; past that, the trailing
        # empty entries are expected in their natural order.
        k = idx[i] if i in idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    workspace.RunNet(str(read_next_net))
    self.assertEqual(True, workspace.FetchBlob(should_stop))

    """
    8. Random Access a dataset with loop_over = true

    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net, True)

    # NOTE(review): the assertion is placed after the loop; the collapsed
    # source text does not show whether it was inside -- confirm.
    for _ in range(len(entries) * 3):
        workspace.RunNet(str(read_next_net))
    self.assertEqual(False, workspace.FetchBlob(should_stop))

    """
    9. Sort and shuffle a dataset

    This sort the dataset using the score of a certain column,
    and then shuffle within each chunk of size batch_size * shuffle_size
    before shuffling the chunks.

    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    reader = ds.random_reader(read_init_net)
    reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
    reader.computeoffset(read_init_net)

    should_continue, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net, True)

    expected_idx = np.array([2, 1, 0])
    for i in range(len(entries)):
        k = expected_idx[i] if i in expected_idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)

    """
    Trim a dataset
    """
    trim_net = core.Net('trim_ds')
    ds.trim(trim_net, multiple_of=2)
    workspace.RunNetOnce(trim_net)
    trimmed = FetchRecord(ds.content())
    EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
    actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
    self.assertEqual(EXPECTED_SIZES, actual_sizes)
def run_model(self, V, gpu_devices):
    """Build and run a data-parallel model with sparsely-updated params.

    Three CPU blobs are copied to every GPU and read through Gather; the
    first two are registered as parameters. After each of 10 iterations the
    gathered gradients on every device are checked to have as many rows as
    there are indices.

    Args:
        V: first dimension of the two parameter blobs.
        gpu_devices: list of GPU ids to parallelize over.
    """

    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        device_vecs = []
        for num, vec in enumerate(self.vecs):
            gpu_vec = model.param_init_net.CopyCPUToGPU(
                vec, 'gpuvec_{}'.format(num),
            )
            # The third blob is the FC input, not a trainable parameter.
            if num != 2:
                model.params.append(gpu_vec)
            device_vecs.append(gpu_vec)

        gathered = []
        for num, gpu_vec in enumerate(device_vecs):
            gathered.append(
                model.net.Gather(
                    [gpu_vec, 'indices'],
                    ['gpu_vec_gathered_{}'.format(num)],
                )
            )

        assert len(gathered) == 3

        weights, bias, fc_input = gathered
        fc = model.net.FC([fc_input, weights, bias], ['fc'])
        _, loss = model.net.SoftmaxWithLoss(
            [fc, 'label'],
            ['ce_loss', 'avg_loss'],
            only_loss=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.net.Print(loss, [], limit=10)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if isinstance(param_grad, core.GradientSlice):
                # Sparse gradient: update only the touched rows.
                sparse_inputs = [
                    param,
                    ONE,
                    param_grad.indices,
                    param_grad.values,
                    ONE,
                ]
                model.net.ScatterWeightedSum(sparse_inputs, param)
            else:
                model.WeightedSum([param, ONE, param_grad, LR], param)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )
    batch_size = 32
    batch_per_device = batch_size // len(gpu_devices)

    cpu_option = core.DeviceOption(caffe2_pb2.CPU)
    with core.NameScope("cpu"):
        with core.DeviceScope(cpu_option):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            # self.vecs consists of 3 big blobs on which we call Gather:
            # 1) FC weights, shape=(V, 16)
            # 2) FC bias, shape=(V)
            # 3) FC input, shape=(batch_per_device, 16)
            self.vecs = []
            for num in range(2):
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_{}".format(num), shape=[V, 16])
                )
            self.vecs.append(
                model.param_init_net.UniformFill(
                    [], "vec_2", shape=[batch_per_device, 16])
            )
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        for num, vec in enumerate(self.vecs[:-1]):
            model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

    # Each run has same input, independent of number of gpus
    for i in range(10):
        np.random.seed(2603)
        full_indices = np.random.permutation(V)[:batch_size].reshape(
            batch_size)
        full_labels = full_indices % batch_per_device

        for j, g in enumerate(gpu_devices):
            shard = slice(j * batch_per_device, (j + 1) * batch_per_device)
            indices = full_indices[shard].astype(np.int32)
            labels = full_labels[shard].astype(np.int32)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = [
                np.random.rand(V, 16).astype(np.float32),
                np.random.rand(V).astype(np.float32),
                np.random.rand(V, 16).astype(np.float32),
            ]
            for vec, orig_vec in zip(self.vecs, orig_vecs):
                workspace.FeedBlob(vec, orig_vec)
            for g in gpu_devices:
                gpu_option = core.DeviceOption(caffe2_pb2.CUDA, g)
                for num, orig_vec in enumerate(orig_vecs):
                    workspace.FeedBlob(
                        "gpu_{}/gpuvec_{}".format(g, num),
                        orig_vec,
                        device_option=gpu_option,
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

        idx = workspace.FetchBlob('gpu_0/indices')
        grad_slices = []
        for g in gpu_devices:
            for num in range(2):
                grad_slices.append(
                    workspace.FetchBlob(
                        'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num))
                )
        for grad_slice in grad_slices:
            assert len(idx) == len(grad_slice), (
                'Number of indices {} is not same as number of gradient '
                'slices {}. This might lead to illegal memory access'.format(
                    len(idx), len(grad_slice)))
def rnn_bidirectional_encoder(model, embedded_inputs, input_lengths,
                              initial_hidden_state, initial_cell_state,
                              embedding_size, encoder_num_units,
                              use_attention):
    """Bidirectional LSTM encoder: one forward LSTM, one on reversed input.

    The backward LSTM's outputs are reversed back into input order, and
    the outputs, final hidden states and final cell states of both
    directions are concatenated along axis 2.

    Returns:
        Tuple ``(outputs, final_hidden_state, final_cell_state)``.
    """
    # Gradients flow through the outputs when attention is used, otherwise
    # through the final hidden and cell states.
    grad_outputs = [0] if use_attention else [1, 3]

    def run_lstm(input_blob, scope_name):
        return recurrent.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope=scope_name,
            outputs_with_grads=grad_outputs,
        )

    def concat_directions(fw_blob, bw_blob, name):
        joined, _ = model.net.Concat(
            [fw_blob, bw_blob],
            [name, name + '_dim'],
            axis=2,
        )
        return joined

    # NOTE(review): the whole body is placed inside the reset name scope;
    # the collapsed source text does not show where the scope ends -- confirm.
    with core.NameScope('', reset=True):
        # Forward pass
        (
            outputs_fw,
            final_hidden_state_fw,
            _,
            final_cell_state_fw,
        ) = run_lstm(embedded_inputs, 'forward_encoder')

        # Backward pass
        reversed_embedded_inputs = model.net.ReversePackedSegs(
            [embedded_inputs, input_lengths],
            ['reversed_embedded_inputs'],
        )
        (
            outputs_bw,
            final_hidden_state_bw,
            _,
            final_cell_state_bw,
        ) = run_lstm(reversed_embedded_inputs, 'backward_encoder')

        outputs_bw = model.net.ReversePackedSegs(
            [outputs_bw, input_lengths],
            ['outputs_bw'],
        )

        # Concatenate forward and backward results
        outputs = concat_directions(outputs_fw, outputs_bw, 'outputs')
        final_hidden_state = concat_directions(
            final_hidden_state_fw, final_hidden_state_bw,
            'final_hidden_state',
        )
        final_cell_state = concat_directions(
            final_cell_state_fw, final_cell_state_bw,
            'final_cell_state',
        )

        return outputs, final_hidden_state, final_cell_state
def LoadModelFromPickleFile(
    model,
    pkl_file,
    use_gpu=True,
    root_gpu_id=0,
    bgr2rgb=False,
    inflating=True,
    collapsing=True,
    center_init=False,
):
    """Feed model parameters on one device from a pickled dict of arrays.

    For every parameter returned by ``model.GetAllParams()`` whose unscoped
    name is present in the pickle and whose scoped blob already exists in the
    workspace, the stored array is copied, inflated or collapsed to match the
    workspace blob and then fed as float32.

    Args:
        model: model helper providing ``GetAllParams()``.
        pkl_file: path to a pickle holding either the blob dict itself or a
            dict with the blob dict under the ``'blobs'`` key.
        use_gpu: feed under a CUDA device option when True, CPU otherwise.
        root_gpu_id: device id; also selects the ``gpu_<id>`` name scope.
        bgr2rgb: when True, ``conv1_w`` is passed through ``FlipBGR2RGB`` on a
            same-shape copy.
        inflating: allow ``InflateBlob`` for 5-d workspace blobs.
        collapsing: allow ``CollapseBlob`` for 4-d workspace blobs.
        center_init: forwarded to ``InflateBlob`` as ``0`` (True) or ``1``.
    """
    # A set makes the per-parameter membership test O(1).
    ws_blobs = set(workspace.Blobs())

    # Pickles are binary data: text mode ('r') fails on Python 3.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # weight files from trusted sources.
    with open(pkl_file, 'rb') as fopen:
        blobs = pickle.load(fopen)

    if 'blobs' in blobs:
        blobs = blobs['blobs']

    # OrderedDict keeps the parameters in model order while de-duplicating.
    unscoped_blob_names = OrderedDict()
    for blob in model.GetAllParams():
        unscoped_blob_names[unscope_name(str(blob))] = True

    device_opt = caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU

    with core.NameScope('gpu_{}'.format(root_gpu_id)):
        with core.DeviceScope(core.DeviceOption(device_opt, root_gpu_id)):
            for unscoped_blob_name in unscoped_blob_names:
                scoped_blob_name = scoped_name(unscoped_blob_name)
                if unscoped_blob_name not in blobs:
                    log.info('{} not found'.format(unscoped_blob_name))
                    continue
                if scoped_blob_name not in ws_blobs:
                    # Without a workspace blob there is no target shape; the
                    # original code could reach FeedBlob here with an
                    # undefined or stale array.
                    log.info('{} not in workspace, skipped'.format(
                        scoped_blob_name))
                    continue

                src_blob = blobs[unscoped_blob_name]
                ws_blob = workspace.FetchBlob(scoped_blob_name)
                target_shape = ws_blob.shape

                if target_shape == src_blob.shape:
                    log.info('copying {}'.format(unscoped_blob_name))
                    if bgr2rgb and unscoped_blob_name == 'conv1_w':
                        feeding_blob = FlipBGR2RGB(src_blob)
                    else:
                        feeding_blob = src_blob
                elif ws_blob.ndim == 5:
                    if src_blob.ndim == 2:
                        # inflate from FC to 1x1x1 conv
                        log.info('convolutionalize {}'.format(
                            unscoped_blob_name))
                        feeding_blob = np.reshape(
                            src_blob, src_blob.shape + (1, 1, 1))
                    else:
                        # may need to inflate
                        if not inflating:
                            log.info(
                                '{} found, but inflating is ignored'.format(
                                    unscoped_blob_name))
                            continue
                        feeding_blob = InflateBlob(
                            src_blob,
                            target_shape,
                            unscoped_blob_name,
                            (0 if center_init else 1),
                        )
                elif ws_blob.ndim == 4:
                    # may need to collapse
                    if not collapsing:
                        log.info(
                            '{} found, but collapsing is ignored'.format(
                                unscoped_blob_name))
                        continue
                    feeding_blob = CollapseBlob(
                        src_blob,
                        target_shape,
                        unscoped_blob_name,
                    )
                else:
                    # Shape mismatch with no conversion rule: skip instead of
                    # feeding whatever the previous iteration produced.
                    log.info(
                        '{} has mismatching shape {} (expected {}), '
                        'skipped'.format(
                            unscoped_blob_name, src_blob.shape, target_shape))
                    continue

                # either copy, inflate, or collapse blob
                workspace.FeedBlob(
                    scoped_blob_name,
                    feeding_blob.astype(np.float32, copy=False),
                )
def read(self, read_net):
    """Add a dequeue of one record from ``self.blobs_queue`` to `read_net`.

    Output blobs are named after the schema's field names plus one extra
    freshly generated name that receives the dequeue status.

    Returns:
        Tuple ``(status_blob, field_blobs)``: the last dequeued output and
        the list of all outputs before it.
    """
    scope_name = read_net.NextName(self.name)
    with core.NameScope(scope_name):
        status_name = read_net.NextName()
        output_names = self._schema.field_names() + [status_name]
        dequeued = read_net.SafeDequeueBlobs(self.blobs_queue, output_names)
        # The status blob was appended last, so split it off the end.
        status_blob = dequeued[-1]
        field_blobs = dequeued[:-1]
        return (status_blob, field_blobs)
def Test(args):
    """Evaluate a video model and log clip-level and video-level accuracy.

    Builds a test net on the requested GPUs (or on CPU under the ``gpu_0``
    name scope), loads weights from ``args.load_model_path`` according to
    ``args.db_type`` ('minidb' or 'pickle'), runs ``num_iter`` iterations and
    accumulates accuracy from the ``label`` and ``softmax`` blobs of every
    device. Finally logs accuracy and the FLOPs/params of the model.

    Args:
        args: parsed command-line namespace; the attributes read are the
            ones referenced in the body below.
    """
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        total_batch_size = args.batch_size * num_gpus
        log.info("Running on GPUs: {}".format(gpus))
        log.info("total_batch_size: {}".format(total_batch_size))
    else:
        total_batch_size = args.batch_size
        log.info("Running on CPU")
        log.info("total_batch_size: {}".format(total_batch_size))

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type == 1
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(args.use_cudnn == 1),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = model_builder.create_data_reader(
        test_model,
        name="test_reader",
        input_data=args.test_data,
    )

    if args.num_iter <= 0:
        num_iter = int(number_of_examples / total_batch_size)
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        # Use the model handed in by the caller (always test_model here)
        # rather than silently ignoring the argument.
        model_helper.AddVideoInput(
            model,
            test_reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=1,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=4,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus
        )
    else:
        test_model._device_type = caffe2_pb2.CPU
        test_model._devices = [0]
        device_opt = core.DeviceOption(test_model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the
            # naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                test_input_fn(test_model)
                create_model_ops(test_model, 1.0)

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
            data_parallel_model.FinalizeAfterCheckpoint(test_model)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0]
            )
            data_parallel_model.FinalizeAfterCheckpoint(test_model)
        else:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=False
            )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    # Blobs are scoped by the actual device id ("gpu_<id>/..."), so iterate
    # the resolved ids instead of range(args.num_gpus); the two differ as
    # soon as --gpus is given explicitly. CPU runs live under "gpu_0".
    device_ids = gpus if num_gpus > 0 else [0]

    # metric counters for classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0
    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)

        for g in device_ids:
            # get labels
            label = workspace.FetchBlob("gpu_{}".format(g) + '/label')
            # get predictions
            predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * args.clip_per_video

            for j in range(args.batch_size):
                first_clip = j * args.clip_per_video
                # get label for one video
                sample_label = label[first_clip]
                # get clip accuracy
                for k in range(args.clip_per_video):
                    c1, _ = metric.accuracy_metric(
                        predicts[first_clip + k, :],
                        label[first_clip + k])
                    clip_acc = clip_acc + c1
                # get all clip predictions for one video
                all_clips = predicts[
                    first_clip:(j + 1) * args.clip_per_video, :]
                # aggregate predictions into one
                video_pred = PredictionAggregation(
                    all_clips, args.aggregation)
                c1, ck = metric.accuracy_metric(
                    video_pred, sample_label, args.top_k)
                video_top1 = video_top1 + c1
                video_topk = video_topk + ck

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format(
                i,
                num_iter,
                clip_acc / clip_count,
                video_top1 / video_count,
                video_topk / video_count))

    if video_count > 0 and clip_count > 0:
        log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
            clip_acc / clip_count,
            video_top1 / video_count,
            args.top_k,
            video_topk / video_count
        ))
    else:
        # num_iter can be 0 when the dataset is smaller than one batch.
        log.warning("No examples were evaluated; accuracy is undefined")

    if num_gpus > 0:
        flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0])
    else:
        flops, params = model_helper.GetFlopsAndParams(test_model)
    log.info('FLOPs: {}, params: {}'.format(flops, params))
def _build_decoder(
    self,
    model,
    step_model,
    model_params,
    scope,
    previous_tokens,
    timestep,
    fake_seq_lengths,
):
    """Build the encoder and one decoder step used for beam search.

    Encoder operators, the tiled encoder outputs and the decoder's initial
    states are added through `model`; the per-step operators (embedding
    lookup, decoder cell, output projection, softmax and log) are added
    through `step_model`.

    Args:
        model: model helper receiving the encoder-side operators.
        step_model: model helper receiving the single-step decoder operators.
        model_params: dict; the keys read here are 'attention',
            'encoder_embedding_size', 'encoder_type',
            'decoder_embedding_size', 'decoder_layer_configs' and
            'decoder_softmax_size'.
        scope: name scope under which the blobs are created.
        previous_tokens: blob of token ids gathered from the decoder
            embeddings.
        timestep: forwarded to ``decoder_cell.apply``.
        fake_seq_lengths: forwarded to ``decoder_cell.apply`` as
            ``seq_lengths``.

    Returns:
        Tuple ``(state_configs, output_log_probs, attention_weights)``.
    """
    attention_type = model_params['attention']
    assert attention_type in ['none', 'regular']
    use_attention = (attention_type != 'none')

    with core.NameScope(scope):
        encoder_embeddings = seq2seq_util.build_embeddings(
            model=model,
            vocab_size=self.source_vocab_size,
            embedding_size=model_params['encoder_embedding_size'],
            name='encoder_embeddings',
            freeze_embeddings=False,
        )

    # The encoder builder receives the scope explicitly instead of relying on
    # an enclosing NameScope.
    (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
        encoder_output_dim,
    ) = seq2seq_util.build_embedding_encoder(
        model=model,
        encoder_params=model_params['encoder_type'],
        inputs=self.encoder_inputs,
        input_lengths=self.encoder_lengths,
        vocab_size=self.source_vocab_size,
        embeddings=encoder_embeddings,
        embedding_size=model_params['encoder_embedding_size'],
        use_attention=use_attention,
        num_gpus=0,
        scope=scope,
    )
    with core.NameScope(scope):
        # Replicate the encoder outputs beam_size times along axis 1.
        # [max_source_length, beam_size, encoder_output_dim]
        encoder_outputs = model.net.Tile(
            encoder_outputs,
            'encoder_outputs_tiled',
            tiles=self.beam_size,
            axis=1,
        )

        if weighted_encoder_outputs is not None:
            weighted_encoder_outputs = model.net.Tile(
                weighted_encoder_outputs,
                'weighted_encoder_outputs_tiled',
                tiles=self.beam_size,
                axis=1,
            )

        decoder_embeddings = seq2seq_util.build_embeddings(
            model=model,
            vocab_size=self.target_vocab_size,
            embedding_size=model_params['decoder_embedding_size'],
            name='decoder_embeddings',
            freeze_embeddings=False,
        )
        # Embedding lookup for the tokens chosen at the previous step.
        embedded_tokens_t_prev = step_model.net.Gather(
            [decoder_embeddings, previous_tokens],
            'embedded_tokens_t_prev',
        )

    # Only the first decoder layer's configuration is read here.
    decoder_num_units = (
        model_params['decoder_layer_configs'][0]['num_units']
    )

    with core.NameScope(scope):
        # Without attention the final encoder states are tiled per beam
        # before being used to build the decoder's initial states.
        if not use_attention and final_encoder_hidden_state is not None:
            final_encoder_hidden_state = model.net.Tile(
                final_encoder_hidden_state,
                'final_encoder_hidden_state_tiled',
                tiles=self.beam_size,
                axis=1,
            )
        if not use_attention and final_encoder_cell_state is not None:
            final_encoder_cell_state = model.net.Tile(
                final_encoder_cell_state,
                'final_encoder_cell_state_tiled',
                tiles=self.beam_size,
                axis=1,
            )
        initial_states = seq2seq_util.build_initial_rnn_decoder_states(
            model=model,
            encoder_num_units=encoder_output_dim,
            decoder_num_units=decoder_num_units,
            final_encoder_hidden_state=final_encoder_hidden_state,
            final_encoder_cell_state=final_encoder_cell_state,
            use_attention=use_attention,
        )

    if use_attention:
        decoder_cell = rnn_cell.LSTMWithAttentionCell(
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            decoder_input_dim=model_params['decoder_embedding_size'],
            decoder_state_dim=decoder_num_units,
            name=self.scope(scope, 'decoder'),
            attention_type=attention.AttentionType.Regular,
            weighted_encoder_outputs=weighted_encoder_outputs,
            forget_bias=0.0,
            lstm_memory_optimization=False,
            attention_memory_optimization=True,
        )
        # With attention the decoder output is states[0] concatenated with
        # states[2] (see the Concat below), hence the summed width.
        decoder_output_dim = decoder_num_units + encoder_output_dim
    else:
        decoder_cell = rnn_cell.LSTMCell(
            name=self.scope(scope, 'decoder'),
            input_size=model_params['decoder_embedding_size'],
            hidden_size=decoder_num_units,
            forget_bias=0.0,
            memory_optimization=False,
        )
        decoder_output_dim = decoder_num_units

    # One external input per cell state, named '<state>_prev'.
    states_prev = step_model.net.AddExternalInputs(*[
        s + '_prev' for s in decoder_cell.get_state_names()
    ])
    _, states = decoder_cell.apply(
        model=step_model,
        input_t=embedded_tokens_t_prev,
        seq_lengths=fake_seq_lengths,
        states=states_prev,
        timestep=timestep,
    )
    if use_attention:
        with core.NameScope(scope or ''):
            decoder_outputs, _ = step_model.net.Concat(
                [states[0], states[2]],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
    else:
        decoder_outputs = states[0]

    # Pair every initial state with its previous-step and current-step blobs.
    state_configs = [
        BeamSearchForwardOnly.StateConfig(
            initial_value=initial_state,
            state_prev_link=BeamSearchForwardOnly.LinkConfig(
                blob=state_prev,
                offset=0,
                window=1,
            ),
            state_link=BeamSearchForwardOnly.LinkConfig(
                blob=state,
                offset=1,
                window=1,
            ),
        )
        for initial_state, state_prev, state in zip(
            initial_states,
            states_prev,
            states,
        )
    ]

    with core.NameScope(scope):
        decoder_outputs_flattened, _ = step_model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_dim],
        )
        output_logits = seq2seq_util.output_projection(
            model=step_model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_dim,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=model_params['decoder_softmax_size'],
        )
        # [1, beam_size, target_vocab_size]
        output_probs = step_model.net.Softmax(
            output_logits,
            'output_probs',
        )
        output_log_probs = step_model.net.Log(
            output_probs,
            'output_log_probs',
        )
        if use_attention:
            attention_weights = decoder_cell.get_attention_weights()
        else:
            # No attention: build an all-zero stand-in derived from the
            # encoder inputs, transposed and tiled beam_size times on axis 0.
            attention_weights = step_model.net.ConstantFill(
                [self.encoder_inputs],
                'zero_attention_weights_tmp_1',
                value=0.0,
            )
            attention_weights = step_model.net.Transpose(
                attention_weights,
                'zero_attention_weights_tmp_2',
            )
            attention_weights = step_model.net.Tile(
                attention_weights,
                'zero_attention_weights_tmp',
                tiles=self.beam_size,
                axis=0,
            )

    return (
        state_configs,
        output_log_probs,
        attention_weights,
    )
def ExtractFeatures(args):
    """Run a video model in test mode and pickle the requested blobs.

    Builds the net on the requested GPUs (or on CPU under the ``gpu_0`` name
    scope), loads weights according to ``args.db_type`` ('minidb' or
    'pickle'), runs the net for the configured number of iterations, collects
    the blobs named in the comma-separated ``args.features`` from every
    device and writes them as a dict of concatenated arrays to
    ``args.output_path`` (default: ``features.pickle`` next to
    ``args.test_data``). If ``args.sanity_check == 1`` the clip accuracy is
    logged from the extracted ``softmax`` and ``label`` blobs.

    Args:
        args: parsed command-line namespace; the attributes read are the
            ones referenced in the body below.
    """
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    # Log the device choice exactly once; the former extra
    # "Running on GPUs" message was also printed for CPU runs.
    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope)

    reader, num_examples = model_builder.create_data_reader(
        model,
        name="reader",
        input_data=args.test_data,
    )

    def input_fn(model):
        model_helper.AddVideoInput(
            model,
            reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=args.decode_type,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=args.num_decode_threads,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of if args.input_type == 1
                         else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
            data_parallel_model.FinalizeAfterCheckpoint(model)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(
                model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0],
            )
            # The pickle loader only feeds the root GPU; finalize as in the
            # minidb path so the remaining devices receive the loaded
            # parameters instead of keeping their random initialization.
            data_parallel_model.FinalizeAfterCheckpoint(model)
        else:
            model_loader.LoadModelFromPickleFile(
                model,
                args.load_model_path,
                use_gpu=False,
            )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    def fetchActivations(model, outputs, num_iterations):
        """Run the net and gather the named blobs from every device."""
        # CPU runs keep their blobs under the "gpu_0" scope.
        device_ids = gpus if num_gpus > 0 else [0]
        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            for g in device_ids:
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(g) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    all_activations.setdefault(output_name, []).append(
                        activations)

            if counter % 20 == 0:
                log.info('{}/{} iterations'.format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    outputs = [name.strip() for name in args.features.split(',')]
    assert len(outputs) > 0

    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for output_name in outputs:
        log.info("Read '{}' with shape {}".format(
            output_name, activations[output_name].shape))

    if args.output_path:
        output_path = args.output_path
    else:
        # os.path.join keeps a bare file name relative to the working
        # directory; plain concatenation produced "/features.pickle".
        output_path = os.path.join(
            os.path.dirname(args.test_data), 'features.pickle')

    log.info('Writing to {}'.format(output_path))
    with open(output_path, 'wb') as handle:
        pickle.dump(activations, handle)

    # perform sanity check
    if args.sanity_check == 1:
        softmax = activations.get('softmax')
        label = activations.get('label')
        if softmax is None or label is None:
            log.warning(
                "Sanity check skipped: both 'softmax' and 'label' must be "
                "listed in args.features")
        elif len(softmax) == 0:
            log.warning("Sanity check skipped: no activations extracted")
        else:
            # check clip accuracy
            clip_acc = 0
            for i in range(len(softmax)):
                # Last index of the ascending argsort is the top prediction;
                # this avoids reversing the array onto itself in place.
                if np.argsort(softmax[i])[-1] == label[i]:
                    clip_acc += 1
            log.info('Sanity check --- clip accuracy: {}'.format(
                clip_acc / len(softmax)))
def Parallelize(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun=None,
    optimizer_builder_fun=None,
    post_sync_builder_fun=None,
    devices=None,
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
    max_concurrent_distributed_ops=16,
    cpu_device=False,
):
    '''
    Function to create a model that can run on many GPUs or CPUs.
      model_helper_obj: an object of ModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: Remember to instantiate reader outside of this
                         function so all devices share same reader object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient. Loss scale parameter
                        is passed, as you should scale the loss of your model
                        by 1.0 / the total number of devices.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying. This is called for each GPU separately.
                        Signature: param_update_builder_fun(model)
      optimizer_builder_fun:
                        Alternative to param_update_builder_fun, allows one
                        to add an optimizer for the whole model. Called only
                        once, without name or devicescope.
      post_sync_builder_fun:
                        Function applied after initial parameter sync has been
                        completed, such as keeping multi-precision parameters
                        in sync.
                        Signature: post_sync_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3]. Defaults to
                        all CUDA devices reported by the workspace.
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. To create rendezvous,
                        use <TBD>.
      net_type:         Network type
      broadcast_computed_params:
                        whether to broadcast computed params before the
                        gradient all-reduce is added
      optimize_gradient_memory: whether to apply 'memonger' to share blobs
                        in gradient computation to reduce memory footprint
      use_nccl:         forwarded to the broadcast / all-reduce helpers
      max_concurrent_distributed_ops:
                        upper bound forwarded to the all-reduce helper; capped
                        at num_workers - 1
      cpu_device        Use CPU instead of GPU

    Returns None. The model helper is modified in place.
    '''
    if devices is None:
        # NOTE: a stray trailing comma used to turn this default into a
        # 1-tuple holding the list, which broke every later use of `devices`.
        devices = list(range(0, workspace.NumCudaDevices()))

    if not cpu_device:
        for gpu in devices:
            if gpu >= workspace.NumCudaDevices():
                log.warning(
                    "** Only {} GPUs available, GPUs {} requested".format(
                        workspace.NumCudaDevices(), devices))
                break
        model_helper_obj._device_type = caffe2_pb2.CUDA
        model_helper_obj._device_prefix = "gpu"
        device_name = "GPU"
    else:
        model_helper_obj._device_type = caffe2_pb2.CPU
        model_helper_obj._device_prefix = "cpu"
        device_name = "CPU"

    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    num_workers = len(devices) * 4 + extra_workers
    max_concurrent_distributed_ops =\
        min(max_concurrent_distributed_ops, num_workers - 1)
    model_helper_obj.net.Proto().num_workers = num_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    has_parameter_updates = param_update_builder_fun is not None or \
        optimizer_builder_fun is not None
    assert not (
        param_update_builder_fun is not None and
        optimizer_builder_fun is not None
    ), 'Can only specify one of param_update_builder_fun, optimizer_builder_fun'

    for device in devices:
        device_opt = core.DeviceOption(model_helper_obj._device_type, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("{}_{}".format(model_helper_obj._device_prefix,
                                               device)):
                log.info("Model for {} : {}".format(device_name, device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if has_parameter_updates:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                    losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.GetComputedParams(''), [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        list(viewkeys(model_helper_obj._device_grouped_blobs))
    model_helper_obj._computed_param_names =\
        list(viewkeys(computed_params_grouped))

    if not has_parameter_updates:
        # Forward-only (test) net: nothing below applies.
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [
        param_to_grad[p] for p in model_helper_obj.params
        if p in param_to_grad
    ]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(
        model_helper_obj,
        devices,
        grads_ordered,
        non_datapar_grads
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = list(viewkeys(gradients_grouped))
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(
            devices, model_helper_obj, rendezvous, use_nccl)

    if len(model_helper_obj._grad_names) > 0:
        # Gradients in reverse order
        reverse_ordered_grads = _GetReverseOrderedGrads(model_helper_obj)
        assert (len(reverse_ordered_grads) > 0)
        _AllReduceBlobs(
            reverse_ordered_grads,
            devices,
            model_helper_obj,
            model_helper_obj.net,
            rendezvous,
            use_nccl,
            max_concurrent_distributed_ops,
        )
    else:
        log.info("NOTE: Param builder function did not create any parameters.")

    log.info("Post-iteration operators for updating params")

    if param_update_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(
                model_helper_obj._device_type, device)
            with core.DeviceScope(device_opt):
                with core.NameScope("{}_{}".format(
                        model_helper_obj._device_prefix, device)):
                    param_update_builder_fun(model_helper_obj)
    else:
        log.info("Calling optimizer builder function")
        optimizer_builder_fun(model_helper_obj)

    (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj)
    sync_blobs_grouped = _GroupByDevice(
        model_helper_obj,
        devices,
        sync_blobs,
        [],
    )
    model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    _SyncAllParams(
        devices,
        model_helper_obj,
        model_helper_obj.param_init_net,
        model_helper_obj.param_init_net,
        rendezvous,
        sync_names,
        max_concurrent_distributed_ops=1
    )

    # Handle any operations that need to be done after parameter sync
    # i.e. making sure multi-precision copies of parameters are up-to-date
    if post_sync_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(
                model_helper_obj._device_type, device)
            with core.DeviceScope(device_opt):
                with core.NameScope("{}_{}".format(
                        model_helper_obj._device_prefix, device)):
                    post_sync_builder_fun(model_helper_obj)

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices)

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
    ]
    model_helper_obj._data_parallel_model_nets = [model_helper_obj.net]
def initialize_gpu_0_from_weights_file(model, weights_file):
    """Feed the model's GPU 0 parameters from a pickled weights file.

    The pickle holds either the blob dict itself or a dict with the blobs
    under ``'blobs'`` (and optionally a YAML config under ``'cfg'``). For
    every entry of ``model.params`` found in the file, the array is fed into
    the ``gpu_0`` scoped blob as float32, going through ``inflate_weights``
    when the workspace blob already exists with a different shape. Momentum
    blobs are fed as well unless the file name contains ``'trainedCOCO'``.
    Blobs ending in ``_rm`` / ``_riv`` are always fed to GPU 0, and blobs not
    used by the model are kept on CPU under ``__preserve__/``.

    Args:
        model: model helper providing ``params``.
        weights_file: path of the pickled weights file.
    """
    logger.info('Loading from: {}'.format(weights_file))
    is_first_init = 'trainedCOCO' in weights_file
    # A set makes the per-parameter membership test O(1).
    ws_blobs = set(workspace.Blobs())

    # Pickles are binary data: text mode ('r') fails on Python 3.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # weight files from trusted sources.
    with open(weights_file, 'rb') as f:
        src_blobs = pickle.load(f)

    if 'cfg' in src_blobs:
        # NOTE(security): yaml.load may construct arbitrary Python objects.
        # Left unchanged because the stored config may rely on Python-specific
        # tags -- TODO confirm whether yaml.safe_load is sufficient.
        saved_cfg = yaml.load(src_blobs['cfg'])
        configure_bbox_reg_weights(model, saved_cfg)
    if 'blobs' in src_blobs:
        # Backwards compat--dictionary used to be only blobs, now they are
        # stored under the 'blobs' key
        src_blobs = src_blobs['blobs']

    # Initialize weights on GPU 0 only
    unscoped_param_names = OrderedDict()  # Print these out in model order
    for blob in model.params:
        unscoped_param_names[utils.blob.unscope_name(str(blob))] = True

    with core.NameScope('gpu_0'):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for unscoped_param_name in unscoped_param_names:
                if (unscoped_param_name.find(']_') >= 0 and
                        unscoped_param_name not in src_blobs):
                    # Special case for sharing initialization from a
                    # pretrained model:
                    # If a blob named '_[xyz]_foo' is in model.params and not
                    # in the initialization blob dictionary, then load source
                    # blob 'foo' into destination blob '_[xyz]_foo'
                    src_name = unscoped_param_name[
                        unscoped_param_name.find(']_') + 2:]
                else:
                    src_name = unscoped_param_name
                if src_name not in src_blobs:
                    logger.info('{:s} not found'.format(src_name))
                    continue

                dst_name = core.ScopedName(unscoped_param_name)
                has_momentum = src_name + '_momentum' in src_blobs
                has_momentum_str = ' [+ momentum]' if has_momentum else ''
                logger.info(
                    '{:s}{:} loaded from weights file into {:s}: {}'.format(
                        src_name, has_momentum_str, dst_name,
                        src_blobs[src_name].shape))

                pretrained_w = src_blobs[src_name]
                if dst_name in ws_blobs:
                    # If the blob is already in the workspace, make sure that
                    # it matches the shape of the loaded blob
                    ws_blob = workspace.FetchBlob(dst_name)
                    if ws_blob.shape != src_blobs[src_name].shape:
                        pretrained_w = inflate_weights(
                            pretrained_w, ws_blob, src_name, src_blobs)

                workspace.FeedBlob(
                    dst_name,
                    pretrained_w.astype(np.float32, copy=False))

                if has_momentum and not is_first_init:
                    # when feeding momentum, we're probably resuming from
                    # previous checkpoint. So all the inflated stuff won't be
                    # needed in that case
                    workspace.FeedBlob(
                        dst_name + '_momentum',
                        src_blobs[src_name + '_momentum'].astype(
                            np.float32, copy=False))

    # Add _rm/_riv BN mean/var params, in case the pre-trained model contains
    # it. Needed to test the scratch trained models.
    for src_name in src_blobs:
        if src_name.endswith(('_rm', '_riv')):
            with core.NameScope('gpu_0'):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                    dst_name = core.ScopedName(src_name)
                    workspace.FeedBlob(dst_name, src_blobs[src_name])
                    logger.info('Loaded BN param {}'.format(src_name))

    # We preserve blobs that are in the weights file but not used by the
    # current model. We load these into CPU memory under the '__preserve__/'
    # namescope. These blobs will be stored when saving a model to a weights
    # file. This feature allows for alternating optimization of Faster R-CNN
    # in which blobs unused by one step can still be preserved forward and
    # used to initialize another step.
    for src_name in src_blobs:
        if (src_name not in unscoped_param_names and
                not src_name.endswith('_momentum') and
                src_blobs[src_name] is not None):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                workspace.FeedBlob(
                    '__preserve__/{:s}'.format(src_name),
                    src_blobs[src_name])
                logger.info(
                    '{:s} preserved in workspace (unused)'.format(src_name))
def test_resnet_shared_grads(self, with_shapes, gc, dc):
    """Check that sharing gradient blobs of a ResNet-50 keeps results intact.

    The net is optimized with ``memonger.share_grad_blobs`` (optionally with
    inferred blob shapes), must end up with fewer blobs, and must produce the
    same ``gpu_0/last_out_L1000`` output and ``conv1_w`` gradient as the
    unoptimized net on identical random input.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput("gpu_0/data")
        label = model.net.AddExternalInput("gpu_0/label")
        (_softmax, loss) = resnet.create_resnet50(
            model,
            data,
            num_input_channels=3,
            num_labels=1000,
            label=label,
            is_test=False,
        )

    param_to_grad = model.AddGradientOperators([loss])
    # The first conv's weight gradient is excluded from sharing so it can be
    # compared between the two runs.
    conv1_grad = param_to_grad["gpu_0/conv1_w"]

    input_shapes = {'gpu_0/data': [4, 3, 227, 227], 'gpu_0/label': [4]}
    shapes, _types = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        input_shapes,
    )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],
        set(model.param_to_grad.values()),
        "gpu_0/",
        share_activations=True,
        dont_share_blobs={str(conv1_grad)},
        blob_shapes=shapes if with_shapes else None,
    )
    count_after = count_blobs(optim_proto)
    self.assertTrue(count_after < count_before)

    # Run both nets on the same random batch and compare the loss output
    # and the conv1_w gradient.
    workspace.RunNetOnce(model.param_init_net)
    data = np.random.rand(4, 3, 227, 227).astype(np.float32)
    label = (np.random.rand(4) * 1000).astype(np.int32)
    workspace.FeedBlob("gpu_0/data", data)
    workspace.FeedBlob("gpu_0/label", label)

    workspace.RunNetOnce(model.net)
    net_proto = model.net.Proto()
    net_proto.type = 'dag'
    net_proto.num_workers = 4
    loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    conv1_w_grad = workspace.FetchBlob(conv1_grad)

    # Overwrite the gradient so a stale value cannot make the check pass.
    workspace.FeedBlob(conv1_grad, np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    optim_conv1_w_grad = workspace.FetchBlob(conv1_grad)

    print("before: {} after: {}".format(count_before, count_after))
    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, use_divide_mod=False, divisor=None,
             name='sparse_feature_hash', **kwargs):
    """Set up the sparse-feature hashing layer and its output schema.

    ``input_record`` must match either ``IdList`` or ``IdScoreList``. The
    output schema is a fresh record of the same kind whose id field carries
    metadata with ``categorical_limit`` set to the effective modulo
    (``modulo`` if truthy, otherwise ``extract_hash_size`` of the input
    metadata).

    ``use_hashing`` and ``use_divide_mod`` are mutually exclusive; with
    ``use_divide_mod`` a non-trainable ``divisor`` parameter is created.
    """
    super(SparseFeatureHash, self).__init__(
        model, name, input_record, **kwargs)

    assert use_hashing + use_divide_mod < 2, "use_hashing and use_divide_mod cannot be set true at the same time."

    if use_divide_mod:
        assert divisor >= 1, 'Unexpected divisor: {}'.format(divisor)
        divisor_initializer = (
            'GivenTensorInt64Fill',
            {'values': np.array([divisor])},
        )
        self.divisor = self.create_param(
            param_name='divisor',
            shape=[1],
            initializer=divisor_initializer,
            optimizer=model.NoOptim)

    self.seed = seed
    self.use_hashing = use_hashing
    self.use_divide_mod = use_divide_mod

    def hashed_metadata(source_metadata):
        # Resolve the modulo first, then describe the hashed ids with it.
        self.modulo = modulo or self.extract_hash_size(source_metadata)
        return schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=source_metadata.feature_specs,
            expected_value=source_metadata.expected_value)

    def new_output_record(record_type):
        with core.NameScope(name):
            return schema.NewRecord(model.net, record_type)

    if schema.equal_schemas(input_record, IdList):
        metadata = hashed_metadata(input_record.items.metadata)
        self.output_schema = new_output_record(IdList)
        self.output_schema.items.set_metadata(metadata)
    elif schema.equal_schemas(input_record, IdScoreList):
        metadata = hashed_metadata(input_record.keys.metadata)
        self.output_schema = new_output_record(IdScoreList)
        self.output_schema.keys.set_metadata(metadata)
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)

    lengths_metadata = input_record.lengths.metadata
    if lengths_metadata:
        self.output_schema.lengths.set_metadata(lengths_metadata)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def test_forward_optim_tree_daggy(self, input_dim, output_dim, batch_size):
    """Inference memonger on a branching DAG net must preserve both losses.

    Builds a chain of FC layers with a side branch that is summed back in,
    attaches two loss heads, optimizes the net with
    ``memonger.optimize_inference_for_dag`` and checks graph equality, a
    reduced blob count, and equal loss values before and after.
    """
    net_model = model_helper.ModelHelper()
    net_model.Proto().type = "dag"
    net_model.Proto().num_workers = 4

    def dense(source, blob_name, dim_in=output_dim):
        # All layers share the output width; only the first differs in input.
        return brew.fc(
            net_model, source, blob_name, dim_in=dim_in, dim_out=output_dim)

    with core.NameScope("name_x"):
        fc1 = dense("data", "fc1", dim_in=input_dim)
        fc2 = dense(fc1, "fc2")

        fc3 = dense(fc2, "fc3")
        fc4 = dense(fc3, "fc4")
        fc5 = dense(fc4, "fc5")

        # Branch
        fc3b = dense(fc2, "fc3b")
        fc4b = dense(fc3b, "fc4b")
        fc5b = dense(fc4b, "fc5b")

        fc5sum = brew.sum(net_model, [fc5, fc5b], "fc5sum")

        relu1 = fc5.Relu([], fc5sum)
        pred1 = relu1.Softmax([], "pred1")
        xent1 = pred1.LabelCrossEntropy(["label"], ["xent1"])
        xent1.AveragedLoss([], "loss1")

        fc6 = dense(fc5, "fc6")
        relu2 = fc6.Relu([], fc6)
        pred2 = relu2.Softmax([], "pred2")
        xent2 = pred2.LabelCrossEntropy(["label"], ["xent2"])
        xent2.AveragedLoss([], "loss2")

    blobs_before = count_blobs(net_model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        net_model.net, ["name_x/data"], "name_x")
    graphs_match = memonger.verify_graph_equality(
        net_model.net.Proto(), optim_proto)
    self.assertTrue(graphs_match)
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Test networks produce exactly same results
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
    workspace.RunNetOnce(net_model.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    workspace.RunNetOnce(net_model.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
def run_model(self, V, gpu_devices, cpu_indices):
    """Build, train for 10 iterations and return the learned blobs.

    V: number of rows of the ``vecs`` embedding blob (shape ``[V, 16]``).
    gpu_devices: device ids handed to ``Parallelize_GPU`` as ``devices``.
    cpu_indices: when true the ``Gather`` runs on CPU against ``self.vecs``
        and its result is copied to the GPU; otherwise ``vecs`` is copied
        to each GPU (``gpuvecs``) and gathered there.

    Returns a two-element list: the fetched vecs blob (``self.vecs`` or
    ``gpu_0/gpuvecs``) and the fetched ``gpu_0/fc_w`` blob.
    """
    def input_builder_fun(model):
        # Inputs are fed by hand below, so no input operators are added.
        return None

    def model_build_fun(model, loss_scale):
        if cpu_indices:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather([self.vecs, 'indices'],
                                                'gathered_cpu')
            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
        else:
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs", )
            # Registered as a param so it gets a gradient and an update.
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}),
                      ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0, )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            # Dense gradients get a plain weighted-sum update; sparse
            # (GradientSlice) gradients go through SparseMomentumSGDUpdate.
            if not isinstance(param_grad, core.GradientSlice):
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                param_momentum = model.param_init_net.ConstantFill(
                    [param], param + '_momentum', value=0.0, )
                model.net.SparseMomentumSGDUpdate(
                    [ param_grad.values, param_momentum, LR, param,
                      param_grad.indices, ],
                    [param_grad.values, param_momentum, param],
                    momentum=0.1, nesterov=0, )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC", name="sparse_test{}".format(gpu_devices), )

    # CPU-side blobs: iteration counter, learning rate and the vecs table.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER], "LR", base_lr=(-0.1), policy="fixed", )
            self.vecs = model.param_init_net.UniformFill([], "vecs",
                                                         shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0, )

    data_parallel_model.Parallelize_GPU(
        model, input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices, )

    # Update the vecs
    if cpu_indices:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum([ param, self.ONE_CPU,
                                               param_grad.indices,
                                               param_grad.values,
                                               self.LR ], self.vecs)
    else:
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

    # Fixed seed: the random draws below must be identical across calls.
    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        # A permutation guarantees the sampled indices are all distinct.
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16)
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        # Give every device its own contiguous slice of the full batch.
        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en, :].astype(np.int32)
            labels = full_labels[st:en].astype(np.float32)

            device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            if not cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(self.vecs, orig_vecs)
            if not cpu_indices:
                for g in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(g), orig_vecs,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g), )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2:
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                assert n == nu, "We cannot have duplicate indices"

    # Sanity check to see the vecs were updated
    self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
    return [
        workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
        workspace.FetchBlob("gpu_0/fc_w") ]
def test_cpu2gpu_gpu2cpu_gradients(self):
    """Gradients must flow correctly across CPU->GPU->CPU copies.

    The net is an FC on CPU, copied to GPU, another FC on GPU, copied back
    to CPU and fed into a softmax loss. After one step with a fixed rate of
    -2.0, every parameter must equal ``initial - 2.0 * gradient``.
    """
    model = model_helper.ModelHelper(name="copy_test")
    batch = 32
    cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
    gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)

    with core.NameScope("cpu"), core.DeviceScope(cpu_opt):
        x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8)

    with core.NameScope("gpu_0"), core.DeviceScope(gpu_opt):
        x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu")
        pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4)
        pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu")

    with core.DeviceScope(cpu_opt), core.NameScope("cpu"):
        (softmax, loss) = model.SoftmaxWithLoss(
            [pred_cpu, "label"],
            ["softmax", "loss"],
        )

    gradient_map = model.AddGradientOperators([loss])

    # Add param updates (for cpu and gpu)
    init_net = model.param_init_net

    def add_updates_for_current_scope():
        # Must be called inside the name/device scope being updated: the
        # constants and the parameter list both depend on the active scope.
        one = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
        rate = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
        for param in model.GetParams():
            model.WeightedSum(
                [param, one, gradient_map[param], rate],
                param,
            )

    with core.DeviceScope(cpu_opt), core.NameScope("cpu"):
        add_updates_for_current_scope()

    with core.NameScope("gpu_0"), core.DeviceScope(gpu_opt):
        add_updates_for_current_scope()

    with core.DeviceScope(cpu_opt):
        workspace.FeedBlob(
            'cpu/data',
            np.random.rand(batch, 16).astype(np.float32),
        )
        workspace.FeedBlob(
            'cpu/label',
            np.random.randint(4, size=batch).astype(np.int32),
        )

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    def fetch_all_params():
        return {p: workspace.FetchBlob(p) for p in model.GetParams()}

    initial_params = fetch_all_params()
    workspace.RunNet(model.net.Proto().name)
    updated_params = fetch_all_params()

    for p in model.GetParams():
        grad_blob = gradient_map[p]
        expected = initial_params[p] - 2.0 * workspace.FetchBlob(grad_blob)
        actual = updated_params[p]
        self.assertTrue(
            np.array_equal(expected, actual),
            "Mismatch: {}: {}, {}".format(p, expected, actual),
        )
def run(self, index, inputFeatures, accumulate=True, image_path=None):
    """
    Run the detection head on one set of (predicted) features.

    index - index of the dataset entry
    inputFeatures - features input to the head: a mapping from feature name
        to a 3-dimensional (unbatched) tensor
    accumulate - whether to save to predictions in self.all_... members
    image_path - path to the annotated image, to which the predictions
        correspond

    Returns the tuple (cls_boxes_i, cls_segms_i) produced by
    im_detect_all_given_features.
    """
    timers = self.timers

    # Format the inputs to the mask rcnn head: add a leading batch axis.
    # `.items()` works on both Python 2 and Python 3 mappings, whereas
    # `.iteritems()` does not exist on Python 3.
    features = {}
    for k, v in inputFeatures.items():
        assert v.dim() == 3, 'Batch mode not allowed'
        features[k] = np.expand_dims(v.data.cpu().numpy(), axis=0)

    gpu_dev = caffe2_core.DeviceOption(caffe2_pb2.CUDA, self.gpu_id)
    name_scope = 'gpu_{}'.format(self.gpu_id)

    # Clean the workspace to make sure that nothing comes from the
    # possible forwarding of target features, depending on the use of this
    # module. Parameters and their momentum blobs are kept; a set keeps the
    # per-blob membership test constant time.
    parameters = set(str(s) for s in self.model.params)
    parameters.update(
        str(s) + '_momentum' for s in self.model.TrainableParams())
    for b in workspace.Blobs():
        if b not in parameters:
            workspace.FeedBlob(b, np.array([]))

    # Produce the top level of the pyramid of features
    with caffe2_core.NameScope(name_scope):
        with caffe2_core.DeviceScope(gpu_dev):
            workspace.FeedBlob(
                caffe2_core.ScopedName("predicted_fpn_res5_2_sum"),
                features['fpn_res5_2_sum'])
            workspace.RunOperatorOnce(self.subsampler)
            features[
                u'fpn_res5_2_sum_subsampled_2x'] = workspace.FetchBlob(
                    caffe2_core.ScopedName(
                        "predicted_fpn_res5_2_sum_subsampled_2x"))

    # Forward the rest of the features in the head of the model
    im_info = np.array([[1024., 2048., 1.]], dtype=np.float32)
    im_scales = np.array([1.])
    im_shape = (1024, 2048, 3)
    with caffe2_core.NameScope(name_scope):
        with caffe2_core.DeviceScope(gpu_dev):
            cls_boxes_i, cls_segms_i = im_detect_all_given_features(
                self.model, self.subsampler, features, im_info, im_scales,
                im_shape, timers)

    # If required, store the results in the class's members
    if accumulate:
        extend_results(index, self.all_boxes_ann_frame, cls_boxes_i)
        if cls_segms_i is not None:
            extend_results(index, self.all_segms_ann_frame, cls_segms_i)
        if image_path is not None:
            self.id_sequences.append(image_path)

    # Periodic progress report.
    if index % 10 == 0:
        ave_total_time = np.sum([t.average_time for t in timers.values()])
        det_time = (timers['im_detect_bbox'].average_time +
                    timers['im_detect_mask'].average_time)
        misc_time = (timers['misc_bbox'].average_time +
                     timers['misc_mask'].average_time)
        print(('im_detect: '
               '{:d}/{:d} {:.3f}s + {:.3f}s => avg total time: {:.3f}s'
               ).format(index, self.num_images, det_time, misc_time,
                        ave_total_time))

    return cls_boxes_i, cls_segms_i
def _LSTM( cell_class, model, input_blob, seq_lengths, initial_states, dim_in, dim_out, scope, outputs_with_grads=(0, ), return_params=False, memory_optimization=False, forget_bias=0.0, forward_only=False, drop_states=False, return_last_layer_only=True, ): ''' Adds a standard LSTM recurrent network operator to a model. cell_class: LSTMCell or compatible subclass model: CNNModelHelper object new operators would be added to input_blob: the input sequence in a format T x N x D where T is sequence size, N - batch size and D - input dimension seq_lengths: blob containing sequence lengths which would be passed to LSTMUnit operator initial_states: a list of (2 * num_layers) blobs representing the initial hidden and cell states of each layer. If this argument is None, these states will be added to the model as network parameters. dim_in: input dimension dim_out: number of units per LSTM layer (use int for single-layer LSTM, list of ints for multi-layer) outputs_with_grads : position indices of output blobs for LAST LAYER which will receive external error gradient during backpropagation. These outputs are: (h_all, h_last, c_all, c_last) return_params: if True, will return a dictionary of parameters of the LSTM memory_optimization: if enabled, the LSTM step is recomputed on backward step so that we don't need to store forward activations for each timestep. Saves memory with cost of computation. 
forget_bias: forget gate bias (default 0.0) forward_only: whether to create a backward pass drop_states: drop invalid states, passed through to LSTMUnit operator return_last_layer_only: only return outputs from final layer (so that length of results does depend on number of layers) ''' if type(dim_out) is not list and type(dim_out) is not tuple: dim_out = [dim_out] num_layers = len(dim_out) cells = [] for i in range(num_layers): name = '{}/layer_{}'.format(scope, i) if num_layers > 1 else scope cell = cell_class( input_size=(dim_in if i == 0 else dim_out[i - 1]), hidden_size=dim_out[i], forget_bias=forget_bias, memory_optimization=memory_optimization, name=name, forward_only=forward_only, drop_states=drop_states, ) cells.append(cell) if num_layers > 1: multicell = MultiRNNCell( cells, name=scope, forward_only=forward_only, ) else: multicell = cells[0] if initial_states is None: initial_states = [] for i in range(num_layers): with core.NameScope(scope): suffix = '_{}'.format(i) if num_layers > 1 else '' initial_hidden = model.param_init_net.ConstantFill( [], 'initial_hidden_state' + suffix, shape=[dim_out[i]], value=0.0, ) initial_cell = model.param_init_net.ConstantFill( [], 'initial_cell_state' + suffix, shape=[dim_out[i]], value=0.0, ) initial_states.extend([initial_hidden, initial_cell]) model.params.extend([initial_hidden, initial_cell]) assert len(initial_states) == 2 * num_layers, \ "Incorrect initial_states, was expecting 2 * num_layers elements" \ + " but had only {}".format(len(initial_states)) # outputs_with_grads argument indexes into final layer outputs_with_grads = [4 * (num_layers - 1) + i for i in outputs_with_grads] _, result = multicell.apply_over_sequence( model=model, inputs=input_blob, seq_lengths=seq_lengths, initial_states=initial_states, outputs_with_grads=outputs_with_grads, ) if return_last_layer_only: result = result[4 * (num_layers - 1):] if return_params: result = list(result) + [{ 'input': cell.get_input_params(), 'recurrent': 
cell.get_recurrent_params(), }] return tuple(result)
def test_simple_model(self):
    """Build a small conv net and convert it to a TensorBoard graph def.

    The model is constructed under the ``conv1`` and ``classifier`` name
    scopes, switched to MKL, given gradient operators and converted with
    ``tb.model_to_graph_def``. The sorted node lists of the generated graph
    and of the stored expectation file are then computed.
    """
    model = model_helper.ModelHelper(name="mnist")
    # how come those inputs don't break the forward pass =.=a
    workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin yields the same dtype on every version.
    workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))

    with core.NameScope("conv1"):
        conv1 = brew.conv(model, "data", 'conv1', dim_in=1, dim_out=20, kernel=5)
        # Image size: 24 x 24 -> 12 x 12
        pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
        # Image size: 12 x 12 -> 8 x 8
        conv2 = brew.conv(model, pool1, 'conv2', dim_in=20, dim_out=100, kernel=5)
        # Image size: 8 x 8 -> 4 x 4
        pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
    with core.NameScope("classifier"):
        # 100 * 4 * 4 stands for dim_out from previous layer multiplied by the image size
        fc3 = brew.fc(model, pool2, 'fc3', dim_in=100 * 4 * 4, dim_out=500)
        relu = brew.relu(model, fc3, fc3)
        pred = brew.fc(model, relu, 'pred', 500, 10)
        softmax = brew.softmax(model, pred, 'softmax')
        xent = model.LabelCrossEntropy([softmax, "label"], 'xent')
        # compute the expected loss
        loss = model.AveragedLoss(xent, "loss")
    model.net.RunAllOnMKL()
    model.param_init_net.RunAllOnMKL()
    model.AddGradientOperators([loss], skip=1)
    blob_name_tracker = {}
    graph = tb.model_to_graph_def(
        model,
        blob_name_tracker=blob_name_tracker,
        shapes={},
        show_simplified=False,
    )
    #self.assertEqual(
    #    blob_name_tracker['GRADIENTS/conv1/conv1_b_grad'],
    #    'conv1/conv1_b_grad',
    #)
    self.maxDiff = None
    # We can't guarantee the order in which they appear, so we sort
    # both before we compare them
    with open('tests/expect/caffe_mnist.expect') as f:
        EXPECTED_MNIST = f.read()
    sep = "node {"
    expected = "\n".join(
        sorted(sep + "\n " + part.strip()
               for part in EXPECTED_MNIST.strip().split(sep)
               if part.strip()))
    actual = "\n".join(
        sorted(sep + "\n " + part.strip()
               for part in str(graph).strip().split(sep)
               if part.strip()))
    # NOTE(review): `expected` and `actual` are built but never compared, so
    # this test currently only checks that graph generation does not raise.
    # Confirm whether an assertEqual(actual, expected) was left out on
    # purpose before adding one.
def cudnn_LSTM(model, input_blob, initial_states, dim_in, dim_out,
               scope, recurrent_params=None, input_params=None,
               num_layers=1, return_params=False):
    '''
    CuDNN version of LSTM for GPUs.
    input_blob          Blob containing the input. Will need to be available
                        when param_init_net is run, because the sequence lengths
                        and batch sizes will be inferred from the size of this
                        blob.
    initial_states      tuple of (hidden_init, cell_init) blobs
    dim_in              input dimensions
    dim_out             output/hidden dimension
    scope               namescope to apply
    recurrent_params    dict of blobs containing values for recurrent
                        gate weights, biases (if None, use random init values)
                        See GetLSTMParamNames() for format.
    input_params        dict of blobs containing values for input
                        gate weights, biases (if None, use random init values)
                        See GetLSTMParamNames() for format.
    num_layers          number of LSTM layers
    return_params       if True, returns (param_extract_net, param_mapping)
                        where param_extract_net is a net that when run, will
                        populate the blobs specified in param_mapping with the
                        current gate weights and biases (input/recurrent).
                        Useful for assigning the values back to non-cuDNN
                        LSTM.

    Returns (output, hidden_output, cell_output), with the
    (param_extract_net, param_mapping) pair appended as a fourth element
    when return_params is True.
    '''
    with core.NameScope(scope):
        weight_params = GetLSTMParamNames()['weights']
        bias_params = GetLSTMParamNames()['biases']

        input_weight_size = dim_out * dim_in
        upper_layer_input_weight_size = dim_out * dim_out
        recurrent_weight_size = dim_out * dim_out
        input_bias_size = dim_out
        recurrent_bias_size = dim_out

        def init(layer, pname, input_type):
            # Random initial values for one gate parameter, sized according
            # to its kind (weight/bias), direction and layer.
            input_weight_size_for_layer = input_weight_size if layer == 0 else \
                upper_layer_input_weight_size
            if pname in weight_params:
                sz = input_weight_size_for_layer if input_type == 'input' \
                    else recurrent_weight_size
            elif pname in bias_params:
                sz = input_bias_size if input_type == 'input' \
                    else recurrent_bias_size
            else:
                assert False, "unknown parameter type {}".format(pname)
            return model.param_init_net.UniformFill(
                [],
                "lstm_init_{}_{}_{}".format(input_type, pname, layer),
                shape=[sz])

        # Multiply by 4 since we have 4 gates per LSTM unit
        first_layer_sz = input_weight_size + recurrent_weight_size + \
                         input_bias_size + recurrent_bias_size
        upper_layer_sz = upper_layer_input_weight_size + \
                         recurrent_weight_size + input_bias_size + \
                         recurrent_bias_size
        total_sz = 4 * (first_layer_sz + (num_layers - 1) * upper_layer_sz)

        weights = model.param_init_net.UniformFill(
            [], "lstm_weight", shape=[total_sz])

        model.params.append(weights)
        model.weights.append(weights)

        lstm_args = {
            'hidden_size': dim_out,
            'rnn_mode': 'lstm',
            'bidirectional': 0,  # TODO
            'dropout': 1.0,  # TODO
            'input_mode': 'linear',  # TODO
            'num_layers': num_layers,
            'engine': 'CUDNN'
        }

        param_extract_net = core.Net("lstm_param_extractor")
        param_extract_net.AddExternalInputs([input_blob, weights])
        param_extract_mapping = {}

        # Populate the weights-blob from blobs containing parameters for
        # the individual components of the LSTM, such as forget/input gate
        # weights and biases. Also, create a special param_extract_net that
        # can be used to grab those individual params from the black-box
        # weights blob. These results can be then fed to InitFromLSTMParams()
        for input_type in ['input', 'recurrent']:
            param_extract_mapping[input_type] = {}
            p = recurrent_params if input_type == 'recurrent' else input_params
            if p is None:
                p = {}
            for pname in weight_params + bias_params:
                for j in range(0, num_layers):
                    values = p[pname] if pname in p else init(
                        j, pname, input_type)
                    model.param_init_net.RecurrentParamSet(
                        [input_blob, weights, values],
                        weights,
                        layer=j,
                        input_type=input_type,
                        param_type=pname,
                        **lstm_args)
                    if pname not in param_extract_mapping[input_type]:
                        param_extract_mapping[input_type][pname] = {}
                    b = param_extract_net.RecurrentParamGet(
                        [input_blob, weights],
                        ["lstm_{}_{}_{}".format(input_type, pname, j)],
                        layer=j,
                        input_type=input_type,
                        param_type=pname,
                        **lstm_args)
                    param_extract_mapping[input_type][pname][j] = b

        (hidden_input_blob, cell_input_blob) = initial_states
        # The hidden and the cell initial states are distinct inputs; pass
        # each one in its own slot instead of feeding the cell state twice.
        output, hidden_output, cell_output, rnn_scratch, dropout_states = \
            model.net.Recurrent(
                [input_blob, hidden_input_blob, cell_input_blob, weights],
                ["lstm_output", "lstm_hidden_output", "lstm_cell_output",
                 "lstm_rnn_scratch", "lstm_dropout_states"],
                seed=random.randint(0, 100000),  # TODO: dropout seed
                **lstm_args
            )
        model.net.AddExternalOutputs(
            hidden_output, cell_output, rnn_scratch, dropout_states)

        if return_params:
            param_extract = param_extract_net, param_extract_mapping
            return output, hidden_output, cell_output, param_extract
        else:
            return output, hidden_output, cell_output
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
    """Gradient memonger on a two-loss FC net must keep losses and grads.

    Builds a chain of FC layers with two loss heads, shares gradient blobs
    with ``memonger.share_grad_blobs`` and checks that the blob count went
    down, that ``name_x/fc6`` survived, and that both losses and the
    ``fc1_w`` gradient are unchanged.
    """
    net_model = model_helper.ModelHelper()

    def dense(source, blob_name, dim_in=output_dim):
        # All layers share the output width; only the first differs in input.
        return brew.fc(
            net_model, source, blob_name, dim_in=dim_in, dim_out=output_dim)

    with core.NameScope("name_x"):
        fc1 = dense("data", "fc1", dim_in=input_dim)
        fc2 = dense(fc1, "fc2")
        fc3 = dense(fc2, "fc3")
        fc4 = dense(fc3, "fc4")
        fc5 = dense(fc4, "fc5")

        relu1 = fc5.Relu([], fc5)
        pred1 = relu1.Softmax([], "pred1")
        xent1 = pred1.LabelCrossEntropy(["label"], ["xent1"])
        xent1.AveragedLoss([], "loss1")

        fc6 = dense(fc5, "fc6")
        relu2 = fc6.Relu([], fc6)
        pred2 = relu2.Softmax([], "pred2")
        xent2 = pred2.LabelCrossEntropy(["label"], ["xent2"])
        xent2.AveragedLoss([], "loss2")

    loss_names = ["name_x/loss1", "name_x/loss2"]
    input_to_grad = net_model.AddGradientOperators(list(loss_names))

    blobs_before = count_blobs(net_model.net.Proto())
    fc1_w_grad_name = str(input_to_grad["name_x/fc1_w"])
    protected_blobs = set(['name_x/fc6', 'name_x/fc5', fc1_w_grad_name])
    optim_proto = memonger.share_grad_blobs(
        net_model.net,
        list(loss_names),
        set(viewvalues(net_model.param_to_grad)),
        "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
        share_activations=True,
        dont_share_blobs=protected_blobs,
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
    self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
    workspace.RunNetOnce(net_model.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    workspace.RunNetOnce(net_model.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")
    grad = workspace.FetchBlob(fc1_w_grad_name)

    # Clobber the gradient so a stale value cannot satisfy the comparison.
    workspace.FeedBlob(fc1_w_grad_name, np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")
    optimized_grad = workspace.FetchBlob(fc1_w_grad_name)

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
    np.testing.assert_almost_equal(grad, optimized_grad)
def _build_decoder(
    self,
    model,
    step_model,
    model_params,
    scope,
    previous_tokens,
    timestep,
    fake_seq_lengths,
):
    """Add the encoder and one decoder step for beam search to the nets.

    Encoder operators (embeddings, encoder, tiling by ``self.beam_size``)
    are added to ``model``; the per-step decoder operators are added to
    ``step_model``.

    model_params: dict read here for the keys 'attention',
        'encoder_embedding_size', 'encoder_type', 'decoder_layer_configs',
        'decoder_embedding_size' and 'decoder_softmax_size'.
    scope: name scope used for the created blobs and as the decoder name.
    previous_tokens: blob of token ids gathered from the decoder embeddings.
    timestep, fake_seq_lengths: passed through to the decoder's ``apply``.

    Returns a tuple (state_configs, output_log_probs, attention_weights).
    """
    attention_type = model_params['attention']
    assert attention_type in ['none', 'regular']
    use_attention = (attention_type != 'none')

    with core.NameScope(scope):
        encoder_embeddings = seq2seq_util.build_embeddings(
            model=model,
            vocab_size=self.source_vocab_size,
            embedding_size=model_params['encoder_embedding_size'],
            name='encoder_embeddings',
            freeze_embeddings=False,
        )

    (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_states,
        final_encoder_cell_states,
        encoder_units_per_layer,
    ) = seq2seq_util.build_embedding_encoder(
        model=model,
        encoder_params=model_params['encoder_type'],
        num_decoder_layers=len(model_params['decoder_layer_configs']),
        inputs=self.encoder_inputs,
        input_lengths=self.encoder_lengths,
        vocab_size=self.source_vocab_size,
        embeddings=encoder_embeddings,
        embedding_size=model_params['encoder_embedding_size'],
        use_attention=use_attention,
        num_gpus=0,
        forward_only=True,
        scope=scope,
    )

    with core.NameScope(scope):
        # Replicate the encoder outputs along axis 1, once per beam entry.
        if use_attention:
            # [max_source_length, beam_size, encoder_output_dim]
            encoder_outputs = model.net.Tile(
                encoder_outputs,
                'encoder_outputs_tiled',
                tiles=self.beam_size,
                axis=1,
            )

        if weighted_encoder_outputs is not None:
            weighted_encoder_outputs = model.net.Tile(
                weighted_encoder_outputs,
                'weighted_encoder_outputs_tiled',
                tiles=self.beam_size,
                axis=1,
            )

        decoder_embeddings = seq2seq_util.build_embeddings(
            model=model,
            vocab_size=self.target_vocab_size,
            embedding_size=model_params['decoder_embedding_size'],
            name='decoder_embeddings',
            freeze_embeddings=False,
        )
        embedded_tokens_t_prev = step_model.net.Gather(
            [decoder_embeddings, previous_tokens],
            'embedded_tokens_t_prev',
        )

    # One LSTM cell per configured decoder layer; the first layer consumes
    # the token embedding, later layers consume the previous layer's units.
    decoder_cells = []
    decoder_units_per_layer = []
    for i, layer_config in enumerate(model_params['decoder_layer_configs']):
        num_units = layer_config['num_units']
        decoder_units_per_layer.append(num_units)
        if i == 0:
            input_size = model_params['decoder_embedding_size']
        else:
            input_size = (
                model_params['decoder_layer_configs'][i - 1]['num_units']
            )

        cell = rnn_cell.LSTMCell(
            forward_only=True,
            input_size=input_size,
            hidden_size=num_units,
            forget_bias=0.0,
            memory_optimization=False,
        )
        decoder_cells.append(cell)

    with core.NameScope(scope):
        # Tile every available final encoder state per beam entry as well;
        # the lists are updated in place.
        if final_encoder_hidden_states is not None:
            for i in range(len(final_encoder_hidden_states)):
                if final_encoder_hidden_states[i] is not None:
                    final_encoder_hidden_states[i] = model.net.Tile(
                        final_encoder_hidden_states[i],
                        'final_encoder_hidden_tiled_{}'.format(i),
                        tiles=self.beam_size,
                        axis=1,
                    )
        if final_encoder_cell_states is not None:
            for i in range(len(final_encoder_cell_states)):
                if final_encoder_cell_states[i] is not None:
                    final_encoder_cell_states[i] = model.net.Tile(
                        final_encoder_cell_states[i],
                        'final_encoder_cell_tiled_{}'.format(i),
                        tiles=self.beam_size,
                        axis=1,
                    )
        initial_states = \
            seq2seq_util.build_initial_rnn_decoder_states(
                model=model,
                encoder_units_per_layer=encoder_units_per_layer,
                decoder_units_per_layer=decoder_units_per_layer,
                final_encoder_hidden_states=final_encoder_hidden_states,
                final_encoder_cell_states=final_encoder_cell_states,
                use_attention=use_attention,
            )

    attention_decoder = seq2seq_util.LSTMWithAttentionDecoder(
        encoder_outputs=encoder_outputs,
        encoder_output_dim=encoder_units_per_layer[-1],
        encoder_lengths=None,
        vocab_size=self.target_vocab_size,
        attention_type=attention_type,
        embedding_size=model_params['decoder_embedding_size'],
        decoder_num_units=decoder_units_per_layer[-1],
        decoder_cells=decoder_cells,
        weighted_encoder_outputs=weighted_encoder_outputs,
        name=scope,
    )
    # Previous-step states enter the step net as external inputs named
    # '<scope>/<state>_prev'.
    states_prev = step_model.net.AddExternalInputs(*[
        '{}/{}_prev'.format(scope, s)
        for s in attention_decoder.get_state_names()
    ])
    decoder_outputs, states = attention_decoder.apply(
        model=step_model,
        input_t=embedded_tokens_t_prev,
        seq_lengths=fake_seq_lengths,
        states=states_prev,
        timestep=timestep,
    )

    # Pair each initial state with its previous-step and current-step blob.
    state_configs = [
        BeamSearchForwardOnly.StateConfig(
            initial_value=initial_state,
            state_prev_link=BeamSearchForwardOnly.LinkConfig(
                blob=state_prev,
                offset=0,
                window=1,
            ),
            state_link=BeamSearchForwardOnly.LinkConfig(
                blob=state,
                offset=1,
                window=1,
            ),
        )
        for initial_state, state_prev, state in zip(
            initial_states,
            states_prev,
            states,
        )
    ]

    with core.NameScope(scope):
        decoder_outputs_flattened, _ = step_model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, attention_decoder.get_output_dim()],
        )
        output_logits = seq2seq_util.output_projection(
            model=step_model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=attention_decoder.get_output_dim(),
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=model_params['decoder_softmax_size'],
        )
        # [1, beam_size, target_vocab_size]
        output_probs = step_model.net.Softmax(
            output_logits,
            'output_probs',
        )
        output_log_probs = step_model.net.Log(
            output_probs,
            'output_log_probs',
        )
        if use_attention:
            attention_weights = attention_decoder.get_attention_weights()
        else:
            # Without attention, emit zero weights built from the encoder
            # inputs: fill with zeros, transpose, then tile per beam entry.
            attention_weights = step_model.net.ConstantFill(
                [self.encoder_inputs],
                'zero_attention_weights_tmp_1',
                value=0.0,
            )
            attention_weights = step_model.net.Transpose(
                attention_weights,
                'zero_attention_weights_tmp_2',
            )
            attention_weights = step_model.net.Tile(
                attention_weights,
                'zero_attention_weights_tmp',
                tiles=self.beam_size,
                axis=0,
            )

    return (
        state_configs,
        output_log_probs,
        attention_weights,
    )
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelper, such as CNNModelHelper
      input_builder_fun:
                        Function that adds the input operators
                        Note: Remember to instantiate reader outside of this
                        function so all GPUs share same reader object.
                        Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient. Loss scale parameter
                        is passed, as you should scale the loss of your model
                        by 1.0 / the total number of gpus.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying.
                        Signature: param_update_builder_fun(model)
                        If None, only the forward operators are built (no
                        gradients, all-reduces or parameter syncs) and the
                        function returns right after blob devices are inferred.
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. When given, it is read as a
                        mapping with a 'num_shards' entry, which must be > 1.
      net_type:         Network type
      broadcast_computed_params:
                        whether to add the computed-parameter broadcast
                        operators before the gradient all-reduces
      optimize_gradient_memory:
                        whether to apply 'memonger' to share blobs in gradient
                        computation to reduce memory footprint
      use_nccl:         forwarded to the gradient all-reduce builder

    The function returns nothing; all results are stored on model_helper_obj
    (operators in its nets plus the private `_devices`, `_rendezvous`,
    `_device_grouped_blobs`, `_param_names`, `_computed_param_names`,
    `_grad_names` and `_losses_by_gpu` attributes).
    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params, [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    # Snapshot the keys into lists. On Python 3 `dict.keys()` is a live view,
    # so without the copy `_param_names` would silently start to include the
    # gradient names once `_device_grouped_blobs` is updated further below.
    model_helper_obj._param_names = \
        list(model_helper_obj._device_grouped_blobs.keys())
    model_helper_obj._computed_param_names = \
        list(computed_params_grouped.keys())

    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
        non_datapar_grads
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = list(gradients_grouped.keys())
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)

    if len(model_helper_obj._grad_names) > 0:
        _AllReduceGradients(devices, model_helper_obj, rendezvous, use_nccl)
    else:
        log.info("NOTE: Param builder function did not create any parameters.")

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # The following check is necessary for ring reduce to work
    if rendezvous is not None:
        assert num_shards > 1, \
            "Please use more than one shard for distributed training"
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)

    if optimize_gradient_memory:
        _OptimizeGradientMemoryDEPRECATED(model_helper_obj, losses_by_gpu, devices)
def proc1(rec):
    """Create a new record from `rec` whose uid is `rec.uid + rec.uid`.

    The new record's value field is pointed at `rec`'s value blob rather
    than copied. Relies on `ops` from the enclosing scope.
    """
    # NOTE(review): the original indentation was lost; this assumes only the
    # record creation runs under the 'proc1' name scope -- confirm.
    with core.NameScope('proc1'):
        doubled = NewRecord(ops, rec)

    add_inputs = [rec.uid(), rec.uid()]
    add_outputs = [doubled.uid()]
    ops.Add(add_inputs, add_outputs)

    doubled.value.set(blob=rec.value(), unsafe=True)
    return doubled
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Gradient-blob sharing must shrink the net without changing results.

    Builds a five-layer FC net under the "name_x" scope, applies
    `memonger.share_grad_blobs` with and without activation sharing, and
    checks that the blob count drops, that the final activation is kept,
    and that loss and the fc1 weight gradient match the unoptimized net.
    """
    scope_prefix = "name_x/"
    loss_blob = "name_x/loss"

    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        # data -> fc1 -> fc2 -> fc3 -> fc4 -> fc5; only fc1 consumes input_dim.
        layer_input, layer_dim_in = "data", input_dim
        for layer_idx in range(1, 6):
            layer_input = brew.fc(
                m,
                layer_input,
                "fc{}".format(layer_idx),
                dim_in=layer_dim_in,
                dim_out=output_dim,
            )
            layer_dim_in = output_dim
        fc5 = layer_input

        activated = fc5.Relu([], fc5)
        pred = activated.Softmax([], "pred")
        xent = pred.LabelCrossEntropy(["label"], ["xent"])
        xent.AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators([loss_blob])
    fc1_w_grad = str(input_to_grad["name_x/fc1_w"])

    # Sharing gradient blobs only.
    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        [loss_blob],
        set(viewvalues(m.param_to_grad)),
        scope_prefix,
        share_activations=False,
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Sharing activations as well, but keeping the fc1 weight gradient intact.
    optim_proto_wacts = memonger.share_grad_blobs(
        m.net,
        [loss_blob],
        set(viewvalues(m.param_to_grad)),
        scope_prefix,
        share_activations=True,
        dont_share_blobs={fc1_w_grad},
    )
    blobs_wact_optim = count_blobs(optim_proto_wacts)
    self.assertLessEqual(blobs_wact_optim, blobs_after)

    # Check that the last activations are not shared
    self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
    self.assertTrue(
        has_blob(optim_proto_wacts, "name_x/fc5"),
        "Dont remap final activation",
    )

    def run_and_fetch(net):
        # Run the net once and return (loss, fc1 weight gradient).
        workspace.RunNetOnce(net)
        fetched_loss = workspace.FetchBlob(loss_blob)
        fetched_grad = workspace.FetchBlob(fc1_w_grad)
        return fetched_loss, fetched_grad

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    loss, grad = run_and_fetch(m.net)

    optimized_loss, optimized_grad = run_and_fetch(optim_proto)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    # Overwrite the gradient so a stale value cannot satisfy the next check.
    workspace.FeedBlob(fc1_w_grad, np.array([0.0]))

    # Run with the forward optimization
    optimized_loss, optimized_grad = run_and_fetch(optim_proto_wacts)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
def Test(args):
    """Evaluate a trained video model on the test set and log the metrics.

    Builds the test net (data-parallel over the GPUs given by `args.gpus` /
    `args.num_gpus`, or on CPU when there are none), loads weights from
    `args.load_model_path` ('minidb' or 'pickle' `args.db_type`), then runs
    `args.num_iter` iterations (or one pass over the data when it is <= 0).

    For single-label models it logs clip accuracy and video top-1 / top-k
    accuracy; for multi-label models it logs mAUC / mAP / mWAP. Finally it
    logs FLOPs, parameter and intermediate-blob counts.

    Args:
        args: parsed command-line namespace; see the attribute reads below.

    Returns:
        None. Results are only logged.
    """
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        total_batch_size = args.batch_size * num_gpus
        log.info("Running on GPUs: {}".format(gpus))
        log.info("total_batch_size: {}".format(total_batch_size))
    else:
        total_batch_size = args.batch_size
        log.info("Running on CPU")
        log.info("total_batch_size: {}".format(total_batch_size))

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=1,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=(args.input_type == 0 or args.input_type >= 3),
        get_optical_flow=(args.input_type == 1 or args.input_type >= 4),
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="test_reader",
        input_data=args.test_data,
    )

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size * args.clip_per_video
            * args.crop_per_clip,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of if args.input_type == 1
                         else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            conv1_temporal_stride=args.conv1_temporal_stride,
            conv1_temporal_kernel=args.conv1_temporal_kernel,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(args.use_cudnn == 1),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = reader_utils.create_data_reader(
        test_model,
        **reader_args)

    if args.num_iter <= 0:
        num_iter = int(math.ceil(number_of_examples / total_batch_size))
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        # Every caller passes test_model, so using the argument is equivalent
        # to the closure while keeping the builder signature honest.
        model_helper.AddVideoInput(model, test_reader, **video_input_args)

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        test_model._device_type = caffe2_pb2.CPU
        test_model._devices = [0]
        device_opt = core.DeviceOption(test_model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the
            # naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                test_input_fn(test_model)
                create_model_ops(test_model, 1.0)

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0]
            )
        else:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=False
            )
    else:
        # Best effort: evaluation continues with the initialized weights.
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(test_model)

    # metric counters for multilabel
    # (np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype.)
    all_prob_for_map = np.empty(shape=[0, args.num_labels], dtype=np.float64)
    all_label_for_map = np.empty(shape=[0, args.num_labels], dtype=np.int32)

    # metric counters for closed-world classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0

    crop_per_video = args.clip_per_video * args.crop_per_clip

    # Blobs are scoped by the *actual* device id ("gpu_<id>"), which differs
    # from 0..num_gpus-1 when e.g. --gpus 2,3 is used. The CPU net is built
    # under "gpu_0" above.
    device_ids = gpus if num_gpus > 0 else [0]

    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)
        for g in device_ids:
            # get labels
            label = workspace.FetchBlob("gpu_{}".format(g) + '/label')
            # get predictions
            if args.multi_label:
                predicts = workspace.FetchBlob("gpu_{}".format(g) + '/prob')
            else:
                predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')

            assert predicts.shape[0] == args.batch_size * crop_per_video

            for j in range(args.batch_size):
                first_row = j * crop_per_video
                # get label for one video
                if args.multi_label:
                    sample_label = label[first_row, :]
                else:
                    sample_label = label[first_row]

                # get clip accuracy
                # NOTE(review): this comparison looks meaningful only for
                # single-label data -- confirm the multi-label behaviour.
                for k in range(crop_per_video):
                    sorted_preds = np.argsort(predicts[first_row + k, :])[::-1]
                    if sorted_preds[0] == label[first_row + k]:
                        clip_acc = clip_acc + 1

                # get all clip predictions for one video
                all_clips = predicts[first_row:first_row + crop_per_video, :]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)

                if args.multi_label:
                    video_pred = np.expand_dims(video_pred, axis=0)
                    sample_label = np.expand_dims(sample_label, axis=0)
                    all_prob_for_map = np.concatenate(
                        (all_prob_for_map, video_pred), axis=0)
                    all_label_for_map = np.concatenate(
                        (all_label_for_map, sample_label), axis=0)
                else:
                    sorted_video_pred = np.argsort(video_pred)[::-1]
                    if sorted_video_pred[0] == sample_label:
                        video_top1 = video_top1 + 1
                    if sample_label in sorted_video_pred[0:args.top_k]:
                        video_topk = video_topk + 1

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            if args.multi_label:
                # mAP
                auc, ap, wap, aps = metric.mean_ap_metric(
                    all_prob_for_map, all_label_for_map)
                log.info(
                    'Iter {}/{}: mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}'.
                    format(i, num_iter, auc, ap, wap, np.mean(aps)))
            else:
                # accuracy; report the configured k instead of a fixed "5"
                log.info('Iter {}/{}: clip: {}, top1: {}, top {}: {}'.format(
                    i, num_iter,
                    clip_acc / clip_count,
                    video_top1 / video_count,
                    args.top_k,
                    video_topk / video_count))

    if args.multi_label:
        # mAP
        auc, ap, wap, aps = metric.mean_ap_metric(all_prob_for_map,
                                                  all_label_for_map)
        log.info("Test mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}".format(
            auc, ap, wap, np.mean(aps)))
        if args.print_per_class_metrics:
            log.info("Test mAP per class: {}".format(aps))
    else:
        # accuracy
        log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
            clip_acc / clip_count,
            video_top1 / video_count,
            args.top_k,
            video_topk / video_count))

    if num_gpus > 0:
        flops, params, inters = model_helper.GetFlopsAndParams(
            test_model, gpus[0])
    else:
        flops, params, inters = model_helper.GetFlopsAndParams(test_model)
    log.info('FLOPs: {}, params: {}, inters: {}'.format(flops, params, inters))