def _BuildCopyInstruction(builder, produced_blob_object, op_conf, to_device_tag):
    """Issue a cross-device copy of `produced_blob_object` and return the output blob.

    Only cpu<->gpu copies are supported: the producer's device tag and
    `to_device_tag` must differ, and each must be "cpu" or "gpu".

    Args:
        builder: instruction builder used to emit the stateless call.
        produced_blob_object: source blob to copy.
        op_conf: operator conf of the copy op (inferred here with "in" bound).
        to_device_tag: destination device tag ("cpu" or "gpu").

    Returns:
        The "out" blob object of the copy op, with its SBP signature copied
        from the producer.

    Raises:
        NotImplementedError: for any device-tag pair other than gpu->cpu or
            cpu->gpu.
    """
    # NOTE(review): the original also read machine_id2device_id_list into a
    # local that was never used; that dead assignment is removed here.
    x_device_tag = produced_blob_object.parallel_desc_symbol.device_tag
    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
    bn_in_op2blob_object["in"] = produced_blob_object
    op_attribute = op_infer_util.Infer(op_conf, bn_in_op2blob_object)
    assert to_device_tag != x_device_tag, (to_device_tag, x_device_tag)
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(op_attribute))
    if to_device_tag == "cpu" and x_device_tag == "gpu":
        # gpu -> cpu: device-to-host call on the producer's own parallel conf.
        x_parallel_conf = produced_blob_object.parallel_desc_symbol.parallel_conf
        builder.NoBoxingCudaD2HStatelessCall(cfg_op_attribute, x_parallel_conf,
                                             bn_in_op2blob_object,
                                             TryReplaceDeviceTag)
    elif to_device_tag == "gpu" and x_device_tag == "cpu":
        # cpu -> gpu: pin the host blob, then host-to-device call on the
        # gpu-retagged parallel conf.
        out_parallel_desc_symbol = TryReplaceDeviceTag(
            builder, produced_blob_object.parallel_desc_symbol, to_device_tag)
        out_parallel_conf = out_parallel_desc_symbol.parallel_conf
        with _CudaHostPinBlob(builder, produced_blob_object):
            builder.NoBoxingCudaH2DStatelessCall(cfg_op_attribute,
                                                 out_parallel_conf,
                                                 bn_in_op2blob_object)
    else:
        raise NotImplementedError(
            "invalid device found. to_device_tag: %s, x_device_tag: %s" %
            (to_device_tag, x_device_tag))
    # The copy must not change the SBP signature: overwrite the inferred
    # output SBP with the producer's.
    sbp_parallel = bn_in_op2blob_object["out"].op_arg_parallel_attr.sbp_parallel
    sbp_parallel.CopyFrom(produced_blob_object.op_arg_parallel_attr.sbp_parallel)
    return bn_in_op2blob_object["out"]
def BuildAssignInstruction(builder):
    """Emit a `logical_slice_assign` call writing `value_blob_object` into the
    [start, stop) region (step 1) of `ref_blob_object`; all of those names are
    captured from the enclosing scope."""
    op_conf = op_conf_pb.OperatorConf()
    op_conf.device_tag = (
        flow.current_scope().device_parallel_desc_symbol.device_tag)
    op_name = id_util.UniqueStr(OP_PREFIX)
    op_conf.name = op_name
    op_conf.user_conf.op_type_name = "logical_slice_assign"
    op_conf.user_conf.input["value"].s.append("{}/value_0".format(op_name))
    op_conf.user_conf.input["ref"].s.append("{}/ref_0".format(op_name))
    parallel_conf = ref_blob_object.parallel_desc_symbol.parallel_conf
    attrs = op_conf.user_conf.attr
    attrs["parallel_conf"].at_string = str(parallel_conf)
    attrs["start"].at_list_int64.val[:] = start
    attrs["stop"].at_list_int64.val[:] = stop
    attrs["step"].at_list_int64.val[:] = [1] * len(start)
    blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
    blob_objects["ref_0"] = ref_blob_object
    blob_objects["value_0"] = value_blob_object
    inferred_attr = op_infer_util.Infer(op_conf, blob_objects, scope_symbol_id)
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(inferred_attr))
    builder.StatelessCall(cfg_op_attribute, parallel_conf, blob_objects,
                          boxing_util.BoxingTo)
def _GetCpu0VariableBlobFromNumpy(
        np_array: np.ndarray,
        dtype: flow.dtype) -> oneflow._oneflow_internal.EagerConsistentBlob:
    """Create a non-trainable variable on cpu 0 and feed `np_array` into it.

    The explicit `dtype` argument is required: it cannot be derived from
    `np_array.dtype`, because np.int8 == np.char, so round-tripping through
    the numpy<->oneflow dtype converters could yield flow.char instead of
    flow.int8.
    """
    with flow.scope.placement("cpu", "0:0"):
        var_op_name = id_util.UniqueStr(OP_PREFIX)
        variable_conf = get_variable.GenerateVariableOpConf(
            name=var_op_name,
            shape=np_array.shape,
            dtype=dtype,
            initializer=initializer_util.zeros_initializer(dtype=dtype),
            trainable=False,
        )
        # Tag the op with the device of the current (cpu) placement scope.
        variable_conf.device_tag = (
            flow.current_scope().device_parallel_desc_symbol.device_tag)
        inferred_attr = op_infer_util.Infer(variable_conf, {})
        var_blob = get_variable.CreateEagerVariableBlob(
            inferred_attr, job_name=FAKE_JOB_NAME)
        interface_op_read_and_write.FeedValueToInterfaceBlobObject(
            var_blob.blob_object, np_array)
        return var_blob
def build(builder):
    """Launch a `logical_slice` op over `input_blob_object` for the region
    [start, stop) with step 1, then hand the resulting output blob to `Yield`
    (all captured from the enclosing scope)."""
    slice_conf = op_conf_pb.OperatorConf()
    slice_conf.device_tag = (
        flow.current_scope().device_parallel_desc_symbol.device_tag)
    slice_conf.name = op_name
    slice_conf.user_conf.op_type_name = "logical_slice"
    slice_conf.user_conf.input["x"].s.append("{}/x_0".format(op_name))
    slice_conf.user_conf.output["y"].s.append("{}/y_0".format(op_name))
    parallel_conf = input_blob_object.parallel_desc_symbol.parallel_conf
    attrs = slice_conf.user_conf.attr
    attrs["parallel_conf"].at_string = str(parallel_conf)
    attrs["start"].at_list_int64.val[:] = start
    attrs["stop"].at_list_int64.val[:] = stop
    attrs["step"].at_list_int64.val[:] = [1] * len(start)
    blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
    blob_objects["x_0"] = input_blob_object
    inferred_attr = op_infer_util.Infer(slice_conf, blob_objects,
                                        scope_symbol_id)
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(inferred_attr))
    builder.StatelessCall(cfg_op_attribute, parallel_conf, blob_objects,
                          boxing_util.BoxingTo)
    Yield(blob_objects["y_0"])
def BuildModelIOPathInputInstruction(builder):
    """Run the captured path-input `op_conf` as a stateless call on the
    current scope's placement, populating the captured `bn_in_op2blob_object`
    with its outputs."""
    inferred_attr = op_infer_util.Infer(op_conf, ibn2blob_object={})
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(inferred_attr))
    current_parallel_conf = (
        flow.current_scope().device_parallel_desc_symbol.parallel_conf)
    builder.StatelessCall(cfg_op_attribute, current_parallel_conf,
                          bn_in_op2blob_object, boxing_util.BoxingTo)
def _GetEagerNcclAllReduce(parallel_conf, ibn2blob_object):
    """Build an `eager_nccl_all_reduce` user-op conf bound to `parallel_conf`
    and return its inferred op attribute."""
    conf = op_conf_pb.OperatorConf()
    conf.device_tag = "gpu"
    conf.name = "eager_nccl_all_reduce"
    user_conf = conf.user_conf
    user_conf.op_type_name = "eager_nccl_all_reduce"
    user_conf.input["in"].s.append("eager_nccl_all_reduce/in_0")
    user_conf.output["out"].s.append("eager_nccl_all_reduce/out_0")
    user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
    return op_infer_util.Infer(conf, ibn2blob_object)
def BuildModelLoadInstruction(builder):
    """Wire the path-input op's output into `model_load_blob_objects` as
    "path", then issue the captured model-load op as a stateless call on the
    path blob's placement."""
    path_blob = path_input_blob_objects["out"]
    model_load_blob_objects["path"] = path_blob
    inferred_attr = op_infer_util.Infer(
        model_load_op_conf, ibn2blob_object=model_load_blob_objects)
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(inferred_attr))
    builder.StatelessCall(
        cfg_op_attribute,
        path_blob.parallel_desc_symbol.parallel_conf,
        model_load_blob_objects,
        boxing_util.BoxingTo,
    )
def BuildModelSaveInstruction(builder):
    """Register the save path plus every variable blob ("in_0", "in_1", ...)
    as inputs of the captured model-save op, then issue it as a stateless
    call on the path blob's placement."""
    path_blob = path_input_blob_objects["out"]
    model_save_blob_objects["path"] = path_blob
    for idx, blob in enumerate(var_blobs):
        model_save_blob_objects["in_{}".format(idx)] = blob.blob_object
    inferred_attr = op_infer_util.Infer(
        model_save_op_conf, ibn2blob_object=model_save_blob_objects)
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(inferred_attr))
    builder.StatelessCall(
        cfg_op_attribute,
        path_blob.parallel_desc_symbol.parallel_conf,
        model_save_blob_objects,
        boxing_util.BoxingTo,
    )
def BuildAssignInstruction(builder, ref_blob_object, value_blob_object, op_conf):
    """Emit an assign op that writes `value_blob_object` into
    `ref_blob_object`, choosing a same-device, device-to-host, or
    host-to-device call based on the two blobs' device tags.

    Both blobs must live on the same machine/device-id layout.

    Raises:
        NotImplementedError: if the (ref, value) device-tag pair is not one of
            equal tags, cpu<-gpu, or gpu<-cpu.
    """
    ref_desc = ref_blob_object.parallel_desc_symbol
    value_desc = value_blob_object.parallel_desc_symbol
    ref_devices = ref_desc.machine_id2device_id_list
    value_devices = value_desc.machine_id2device_id_list
    assert ref_devices == value_devices, "\nref_devices: %s\nvalue_devices: %s" % (
        ref_devices,
        value_devices,
    )
    ref_device_tag = ref_desc.device_tag
    value_device_tag = value_desc.device_tag
    blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
    blob_objects["ref"] = ref_blob_object
    blob_objects["value"] = value_blob_object
    inferred_attr = op_infer_util.Infer(op_conf, blob_objects)
    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
        str(inferred_attr))
    if ref_device_tag == value_device_tag:
        # Same device kind: plain no-boxing call on the ref placement.
        builder.NoBoxingStatelessCall(cfg_op_attribute, ref_desc.parallel_conf,
                                      blob_objects)
    elif ref_device_tag == "cpu" and value_device_tag == "gpu":
        # gpu value into cpu ref: device-to-host call on the value placement.
        builder.NoBoxingCudaD2HStatelessCall(
            cfg_op_attribute,
            value_desc.parallel_conf,
            blob_objects,
            TryReplaceDeviceTag,
        )
    elif ref_device_tag == "gpu" and value_device_tag == "cpu":
        # cpu value into gpu ref: pin the host blob, then host-to-device call
        # on the ref placement.
        with _CudaHostPinBlob(builder, value_blob_object):
            builder.NoBoxingCudaH2DStatelessCall(cfg_op_attribute,
                                                 ref_desc.parallel_conf,
                                                 blob_objects)
    else:
        raise NotImplementedError(
            "invalid device found. ref_device_tag: %s, value_device_tag: %s" %
            (ref_device_tag, value_device_tag))
def ConstructNaiveBoxingOpConf(
    produced_blob_object,
    consumer_op_arg_parallel_attr,
    in_parallel_num,
    out_parallel_num,
):
    """Build and infer a naive `boxing_conf` op on cpu that merges the
    producer's parallel outputs (concat along its split axis, or add for
    partial sums) and re-splits the result along the consumer's split axis
    with balanced part sizes."""
    conf = op_conf_pb.OperatorConf()
    conf.name = "undefined_boxing_op_name"
    conf.device_tag = "cpu"
    boxing = conf.boxing_conf
    boxing.lbi.op_name = "undefined_boxing_op_name"
    boxing.lbi.blob_name = "undefined_boxing_blob_name"
    boxing.in_num = in_parallel_num
    boxing.out_num = out_parallel_num
    # Input side: concat split inputs; a single input degenerates to concat
    # on axis 0; otherwise the producer must be partial-sum and we add.
    in_sbp = produced_blob_object.op_arg_parallel_attr.sbp_parallel
    if in_sbp.has_split_parallel():
        boxing.concat_box.axis = in_sbp.split_parallel().axis()
    elif in_parallel_num == 1:
        boxing.concat_box.axis = 0
    else:
        assert in_sbp.has_partial_sum_parallel()
        boxing.add_box.SetInParent()
    # Output side: split along the consumer's split axis, or axis 0 when the
    # consumer is not parallel.
    out_sbp = consumer_op_arg_parallel_attr.sbp_parallel
    if out_sbp.has_split_parallel():
        out_axis = out_sbp.split_parallel().axis()
    else:
        assert out_parallel_num == 1
        out_axis = 0
    boxing.split_box.axis = out_axis
    shape = produced_blob_object.op_arg_blob_attr.shape
    boxing.split_box.part_num.extend(
        balanced_splitter.BalancedPartNums(shape[out_axis], out_parallel_num))
    blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
    for i in range(in_parallel_num):
        blob_objects["in_%s" % i] = produced_blob_object
    return op_infer_util.Infer(conf, blob_objects)