Example #1
def rename_op_input(
    predict_net: caffe2_pb2.NetDef,
    init_net: caffe2_pb2.NetDef,
    op_id: int,
    input_id: int,
    new_name: str,
    from_producer: bool = False,
):
    """
    Rename the op_id-th operator in predict_net, changing its input_id-th input's
        name to new_name. It also does automatic re-routing and changes
        external_input and init_net if necessary.
    - It requires that the input is only consumed by this op.
    - This function modifies predict_net and init_net in-place.
    - When from_producer is enabled, this also updates other operators that consume
        the same input. Be cautious because this may trigger unintended behaviour.
    """
    assert isinstance(predict_net, caffe2_pb2.NetDef)
    assert isinstance(init_net, caffe2_pb2.NetDef)

    init_net_ssa, init_net_versions = core.get_ssa(init_net)
    predict_net_ssa, predict_net_versions = core.get_ssa(
        predict_net, copy.deepcopy(init_net_versions))

    versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
    old_name, version = versioned_inputs[input_id]

    if from_producer:
        producer_map = get_producer_map(predict_net_ssa)
        if (old_name, version) not in producer_map:
            raise NotImplementedError(
                "Can't find producer, the input {} is probably from"
                " init_net, this is not supported yet.".format(old_name))
        producer = producer_map[(old_name, version)]
        rename_op_output(predict_net, producer[0], producer[1], new_name)
        return

    def contain_targets(op_ssa):
        return (old_name, version) in op_ssa[0]

    is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
    if sum(is_consumer) > 1:
        raise IllegalGraphTransformError((
            "Input '{}' of operator(#{}) is consumed by other ops, please use"
            + " rename_op_output on the producer instead. Offending op: \n{}"
        ).format(old_name, op_id, predict_net.op[op_id]))

    # update init_net
    _rename_versioned_blob_in_proto(init_net, old_name, new_name, version,
                                    init_net_ssa, {}, init_net_versions)
    # update predict_net
    _rename_versioned_blob_in_proto(
        predict_net,
        old_name,
        new_name,
        version,
        predict_net_ssa,
        init_net_versions,
        predict_net_versions,
    )
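The get_producer_map helper used above is not shown on this page. Judging from how it is indexed (producer_map[(name, version)] yields an (op_id, output_id) pair), a minimal sketch built directly on the structure returned by core.get_ssa could look like the following; the actual helper in the source project may differ.

def get_producer_map_sketch(ssa):
    # Map each versioned blob (name, version) to the (op_id, output_id) of the
    # op that produces it. Hypothetical stand-in for get_producer_map.
    producer_map = {}
    for op_id, (_versioned_inputs, versioned_outputs) in enumerate(ssa):
        for output_id, versioned_blob in enumerate(versioned_outputs):
            # In SSA form each versioned blob has exactly one producer.
            producer_map[versioned_blob] = (op_id, output_id)
    return producer_map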
Example #2
def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
    """ Remove ops whose outputs are neither consumed nor listed in external_output """
    ssa, versions = core.get_ssa(net_def)
    versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
    consumer_map = get_consumer_map(ssa)
    removed_op_ids = set()

    def _is_dead_end(versioned_blob):
        return not (
            versioned_blob in versioned_external_output
            or (
                len(consumer_map[versioned_blob]) > 0
                and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
            )
        )

    for i, ssa_i in reversed(list(enumerate(ssa))):
        versioned_outputs = ssa_i[1]
        if all(_is_dead_end(outp) for outp in versioned_outputs):
            removed_op_ids.add(i)

    # simply removing those dead-end ops should have no effect on external_output
    new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
    del net_def.op[:]
    net_def.op.extend(new_ops)
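Example #2 also depends on a get_consumer_map helper that is not shown here. Based on how it is used (consumer_map[versioned_blob] yields (op_id, input_id) pairs, and blobs without consumers must map to an empty list), a plausible minimal sketch is below; the real helper may differ.

from collections import defaultdict

def get_consumer_map_sketch(ssa):
    # Map each versioned blob (name, version) to the list of (op_id, input_id)
    # pairs that consume it. defaultdict keeps unconsumed blobs at [].
    consumer_map = defaultdict(list)
    for op_id, (versioned_inputs, _versioned_outputs) in enumerate(ssa):
        for input_id, versioned_blob in enumerate(versioned_inputs):
            consumer_map[versioned_blob].append((op_id, input_id))
    return consumer_map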
Example #3
File: frontend.py  Project: yangfly/pytorch
    def _ssa_rewrite(cls, net, init_net, value_info):
        def ssa_name(name, version):
            return '{}_{}'.format(name, version)

        if init_net:
            for op in init_net.op:
                assert re.match('GivenTensor.*Fill', op.type), "type is {}, \n{}".format(op.type, op)
                assert len(op.output) == 1
                op.output[0] = ssa_name(op.output[0], 0)
            init_net.external_input[:] = [ssa_name(name, 0)
                                          for name in init_net.external_input]
            init_net.external_output[:] = [ssa_name(name, 0)
                                           for name in init_net.external_output]
        if value_info:
            ssa_value_info = {ssa_name(name, 0): value
                              for name, value in value_info.items()}
            value_info.clear()
            value_info.update(ssa_value_info)
        net.external_input[:] = [ssa_name(name, 0)
                                 for name in net.external_input]
        ssa, blob_versions = caffe2_core.get_ssa(net)
        assert len(net.op) == len(ssa)
        for op, (versioned_inputs, versioned_outputs) in zip(net.op, ssa):
            op.input[:] = [ssa_name(name, version)
                           for name, version in versioned_inputs]
            op.output[:] = [ssa_name(name, version)
                            for name, version in versioned_outputs]
        net.external_output[:] = [ssa_name(name, blob_versions[name])
                                  for name in net.external_output]
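The rewrite above relies on the versioning behaviour of core.get_ssa: every write to a blob bumps its version, so renaming each blob to name_version makes the net single-assignment. A tiny sketch to inspect that structure, assuming the caffe2 Python package is installed (names are made up):

from caffe2.python import core

# Build a tiny net where the blob "y" is written twice.
net = core.Net("ssa_demo")
net.Relu(["x"], ["y"])
net.Relu(["y"], ["y"])  # re-assignment of "y"

ssa, blob_versions = core.get_ssa(net.Proto())
# ssa has one (versioned_inputs, versioned_outputs) pair per op, roughly:
#   [([("x", 0)], [("y", 1)]), ([("y", 1)], [("y", 2)])]
# blob_versions maps each blob to its final version, roughly {"x": 0, "y": 2}.
print(ssa)
print(blob_versions)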
Example #4
def get_params_from_init_net(
    init_net: caffe2_pb2.NetDef,
) -> Tuple[Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
    """
    Take the output blobs from init_net by running it.
    Outputs:
        params: dict from blob name to numpy array
        device_options: dict from blob name to the device option of its creating op
    """
    # NOTE: this assumes that a param's device is determined by its producer op,
    # with the only exception being CopyGPUToCPU, which is a CUDA op that returns a CPU tensor.
    def _get_device_option(producer_op):
        if producer_op.type == "CopyGPUToCPU":
            return caffe2_pb2.DeviceOption()
        else:
            return producer_op.device_option

    with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
        ws.RunNetOnce(init_net)
        params = {b: fetch_any_blob(b) for b in init_net.external_output}
    ssa, versions = core.get_ssa(init_net)
    producer_map = get_producer_map(ssa)
    device_options = {
        b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
        for b in init_net.external_output
    }
    return params, device_options
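A minimal sketch of the producer lookup that the device_options dict above performs, assuming the caffe2 Python package is installed; the ConstantFill ops and blob names are made up, and the net is only inspected, not run:

from caffe2.proto import caffe2_pb2
from caffe2.python import core

init_net = caffe2_pb2.NetDef()
init_net.op.extend([
    core.CreateOperator("ConstantFill", [], ["w"], shape=[2, 2], value=1.0),
    core.CreateOperator("ConstantFill", [], ["b"], shape=[2], value=0.0),
])
init_net.external_output.extend(["w", "b"])

ssa, versions = core.get_ssa(init_net)
producer_of = {}
for op_id, (_inputs, outputs) in enumerate(ssa):
    for versioned_blob in outputs:
        producer_of[versioned_blob] = op_id

for b in init_net.external_output:
    op = init_net.op[producer_of[(b, versions[b])]]
    print(b, "is produced by", op.type, "with device_option", op.device_option)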
Example #5
def get_sub_graph_external_input_output(
    predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
    """
    Return the lists of external inputs/outputs of the sub-graph; each element
    is a tuple of the blob name and its corresponding version in predict_net.

    External input/output is defined the same way as in a caffe2 NetDef.
    """
    ssa, versions = core.get_ssa(predict_net)

    all_inputs = []
    all_outputs = []
    for op_id in sub_graph_op_indices:
        all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
        all_outputs += list(ssa[op_id][1])  # ssa output won't repeat

    # for versioned blobs, external inputs are just those blob in all_inputs
    # but not in all_outputs
    ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]

    # external outputs are essentially outputs of this subgraph that are used
    # outside of this sub-graph (including predict_net.external_output)
    all_other_inputs = sum(
        (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
        [(outp, versions[outp]) for outp in predict_net.external_output],
    )
    ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]

    return ext_inputs, ext_outputs
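A small usage sketch for the function above (assuming it is in scope and the caffe2 Python package is installed); a three-op chain where only the middle op is treated as the sub-graph:

from caffe2.proto import caffe2_pb2
from caffe2.python import core

predict_net = caffe2_pb2.NetDef()
predict_net.op.extend([
    core.CreateOperator("Relu", ["x"], ["a"]),
    core.CreateOperator("Relu", ["a"], ["b"]),
    core.CreateOperator("Relu", ["b"], ["y"]),
])
predict_net.external_input.append("x")
predict_net.external_output.append("y")

ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, [1])
# Expected roughly: ext_inputs == [("a", 1)] and ext_outputs == [("b", 1)],
# since "a" is produced outside the sub-graph and "b" is consumed by op #2.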
Example #6
    def _ssa_rewrite(cls, net, init_net, value_info):
        def ssa_name(name, version):
            return '{}_{}'.format(name, version)

        if init_net:
            for op in init_net.op:
                assert re.match('GivenTensor.*Fill', op.type), "type is {}, \n{}".format(op.type, op)
                assert len(op.output) == 1
                op.output[0] = ssa_name(op.output[0], 0)
            init_net.external_input[:] = [ssa_name(name, 0)
                                          for name in init_net.external_input]
            init_net.external_output[:] = [ssa_name(name, 0)
                                           for name in init_net.external_output]
        if value_info:
            ssa_value_info = {ssa_name(name, 0): value
                              for name, value in value_info.items()}
            value_info.clear()
            value_info.update(ssa_value_info)
        net.external_input[:] = [ssa_name(name, 0)
                                 for name in net.external_input]
        ssa, blob_versions = caffe2_core.get_ssa(net)
        assert len(net.op) == len(ssa)
        for op, (versioned_inputs, versioned_outputs) in zip(net.op, ssa):
            op.input[:] = [ssa_name(name, version)
                           for name, version in versioned_inputs]
            op.output[:] = [ssa_name(name, version)
                            for name, version in versioned_outputs]
        net.external_output[:] = [ssa_name(name, blob_versions[name])
                                  for name in net.external_output]
Example #7
    def _ssa_rewrite(cls, net, init_net, value_info):
        def ssa_name(name, version, version_cnt=None):
            if version == 0:
                return name
            if version_cnt and len(version_cnt.get(name, {})) <= 1:
                return name
            return '{}_{}'.format(name, version)

        if init_net:
            for op in init_net.op:
                assert re.match('GivenTensor.*Fill', op.type), "type is {}, \n{}".format(op.type, op)
                assert len(op.output) == 1

        ssa, blob_versions = caffe2_core.get_ssa(net)
        version_cnt = {}
        versioned_blobs = []
        for versioned_input, versioned_output in ssa:
            versioned_blobs += versioned_input
            versioned_blobs += versioned_output

        for (name, version) in versioned_blobs:
            if name not in version_cnt:
                version_cnt[name] = {version}
            else:
                version_cnt[name].add(version)

        assert len(net.op) == len(ssa)
        for op, (versioned_inputs, versioned_outputs) in zip(net.op, ssa):
            op.input[:] = [ssa_name(name, version, version_cnt)
                           for name, version in versioned_inputs]
            op.output[:] = [ssa_name(name, version, version_cnt)
                            for name, version in versioned_outputs]
        net.external_output[:] = [ssa_name(name, blob_versions[name], version_cnt)
                                  for name in net.external_output]
Example #8
def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
    """
    Identify the reshape sub-graphs in a protobuf.
    The reshape sub-graph is defined as matching the following pattern:

    (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
        └-------------------------------------------> Reshape -> (output_blob)

    Return:
        List of sub-graphs; each sub-graph is represented as a list of indices
        of the relevant ops, [Op_1, Op_2, ..., Op_N, Reshape]
    """

    ssa, _ = core.get_ssa(predict_net)

    ret = []
    for i, op in enumerate(predict_net.op):
        if op.type == "Reshape":
            assert len(op.input) == 2
            input_ssa = ssa[i][0]
            data_source = input_ssa[0]
            shape_source = input_ssa[1]
            op_indices = _get_dependency_chain(ssa, shape_source, data_source)
            ret.append(op_indices + [i])
    return ret
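The _get_dependency_chain helper called above is not shown on this page. Based on the call site (it should return the indices of the ops that produce new_shape, walking back from the Reshape's shape input toward the shared data input), one simplified stand-in is a backward walk over producers; the actual helper may differ.

def collect_upstream_ops_sketch(ssa, versioned_target, versioned_source):
    # Collect indices of ops that versioned_target depends on, stopping the
    # walk at versioned_source. Hypothetical simplified stand-in for
    # _get_dependency_chain.
    producer_of = {}
    for op_id, (_inputs, outputs) in enumerate(ssa):
        for versioned_blob in outputs:
            producer_of[versioned_blob] = op_id

    visited_ops = set()
    frontier = [versioned_target]
    while frontier:
        blob = frontier.pop()
        if blob == versioned_source or blob not in producer_of:
            continue
        op_id = producer_of[blob]
        if op_id not in visited_ops:
            visited_ops.add(op_id)
            frontier.extend(ssa[op_id][0])
    return sorted(visited_ops)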
Example #9
def Export(workspace, net, params):
    """Returns init_net and predict_net suitable for writing to disk
       and loading into a Predictor"""
    predict_net = caffe2_pb2.NetDef()
    predict_net.CopyFrom(net.Proto())
    init_net = caffe2_pb2.NetDef()
    # Populate the init_net.
    ssa, blob_versions = core.get_ssa(net)
    inputs = []
    for versioned_inputs, _ in ssa:
        inputs += [name for name, _ in versioned_inputs]

    input_blobs = [blob_name for blob_name, version in
                   blob_versions.items()
                   if version == 0 and blob_name not in params]
    # Blobs that are never used as an input to another layer,
    # i.e. strictly output blobs.
    output_blobs = [blob_name for blob_name, version in
                    blob_versions.items()
                    if version != 0 and blob_name not in inputs]

    for blob_ref in params:
        blob_name = str(blob_ref)
        blob = workspace.FetchBlob(blob_name)
        init_net.op.extend(
            [
                core.CreateOperator(
                    "GivenTensorFill", [], [blob_name],
                    arg=[
                        utils.MakeArgument("shape", blob.shape),
                        utils.MakeArgument("values", blob)
                    ]
                )
            ]
        )
    # We have to make sure the blob exists in the namespace
    # and we can do so with fake data. (Which is immediately overwritten
    # by any typical usage)
    for blob_name in input_blobs:
        init_net.op.extend(
            [
                core.CreateOperator(
                    "GivenTensorFill", [], [blob_name],
                    arg=[
                        utils.MakeArgument("shape", [1, 1]),
                        utils.MakeArgument("values", [0.0])
                    ]
                )
            ]
        )

    # Now we make input/output_blobs line up with what Predictor expects.
    del predict_net.external_input[:]
    predict_net.external_input.extend(input_blobs)
    # For populating weights
    predict_net.external_input.extend(net.Proto().external_input)
    # Ensure the output is also consistent with what we want
    del predict_net.external_output[:]
    predict_net.external_output.extend(output_blobs)
    return init_net, predict_net
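A minimal usage sketch for Export above, assuming a workspace that already holds the parameter blobs (all blob and net names here are made up):

import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
workspace.FeedBlob("w", np.random.rand(4, 3).astype(np.float32))
workspace.FeedBlob("b", np.zeros(4, dtype=np.float32))

net = core.Net("toy_predict")
net.FC(["data", "w", "b"], ["fc_out"])

# init_net fills "w"/"b" with their current values and "data" with dummy data;
# predict_net is a copy of net with external_input/external_output rewritten.
init_net, predict_net = Export(workspace, net, params=["w", "b"])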
Example #10
def _assign_device_option(predict_net: caffe2_pb2.NetDef,
                          init_net: caffe2_pb2.NetDef,
                          tensor_inputs: List[torch.Tensor]):
    """
    An ONNX-exported network doesn't have the concept of device; assign the
    necessary device option for each op in order to make it runnable on a GPU runtime.
    """
    def _get_device_type(torch_tensor):
        if torch_tensor.device.type == "cpu":
            return "cpu"
        elif torch_tensor.device.type == "cuda" and torch_tensor.device.index == 0:
            return "gpu"
        else:
            raise ValueError("Unsupported device {}".format(
                torch_tensor.device))

    def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
        for op, ssa_i in zip(net_proto.op, net_ssa):
            if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
                op.device_option.CopyFrom(core.DeviceOption(
                    caffe2_pb2.CUDA, 0))
            else:
                devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
                assert all(d == devices[0] for d in devices)
                if devices[0] == "gpu":
                    op.device_option.CopyFrom(
                        core.DeviceOption(caffe2_pb2.CUDA, 0))

    # update ops in predict_net
    predict_net_input_device_types = {
        (name, 0): _get_device_type(tensor)
        for name, tensor in zip(predict_net.external_input, tensor_inputs)
    }
    predict_net_device_types = infer_device_type(
        predict_net, known_status=predict_net_input_device_types)
    predict_net_ssa, _ = core.get_ssa(predict_net)
    _assign_op_device_option(predict_net, predict_net_ssa,
                             predict_net_device_types)

    # update ops in init_net
    init_net_ssa, versions = core.get_ssa(init_net)
    init_net_output_device_types = {(name, versions[name]):
                                    predict_net_device_types[(name, 0)]
                                    for name in init_net.external_output}
    init_net_device_types = infer_device_type(
        init_net, known_status=init_net_output_device_types)
    _assign_op_device_option(init_net, init_net_ssa, init_net_device_types)
Example #11
def Export(workspace, net, params):
    """Returns init_net and predict_net suitable for writing to disk
       and loading into a Predictor"""
    proto = net if isinstance(net, caffe2_pb2.NetDef) else net.Proto()
    predict_net = caffe2_pb2.NetDef()
    predict_net.CopyFrom(proto)
    init_net = caffe2_pb2.NetDef()
    # Populate the init_net.
    ssa, blob_versions = core.get_ssa(net)
    inputs = []
    for versioned_inputs, _ in ssa:
        inputs += [name for name, _ in versioned_inputs]

    input_blobs = [
        blob_name for blob_name, version in blob_versions.items()
        if version == 0 and blob_name not in params
    ]
    # Blobs that are never used as an input to another layer,
    # i.e. strictly output blobs.
    output_blobs = [
        blob_name for blob_name, version in blob_versions.items()
        if version != 0 and blob_name not in inputs
    ]

    for blob_ref in params:
        blob_name = str(blob_ref)
        blob = workspace.FetchBlob(blob_name)
        init_net.op.extend([
            core.CreateOperator("GivenTensorFill", [], [blob_name],
                                arg=[
                                    utils.MakeArgument("shape", blob.shape),
                                    utils.MakeArgument("values", blob)
                                ])
        ])
    # We have to make sure the blob exists in the namespace
    # and we can do so with fake data. (Which is immediately overwritten
    # by any typical usage)
    for blob_name in input_blobs:
        init_net.op.extend([
            core.CreateOperator("GivenTensorFill", [], [blob_name],
                                arg=[
                                    utils.MakeArgument("shape", [1, 1]),
                                    utils.MakeArgument("values", [0.0])
                                ])
        ])

    # Now we make input/output_blobs line up with what Predictor expects.
    del predict_net.external_input[:]
    predict_net.external_input.extend(input_blobs)
    # For populating weights
    predict_net.external_input.extend(proto.external_input)
    # Ensure the output is also consistent with what we want
    del predict_net.external_output[:]
    predict_net.external_output.extend(output_blobs)
    return init_net, predict_net
Example #12
    def _infer_output_devices(self, inputs_dict):
        def _get_device_type(torch_tensor):
            assert torch_tensor.device.type in ["cpu", "cuda"]
            assert torch_tensor.device.index == 0
            return torch_tensor.device.type

        predict_net = self.protobuf_model.net.Proto()
        input_device_types = {
            (name, 0): _get_device_type(tensor) for name, tensor in inputs_dict.items()
        }
        device_type_map = infer_device_type(
            predict_net, known_status=input_device_types, device_name_style="pytorch"
        )
        ssa, versions = core.get_ssa(predict_net)
        versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
        output_devices = [device_type_map[outp] for outp in versioned_outputs]
        return output_devices
Example #13
def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
    """
    Rename the op_id-th operator in predict_net, changing its output_id-th output's
        name to new_name. It also does automatic re-routing and changes
        external_output if necessary.
    - It allows multiple consumers of its output.
    - This function modifies predict_net in-place and doesn't need init_net.
    """
    assert isinstance(predict_net, caffe2_pb2.NetDef)

    ssa, blob_versions = core.get_ssa(predict_net)

    versioned_inputs, versioned_outputs = ssa[op_id]
    old_name, version = versioned_outputs[output_id]

    # update predict_net
    _rename_versioned_blob_in_proto(
        predict_net, old_name, new_name, version, ssa, {}, blob_versions
    )
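Both Example #1 and Example #13 delegate the actual renaming to _rename_versioned_blob_in_proto, which is not shown on this page. Judging from the call sites (the proto, the old/new names, the version to match, the proto's ssa, and maps of blob versions at the start and end of the proto), a plausible sketch is below; the actual helper may differ.

def rename_versioned_blob_sketch(proto, old_name, new_name, version,
                                 ssa, start_versions, end_versions):
    # Rename every occurrence of the versioned blob (old_name, version):
    # op inputs/outputs whose SSA version matches, plus external_input /
    # external_output when the matching version is the blob's first / last one.
    for op, (versioned_inputs, versioned_outputs) in zip(proto.op, ssa):
        for i in range(len(op.input)):
            if versioned_inputs[i] == (old_name, version):
                op.input[i] = new_name
        for i in range(len(op.output)):
            if versioned_outputs[i] == (old_name, version):
                op.output[i] = new_name
    if start_versions.get(old_name, 0) == version:
        for i in range(len(proto.external_input)):
            if proto.external_input[i] == old_name:
                proto.external_input[i] = new_name
    if end_versions.get(old_name, 0) == version:
        for i in range(len(proto.external_output)):
            if proto.external_output[i] == old_name:
                proto.external_output[i] = new_name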
Example #14
    def _fuse_once(predict_net):
        ssa, blob_versions = core.get_ssa(predict_net)
        consumer_map = get_consumer_map(ssa)
        versioned_external_output = [
            (name, blob_versions[name]) for name in predict_net.external_output
        ]

        for op_id, op in enumerate(predict_net.op):
            if op.type in _COPY_OPS:
                fw_copy_versioned_output = ssa[op_id][1][0]
                consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
                reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]

                is_fusable = (
                    len(consumer_ids) > 0
                    and fw_copy_versioned_output not in versioned_external_output
                    and all(
                        predict_net.op[_op_id].type == reverse_op_type
                        and ssa[_op_id][1][0] not in versioned_external_output
                        for _op_id in consumer_ids
                    )
                )

                if is_fusable:
                    for rv_copy_op_id in consumer_ids:
                        # make each downstream op consume the Copy op's input directly, then remove the Copy ops
                        rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
                        next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
                        predict_net.op[next_op_id].input[inp_id] = op.input[0]
                    # remove CopyOps
                    new_ops = [
                        op
                        for i, op in enumerate(predict_net.op)
                        if i != op_id and i not in consumer_ids
                    ]
                    del predict_net.op[:]
                    predict_net.op.extend(new_ops)
                    return True

        return False
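_fuse_once removes at most one forward/backward copy pair per call and reports whether it changed anything, so the surrounding export code presumably calls it in a loop. A minimal driver sketch, assuming _fuse_once above is in scope:

def fuse_copy_ops_sketch(predict_net):
    # Keep applying the single-step fusion above until a fixed point is reached.
    while _fuse_once(predict_net):
        pass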
Example #15
def get_external_blob_names(net, lexical_scope):
    """
    Returns a set of blobs a given net depends on and a set of
    output blobs that are written by the net
    Inputs:
        net - net to return input/output blobs for;
        lexical_scope - all external blob names visible to the net
    """
    net_proto = net.Proto()
    net_ssa, _ = core.get_ssa(net_proto)
    input_names = core.get_undefined_blobs(net_ssa)
    if net_proto.external_input:
        input_names |= set(net_proto.external_input)

    output_names = set()
    if net_proto.external_output:
        output_names = set(net_proto.external_output)
    for op in net_proto.op:
        for output in op.output:
            if output in lexical_scope:
                output_names.add(output)

    return input_names, output_names
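A small usage sketch for the variant above, assuming the caffe2 Python package is installed; the net and blob names are made up:

from caffe2.python import core

step_net = core.Net("step")
step_net.Add(["a", "b"], ["c"])
step_net.Relu(["c"], ["d"])

# lexical_scope is the set of blob names visible to this net from outside.
input_names, output_names = get_external_blob_names(step_net, {"a", "b", "d"})
# input_names should cover the undefined blobs {"a", "b"} (plus any declared
# external_input); output_names should be {"d"}, since "c" is not in scope.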
Example #16
def get_external_blob_names(net, lexical_scope):
    """
    Returns a set of blobs a given net depends on and a set of
    output blobs that are written by the net
    Inputs:
        net - net to return input/output blobs for;
        lexical_scope - all external blob names visible to the net
    """
    # Use the blobs that are actually read/written to as external inputs/outputs
    net_proto = net.Proto()
    net_ssa, _ = core.get_ssa(net_proto)
    input_names = core.get_undefined_blobs(net_ssa)
    for input_name in input_names:
        assert str(input_name) in lexical_scope, \
            "Input blob " + input_name + " is undefined"

    output_names = set()
    for op in net_proto.op:
        for output in op.output:
            if output in lexical_scope:
                output_names.add(output)

    return input_names, output_names
Example #18
    def _infer_output_devices(self, inputs):
        """
        Returns:
            list[str]: list of device for each external output
        """
        def _get_device_type(torch_tensor):
            assert torch_tensor.device.type in ["cpu", "cuda"]
            assert torch_tensor.device.index == 0
            return torch_tensor.device.type

        predict_net = self.net.Proto()
        input_device_types = {
            (name, 0): _get_device_type(tensor)
            for name, tensor in zip(self._input_blobs, inputs)
        }
        device_type_map = infer_device_type(predict_net,
                                            known_status=input_device_types,
                                            device_name_style="pytorch")
        ssa, versions = core.get_ssa(predict_net)
        versioned_outputs = [(name, versions[name])
                             for name in predict_net.external_output]
        output_devices = [device_type_map[outp] for outp in versioned_outputs]
        return output_devices
Example #19
def recurrent_net(
        net, cell_net, inputs, initial_cell_inputs,
        links, scratch_sizes,
        timestep=None, scope=None
):
    '''
    net: the main net that the operator should be added to

    cell_net: the net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one input
    is supported. It has to be in the format T x N x (D1...Dk), where T is the
    length of the sequence, N is the batch size and (D1...Dk) are the remaining dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    The format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary mapping cell_net input names at moment t+1 to
    output names at moment t. Currently we assume that each output becomes
    an input for the next timestep.

    scratch_sizes: sizes of the scratch blobs. Scratch blobs are those
    intermediate blobs of the cell_net which are used in the backward pass.
    We use the size information to preallocate memory for them over time.

    For example, in the case of LSTM we have an FC -> Sum -> LSTMUnit sequence of
    operations in each iteration of the cell net. The output of Sum is an
    intermediate blob that is also part of the backward pass.
    Thus it is a scratch blob whose size we must provide.

    timestep: name of the timestep blob to be used. If not provided "timestep"
    is used.

    scope: Internal blobs are going to be scoped in a format
    <scope_name>/<blob_name>
    If not provided we generate a scope name automatically
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references
    # it is those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        b for b in cell_net.Proto().external_input
        if b not in known_inputs]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
        cell_net.Proto().op, inner_outputs_map)
    backward_mapping = {str(k): str(v) for k, v in backward_mapping.items()}
    backward_cell_net = core.Net("RecurrentBackwardStep")

    del backward_cell_net.Proto().op[:]
    backward_cell_net.Proto().op.extend(backward_ops)

    # compute blobs used but not defined in the backward pass
    ssa, _ = core.get_ssa(backward_cell_net.Proto())
    undefined = core.get_undefined_blobs(ssa)

    # also add to the output list the intermediate outputs of fwd_step that
    # are used by backward.
    ssa, blob_versions = core.get_ssa(cell_net.Proto())
    scratches = [
        blob for (blob, ver) in blob_versions.items()
        if ver > 0 and
        blob in undefined and
        blob not in cell_net.Proto().external_output]

    all_inputs = [i[1] for i in inputs] + [
        x[1] for x in initial_cell_inputs] + references
    all_outputs = []

    cell_net.Proto().type = 'simple'
    backward_cell_net.Proto().type = 'simple'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format blob_name, recurrent_states, offset.
    # In the moment t we know that corresponding data block is at
    # t + offset position in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to external world
    # Format (internal_blob, external_blob, offset)
    # Negative offset stands for going from the end,
    # positive - from the beginning
    aliases = []
    backward_aliases = []

    # States held inputs to the cell net
    recurrent_states = []

    # a map from gradient blob name to blob with its value over time
    grad_to_state = {}

    # A mapping from a blob to its gradient state blob

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))
        backward_links.append((cell_input + "_grad", states_grad, 0))
        backward_links.append((cell_output + "_grad", states_grad, 1))

        backward_cell_net.Proto().external_input.append(
            str(cell_output) + "_grad")
        aliases.append((state, cell_output + "_last", -1))
        aliases.append((state, cell_output + "_all", 1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

    for scratch in scratches:
        # no scoping as scratches should be already scoped
        forward_links.append((scratch, scratch + "_states", 0))
        grad_blob = scratch + "_grad"
        states_grad_blob = scratch + "_states_grad"
        backward_links.append((grad_blob, states_grad_blob, 0))
        backward_cell_net.Proto().external_input.append(scratch)
        grad_to_state[grad_blob] = states_grad_blob

    input_gradient_ids = []
    for input_id, (input_t, input_blob) in enumerate(inputs):
        forward_links.append((str(input_t), str(input_blob), 0))

        input_blob_grad = str(input_blob) + "_grad"
        if backward_mapping[str(input_t)] != str(input_t) + "_grad":
            # Some scratch (internal blob) ends up being an input gradient
            # So we avoid extra copy and reuse it by applying this alias
            backward_aliases.append((
                grad_to_state[backward_mapping[str(input_t)]],
                input_blob_grad,
                0
            ))
        else:
            # This is a general case - we have to explicitly create input
            # gradient blob as it doesn't match any of internal gradients
            backward_links.append(
                (str(input_t) + "_grad", input_blob_grad, 0))
            input_gradient_ids.append(input_id)

    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_input)
    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Splitting into separate lists so we can pass them to C++,
    # where we assemble them back
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    backward_link_internal, backward_link_external, backward_link_offset = \
        unpack_triple(backward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)
    backward_alias_src, backward_alias_dst, backward_alias_offset = \
        unpack_triple(backward_aliases)

    params = [x for x in references if x in backward_mapping.keys()]

    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    return net.RecurrentNetwork(
        all_inputs,
        all_outputs,
        param=[all_inputs.index(p) for p in params],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        recurrent_inputs=recurrent_inputs,
        recurrent_input_ids=[all_inputs.index(i) for i in recurrent_inputs],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        backward_link_internal=[str(l) for l in backward_link_internal],
        backward_link_external=[str(l) for l in backward_link_external],
        backward_link_offset=backward_link_offset,
        backward_alias_src=backward_alias_src,
        backward_alias_dst=backward_alias_dst,
        backward_alias_offset=backward_alias_offset,
        scratch=[sc + "_states" for sc in scratches],
        backward_scratch=[sc + "_states_grad" for sc in scratches],
        scratch_sizes=scratch_sizes,
        step_net=str(cell_net.Proto()),
        backward_step_net=str(backward_cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        input_gradient_ids=input_gradient_ids,
    )
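To make the argument formats described in the docstring above concrete, here is a hypothetical shape of initial_cell_inputs and links for a single recurrent hidden state (all names are made up for illustration):

# (cell_net input name, external blob holding the initial value)
initial_cell_inputs = [
    ("hidden_t_prev", "hidden_init"),
]

# cell_net input at step t+1  ->  cell_net output at step t
links = {
    "hidden_t_prev": "hidden_t",
}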
Example #20
def remove_reshape_for_fc(predict_net, params):
    """
    In PyTorch, nn.Linear has to take a 2D tensor; this often leads to reshaping
        a 4D tensor to 2D by calling .view(). However, this (dynamic) reshaping
        doesn't work well with ONNX and Int8 tools, and causes extra
        ops (e.g. ExpandDims) that might not be available on mobile.
    Luckily Caffe2 supports a 4D tensor for FC, so we can remove those reshapes
        after exporting the ONNX model.
    """
    from caffe2.python import core

    # find all reshape sub-graphs that can be removed, which for now is every Reshape
    # sub-graph whose output is only consumed by FC.
    # TODO: to make it safer, we may need the actual value to better determine
    # if a Reshape before FC is removable.
    reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
    sub_graphs_to_remove = []
    for reshape_sub_graph in reshape_sub_graphs:
        reshape_op_id = reshape_sub_graph[-1]
        assert predict_net.op[reshape_op_id].type == "Reshape"
        ssa, _ = core.get_ssa(predict_net)
        reshape_output = ssa[reshape_op_id][1][0]
        consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
        if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
            # safety check if the sub-graph is isolated, for this reshape sub-graph,
            # it means it has one non-param external input and one external output.
            ext_inputs, ext_outputs = get_sub_graph_external_input_output(
                predict_net, reshape_sub_graph
            )
            non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
            if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
                sub_graphs_to_remove.append(reshape_sub_graph)

    # perform the sub-graph removal by:
    # 1: renaming the Reshape's output to its input, so the graph can be
    #   seen as an in-place identity, meaning its external input/output are the same.
    # 2: simply remove those ops.
    remove_op_ids = []
    params_to_remove = []
    for sub_graph in sub_graphs_to_remove:
        logger.info(
            "Remove Reshape sub-graph:\n{}".format(
                "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
            )
        )
        reshape_op_id = sub_graph[-1]
        new_reshape_output = predict_net.op[reshape_op_id].input[0]
        rename_op_output(predict_net, reshape_op_id, 0, new_reshape_output)
        ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
        non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
        params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
        assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
        assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
        assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
        remove_op_ids.extend(sub_graph)
        params_to_remove.extend(params_ext_inputs)

    predict_net = copy.deepcopy(predict_net)
    new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
    del predict_net.op[:]
    predict_net.op.extend(new_ops)
    for versioned_params in params_to_remove:
        name = versioned_params[0]
        logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
        del params[name]
        predict_net.external_input.remove(name)

    return predict_net, params
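A short usage sketch; note that the function deep-copies predict_net before dropping ops, so callers should use the returned net and params rather than assuming a purely in-place update (a minimal sketch, assuming predict_net and params come from an earlier export step):

# predict_net: caffe2_pb2.NetDef produced by the export,
# params: dict of parameter arrays, e.g. from get_params_from_init_net.
predict_net, params = remove_reshape_for_fc(predict_net, params)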
Example #21
def recurrent_net(net,
                  cell_net,
                  inputs,
                  initial_cell_inputs,
                  links,
                  scratch_sizes,
                  timestep=None,
                  scope=None):
    '''
    net: the main net that the operator should be added to

    cell_net: the net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one input
    is supported. It has to be in the format T x N x (D1...Dk), where T is the
    length of the sequence, N is the batch size and (D1...Dk) are the remaining dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    The format for each input is:
        (cell_net_input_name, external_blob_with_data, input_size)

    links: a dictionary mapping cell_net input names at moment t+1 to
    output names at moment t. Currently we assume that each output becomes
    an input for the next timestep.

    scratch_sizes: sizes of the scratch blobs. Scratch blobs are those
    intermediate blobs of the cell_net which are used in the backward pass.
    We use the size information to preallocate memory for them over time.

    For example, in the case of LSTM we have an FC -> Sum -> LSTMUnit sequence of
    operations in each iteration of the cell net. The output of Sum is an
    intermediate blob that is also part of the backward pass.
    Thus it is a scratch blob whose size we must provide.

    timestep: name of the timestep blob to be used. If not provided "timestep"
    is used.

    scope: Internal blobs are going to be scoped in a format
    <scope_name>/<blob_name>
    If not provided we generate a scope name automatically
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references
    # it is those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        b for b in cell_net.Proto().external_input if b not in known_inputs
    ]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
        cell_net.Proto().op, inner_outputs_map)
    backward_mapping = {str(k): str(v) for k, v in backward_mapping.items()}
    backward_cell_net = core.Net("RecurrentBackwardStep")

    del backward_cell_net.Proto().op[:]
    backward_cell_net.Proto().op.extend(backward_ops)

    # compute blobs used but not defined in the backward pass
    ssa, _ = core.get_ssa(backward_cell_net.Proto())
    undefined = core.get_undefined_blobs(ssa)

    # also add to the output list the intermediate outputs of fwd_step that
    # are used by backward.
    ssa, blob_versions = core.get_ssa(cell_net.Proto())
    scratches = [
        blob for (blob, ver) in blob_versions.items() if ver > 0
        and blob in undefined and blob not in cell_net.Proto().external_output
    ]

    all_inputs = [i[1] for i in inputs] + [x[1] for x in initial_cell_inputs
                                           ] + references
    all_outputs = []

    cell_net.Proto().type = 'simple'
    backward_cell_net.Proto().type = 'simple'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format blob_name, recurrent_states, offset.
    # In the moment t we know that corresponding data block is at
    # t + offset position in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to external world
    # Format (internal_blob, external_blob, offset)
    # Negative offset stands for going from the end,
    # positive - from the beginning
    aliases = []
    backward_aliases = []

    # States held inputs to the cell net
    recurrent_states = []

    # a map from gradient blob name to blob with its value over time
    grad_to_state = {}

    # A mapping from a blob to its gradient state blob

    for cell_input, _, size in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))
        backward_links.append((cell_input + "_grad", states_grad, 0))
        backward_links.append((cell_output + "_grad", states_grad, 1))

        backward_cell_net.Proto().external_input.append(
            str(cell_output) + "_grad")
        aliases.append((state, cell_output + "_last", -1))
        aliases.append((state, cell_output + "_all", 1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

    for scratch in scratches:
        # no scoping as scratches should be already scoped
        forward_links.append((scratch, scratch + "_states", 0))
        grad_blob = scratch + "_grad"
        states_grad_blob = scratch + "_states_grad"
        backward_links.append((grad_blob, states_grad_blob, 0))
        backward_cell_net.Proto().external_input.append(scratch)
        grad_to_state[grad_blob] = states_grad_blob

    input_gradient_ids = []
    for input_id, (input_t, input_blob) in enumerate(inputs):
        forward_links.append((str(input_t), str(input_blob), 0))

        input_blob_grad = str(input_blob) + "_grad"
        if backward_mapping[str(input_t)] != str(input_t) + "_grad":
            # Some scratch (internal blob) ends up being an input gradient
            # So we avoid extra copy and reuse it by applying this alias
            backward_aliases.append(
                (grad_to_state[backward_mapping[str(input_t)]],
                 input_blob_grad, 0))
        else:
            # This is a general case - we have to explicitly create input
            # gradient blob as it doesn't match any of internal gradients
            backward_links.append((str(input_t) + "_grad", input_blob_grad, 0))
            input_gradient_ids.append(input_id)

    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_input)
    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Splitting into separate lists so we can pass them to C++,
    # where we assemble them back
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    backward_link_internal, backward_link_external, backward_link_offset = \
        unpack_triple(backward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)
    backward_alias_src, backward_alias_dst, backward_alias_offset = \
        unpack_triple(backward_aliases)

    params = [x for x in references if x in backward_mapping.keys()]

    return net.RecurrentNetwork(
        all_inputs,
        all_outputs,
        param=params,
        param_gradient=[backward_mapping[p] for p in params],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        recurrent_inputs=[str(x[1]) for x in initial_cell_inputs],
        recurrent_sizes=[int(x[2]) for x in initial_cell_inputs],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        backward_link_internal=[str(l) for l in backward_link_internal],
        backward_link_external=[str(l) for l in backward_link_external],
        backward_link_offset=backward_link_offset,
        backward_alias_src=backward_alias_src,
        backward_alias_dst=backward_alias_dst,
        backward_alias_offset=backward_alias_offset,
        scratch=[sc + "_states" for sc in scratches],
        backward_scratch=[sc + "_states_grad" for sc in scratches],
        scratch_sizes=scratch_sizes,
        step_net=str(cell_net.Proto()),
        backward_step_net=str(backward_cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        input_gradient_ids=input_gradient_ids,
    )
Example #22
def recurrent_net(
        net, cell_net, inputs, initial_cell_inputs,
        links, timestep=None, scope=None, outputs_with_grads=(0,),
        recompute_blobs_on_backward=None, forward_only=False,
):
    '''
    net: the main net that the operator should be added to

    cell_net: the net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one input
    is supported. It has to be in the format T x N x (D1...Dk), where T is the
    length of the sequence, N is the batch size and (D1...Dk) are the remaining dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    The format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary mapping cell_net input names at moment t+1 to
    output names at moment t. Currently we assume that each output becomes
    an input for the next timestep.

    timestep: name of the timestep blob to be used. If not provided "timestep"
    is used.

    scope: Internal blobs are going to be scoped in a format
    <scope_name>/<blob_name>
    If not provided we generate a scope name automatically

    outputs_with_grads : position indices of output blobs which will receive
    error gradient (from outside recurrent network) during backpropagation

    recompute_blobs_on_backward: specify a list of blobs that will be
                 recomputed for the backward pass, and thus need not be
                 stored for each forward timestep.

    forward_only: if True, only forward steps are executed
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references
    # it is those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        core.BlobReference(b) for b in cell_net.Proto().external_input
        if b not in known_inputs]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    if not forward_only:
        backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
            cell_net.Proto().op, inner_outputs_map)
        backward_mapping = {str(k): v for k, v in viewitems(backward_mapping)}

        backward_cell_net = core.Net("RecurrentBackwardStep")
        del backward_cell_net.Proto().op[:]

        if recompute_blobs_on_backward is not None:
            # Insert operators to re-compute the specified blobs.
            # They are added in the same order as for the forward pass, thus
            # the order is correct.
            recompute_blobs_on_backward = {str(b) for b in
                                           recompute_blobs_on_backward}

            for op in cell_net.Proto().op:
                if not recompute_blobs_on_backward.isdisjoint(set(op.output)):
                    backward_cell_net.Proto().op.extend([op])
                    # This fires if other outputs than the declared
                    # are computed by the ops that are recomputed
                    assert set(op.output).issubset(recompute_blobs_on_backward)

        backward_cell_net.Proto().op.extend(backward_ops)
        # compute blobs used but not defined in the backward pass
        backward_ssa, backward_blob_versions = core.get_ssa(
            backward_cell_net.Proto())
        undefined = core.get_undefined_blobs(backward_ssa)

        # also add to the output list the intermediate outputs of fwd_step that
        # are used by backward.
        ssa, blob_versions = core.get_ssa(cell_net.Proto())
        scratches = [
            blob
            for blob, ver in viewitems(blob_versions)
            if (ver > 0 and
                blob in undefined and
                blob not in cell_net.Proto().external_output)
        ]
        backward_cell_net.Proto().external_input.extend(scratches)
        backward_cell_net.Proto().type = 'simple'
    else:
        backward_cell_net = None

    all_inputs = [i[1] for i in inputs] + [
        x[1] for x in initial_cell_inputs] + references
    all_outputs = []

    cell_net.Proto().type = 'simple'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format blob_name, recurrent_states, offset.
    # In the moment t we know that corresponding data block is at
    # t + offset position in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to external world
    # Format (internal_blob, external_blob, offset)
    # Negative offset stands for going from the end,
    # positive - from the beginning
    aliases = []

    # States held inputs to the cell net
    recurrent_states = []

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))

        aliases.append((state, cell_output + "_all", 1))
        aliases.append((state, cell_output + "_last", -1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

        if backward_cell_net is not None:
            backward_links.append((cell_output + "_grad", states_grad, 1))
            backward_cell_net.Proto().external_input.append(
                str(cell_output) + "_grad")

            recurrent_input_grad = cell_input + "_grad"
            if not backward_blob_versions.get(recurrent_input_grad, 0):
                # If nobody writes to this recurrent input gradient, we need
                # to make sure it gets to the states grad blob after all.
                # We do this by using backward_links which triggers an alias
                # This logic is being used for example in a SumOp case
                backward_links.append(
                    (backward_mapping[cell_input], states_grad, 0))
            else:
                backward_links.append((recurrent_input_grad, states_grad, 0))


    for input_t, input_blob in inputs:
        forward_links.append((str(input_t), str(input_blob), 0))

    if backward_cell_net is not None:
        for input_t, input_blob in inputs:
            backward_links.append((
                backward_mapping[str(input_t)], str(input_blob) + "_grad", 0
            ))
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_input)
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Splitting into separate lists so we can pass them to C++,
    # where we assemble them back
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)

    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    # Make sure that recurrent gradients accumulate with internal gradients
    # (if a blob in the backward_cell_net receives gradient from both an
    # external connection as well as from within the backward_cell_net,
    # those gradients need to be added together, rather than one overwriting
    # the other)
    if backward_cell_net is not None:
        proto = backward_cell_net.Proto()
        operators = []
        while len(proto.op) > 0:
            op = proto.op[-1]
            proto.op.remove(op)
            operators.append(op)
        for op in operators[::-1]:
            proto.op.extend([op])
            for j, output_blob in enumerate(op.output):
                if output_blob in proto.external_input:
                    # In place operation won't cause issues because it takes
                    # existing value of a blob into account
                    if output_blob in op.input:
                        continue
                    output_blob = core.BlobReference(output_blob)
                    accum_blob = output_blob + "_accum"
                    proto.op[-1].output[j] = str(accum_blob)
                    backward_cell_net.Sum(
                        [output_blob, accum_blob],
                        [output_blob],
                    )

    def map_to_dual_list(m):
        return [str(x) for x in list(m.keys())] + \
               [str(x) for x in list(m.values())]

    backward_args = {}
    if backward_cell_net is not None:
        backward_mapping_keys = set(viewkeys(backward_mapping))
        backward_link_internal, backward_link_external, backward_link_offset = \
            unpack_triple(backward_links)
        params = [x for x in references if x in backward_mapping_keys]
        param_grads = [
            str(backward_mapping[x])
            for x in references
            if x in backward_mapping_keys
        ]
        if recompute_blobs_on_backward is None:
            recompute_blobs_on_backward = set()
        backward_args = {
            'param': [all_inputs.index(p) for p in params],
            'backward_link_internal': [str(l) for l in backward_link_internal],
            'backward_link_external': [str(l) for l in backward_link_external],
            'backward_link_offset': backward_link_offset,
            'outputs_with_grads': outputs_with_grads,
            'recompute_blobs_on_backward': [
                str(b) for b in recompute_blobs_on_backward
            ],
            'param_grads': param_grads,
        }
        if len(backward_cell_net.Proto().op) != 0:
            backward_args['backward_step_net'] = backward_cell_net.Proto()


    results = net.RecurrentNetwork(
        all_inputs,
        all_outputs + [s("step_workspaces")],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(i) for i in recurrent_inputs
        ],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        enable_rnn_executor=1,
        step_net=cell_net.Proto(),
        timestep="timestep" if timestep is None else str(timestep),
        **backward_args
    )

    # Restore net type since 'rnn' is not recognized outside RNNs
    cell_net.Proto().type = 'simple'

    # The last output is a list of step workspaces,
    # which is only needed internally for gradient propagation
    return results[:-1]
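
A hypothetical minimal usage sketch of the recurrent_net defined above: a step net
consisting of a single Sum op, driven over a T x N x D sequence. The blob names, the
ModelHelper scaffolding and the toy cell itself are illustrative assumptions, not part
of the examples in this document.

from caffe2.python import core, model_helper

model = model_helper.ModelHelper(name="rnn_example")
seq = model.net.AddExternalInput("seq")   # sequence data, shape T x N x D
h0 = model.net.AddExternalInput("h0")     # initial state, shape 1 x N x D

step_net = core.Net("rnn_example_cell")
input_t, hidden_t_prev = step_net.AddScopedExternalInputs(
    "input_t", "hidden_t_prev")
hidden_t = step_net.Sum([input_t, hidden_t_prev], "hidden_t")
step_net.AddExternalOutput(hidden_t)

# hidden_all stacks the state over all timesteps; hidden_last is the final state
hidden_all, hidden_last = recurrent_net(
    net=model.net,
    cell_net=step_net,
    inputs=[(input_t, seq)],
    initial_cell_inputs=[(hidden_t_prev, h0)],
    links={str(hidden_t_prev): str(hidden_t)},
    scope="rnn_example",
)

With outputs_with_grads left at its default (0,), gradients from the surrounding model
flow back through the stacked output (hidden_all) when a backward pass is built.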
Example #23
0
def _generic_status_identifier(
    predict_net: caffe2_pb2.NetDef,
    status_updater: Callable,
    known_status: Dict[Tuple[str, int], Any],
) -> Dict[Tuple[str, int], Any]:
    """
    Statically infer the status of each blob, the status can be such as device type
        (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
        is versioned blob (Tuple[str, int]) in the format compatible with ssa.
    Inputs:
        predict_net: the caffe2 network
        status_updater: a callable, given an op and the status of its input/output,
            it returns the updated status of input/output. `None` is used for
            representing unknown status.
        known_status: a dict containing known status, used as initialization.
    Outputs:
        A dict mapping from versioned blob to its status
    """
    ssa, versions = core.get_ssa(predict_net)
    versioned_ext_input = [(b, 0) for b in predict_net.external_input]
    versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
    all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])

    allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)
    assert all(k in allowed_vbs for k in known_status)
    assert all(v is not None for v in known_status.values())
    _known_status = copy.deepcopy(known_status)

    def _check_and_update(key, value):
        assert value is not None
        if key in _known_status:
            if _known_status[key] != value:
                raise RuntimeError(
                    "Conflicting status for {}: existing status {}, new status {}".format(
                        key, _known_status[key], value
                    )
                )
        _known_status[key] = value

    def _update_i(op, ssa_i):
        versioned_inputs = ssa_i[0]
        versioned_outputs = ssa_i[1]

        inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
        outputs_status = [_known_status.get(b, None) for b in versioned_outputs]

        new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)

        for versioned_blob, status in zip(
            versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
        ):
            if status is not None:
                _check_and_update(versioned_blob, status)

    for op, ssa_i in zip(predict_net.op, ssa):
        _update_i(op, ssa_i)
    for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
        _update_i(op, ssa_i)

    # NOTE: This strictly checks that every blob from predict_net is assigned
    # a known status. However, sometimes this is impossible (e.g. with dead-end
    # ops); we may relax this constraint if needed.
    for k in all_versioned_blobs:
        if k not in _known_status:
            raise NotImplementedError(
                "Can not infer the status for {}. Currently only support the case where"
                " a single forward and backward pass can identify status for all blobs.".format(k)
            )

    return _known_status
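
To make the status_updater contract concrete, below is a minimal sketch of a
device-type propagator. It assumes, purely for illustration, that CopyCPUToGPU and
CopyGPUToCPU are the only device-crossing ops and that every other op keeps all of
its blobs on a single device; the op names and the seeding shown in the commented-out
call are assumptions, not taken from the example above.

def device_type_updater(op, inputs_status, outputs_status):
    if op.type == "CopyCPUToGPU":
        return ["cpu"] * len(inputs_status), ["gpu"] * len(outputs_status)
    if op.type == "CopyGPUToCPU":
        return ["gpu"] * len(inputs_status), ["cpu"] * len(outputs_status)
    # Same-device op: propagate the single known status (if any) to every slot.
    known = {s for s in inputs_status + outputs_status if s is not None}
    if len(known) == 1:
        status = known.pop()
        return [status] * len(inputs_status), [status] * len(outputs_status)
    return inputs_status, outputs_status

# Seed the inference with the (version-0) external inputs, e.g.:
# statuses = _generic_status_identifier(
#     predict_net,
#     device_type_updater,
#     known_status={(name, 0): "cpu" for name in predict_net.external_input},
# )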
Example #24
0
def recurrent_net(net,
                  cell_net,
                  inputs,
                  initial_cell_inputs,
                  links,
                  timestep=None,
                  scope=None,
                  outputs_with_grads=(0, )):
    '''
    net: the main net the operator should be added to

    cell_net: cell_net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one input
    is supported. It has to be in the format T x N x (D1...Dk), where T is the
    length of the sequence, N is the batch size and (D1...Dk) are the remaining
    dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    Format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary mapping cell_net input names at moment t+1 to
    output names at moment t. Currently we assume that each output becomes
    an input for the next timestep.

    timestep: name of the timestep blob to be used. If not provided, "timestep"
    is used.

    scope: internal blobs are going to be scoped in the format
    <scope_name>/<blob_name>
    If not provided, we generate a scope name automatically

    outputs_with_grads: position indices of output blobs which will receive
    error gradient (from outside the recurrent network) during backpropagation
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    # Validate scoping
    for einp in cell_net.Proto().external_input:
        assert einp.startswith(CurrentNameScope()), \
            '''
            Cell net external inputs are not properly scoped, use
            AddScopedExternalInputs() when creating them
            '''

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references,
    # i.e. those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        core.BlobReference(b) for b in cell_net.Proto().external_input
        if b not in known_inputs
    ]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
        cell_net.Proto().op, inner_outputs_map)
    backward_mapping = {str(k): str(v) for k, v in backward_mapping.items()}
    backward_cell_net = core.Net("RecurrentBackwardStep")

    del backward_cell_net.Proto().op[:]
    backward_cell_net.Proto().op.extend(backward_ops)

    # compute blobs used but not defined in the backward pass
    ssa, _ = core.get_ssa(backward_cell_net.Proto())
    undefined = core.get_undefined_blobs(ssa)

    # also add to the output list the intermediate outputs of fwd_step that
    # are used by backward.
    ssa, blob_versions = core.get_ssa(cell_net.Proto())
    scratches = [
        blob for (blob, ver) in blob_versions.items() if ver > 0
        and blob in undefined and blob not in cell_net.Proto().external_output
    ]
    backward_cell_net.Proto().external_input.extend(scratches)

    all_inputs = [i[1] for i in inputs] + [x[1] for x in initial_cell_inputs
                                           ] + references
    all_outputs = []

    cell_net.Proto().type = 'simple'
    backward_cell_net.Proto().type = 'simple'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format (blob_name, recurrent_states, offset).
    # At moment t we know that the corresponding data block is at
    # position t + offset in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to the external world
    # Format: (internal_blob, external_blob, offset)
    # A negative offset counts from the end,
    # a positive one from the beginning
    aliases = []

    # State blobs holding the recurrent inputs to the cell net
    recurrent_states = []

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))
        backward_links.append((cell_input + "_grad", states_grad, 0))
        backward_links.append((cell_output + "_grad", states_grad, 1))

        backward_cell_net.Proto().external_input.append(
            str(cell_output) + "_grad")
        aliases.append((state, cell_output + "_all", 1))
        aliases.append((state, cell_output + "_last", -1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

    for input_t, input_blob in inputs:
        forward_links.append((str(input_t), str(input_blob), 0))
        backward_links.append(
            (backward_mapping[str(input_t)], str(input_blob) + "_grad", 0))
    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_input)
    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Split into separate lists so we can pass them to C++,
    # where they are assembled back together
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    backward_link_internal, backward_link_external, backward_link_offset = \
        unpack_triple(backward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)

    params = [x for x in references if x in backward_mapping.keys()]
    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    global _workspace_seq
    results = net.RecurrentNetwork(
        all_inputs,
        all_outputs + [s("step_workspaces_{}".format(_workspace_seq))],
        param=[all_inputs.index(p) for p in params],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(i) for i in recurrent_inputs
        ],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        backward_link_internal=[str(l) for l in backward_link_internal],
        backward_link_external=[str(l) for l in backward_link_external],
        backward_link_offset=backward_link_offset,
        step_net=str(cell_net.Proto()),
        backward_step_net=str(backward_cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        outputs_with_grads=outputs_with_grads,
    )
    _workspace_seq += 1
    # The last output is a list of step workspaces,
    # which is only needed internally for gradient propagation
    return results[:-1]
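
For reference, here is a toy illustration of the link and alias triples built above
for a single recurrent state, and of how unpack_triple splits them into the parallel
lists that RecurrentNetwork expects. The blob names ("hidden_t_prev", "hidden_t",
the "rnn" scope) are assumed for the example and are not taken from the source.

forward_links = [
    ("hidden_t_prev", "rnn/hidden_t_prev_states", 0),  # read the state at t
    ("hidden_t",      "rnn/hidden_t_prev_states", 1),  # write the state at t + 1
]
aliases = [
    ("rnn/hidden_t_prev_states", "hidden_t_all", 1),    # expose all timesteps
    ("rnn/hidden_t_prev_states", "hidden_t_last", -1),  # expose only the last one
]

def unpack_triple(x):
    if x:
        a, b, c = zip(*x)
        return a, b, c
    return [], [], []

link_internal, link_external, link_offset = unpack_triple(forward_links)
# link_internal == ("hidden_t_prev", "hidden_t")
# link_external == ("rnn/hidden_t_prev_states", "rnn/hidden_t_prev_states")
# link_offset   == (0, 1)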
Example #25
0
def recurrent_net(
        net,
        cell_net,
        inputs,
        initial_cell_inputs,
        links,
        timestep=None,
        scope=None,
        outputs_with_grads=(0, ),
        recompute_blobs_on_backward=None,
):
    '''
    net: the main net the operator should be added to

    cell_net: cell_net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one input
    is supported. It has to be in the format T x N x (D1...Dk), where T is the
    length of the sequence, N is the batch size and (D1...Dk) are the remaining
    dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    Format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary mapping cell_net input names at moment t+1 to
    output names at moment t. Currently we assume that each output becomes
    an input for the next timestep.

    timestep: name of the timestep blob to be used. If not provided, "timestep"
    is used.

    scope: internal blobs are going to be scoped in the format
    <scope_name>/<blob_name>
    If not provided, we generate a scope name automatically

    outputs_with_grads: position indices of output blobs which will receive
    error gradient (from outside the recurrent network) during backpropagation

    recompute_blobs_on_backward: a list of blobs that will be recomputed
                 for the backward pass, and thus need not be stored for
                 each forward timestep.
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    # Validate scoping
    for einp in cell_net.Proto().external_input:
        assert einp.startswith(CurrentNameScope()), \
            '''
            Cell net external inputs are not properly scoped, use
            AddScopedExternalInputs() when creating them
            '''

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references,
    # i.e. those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        core.BlobReference(b) for b in cell_net.Proto().external_input
        if b not in known_inputs
    ]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
        cell_net.Proto().op, inner_outputs_map)
    backward_mapping = {str(k): v for k, v in backward_mapping.items()}
    backward_cell_net = core.Net("RecurrentBackwardStep")
    del backward_cell_net.Proto().op[:]

    if recompute_blobs_on_backward is not None:
        # Insert operators to re-compute the specified blobs.
        # They are added in the same order as for the forward pass, thus
        # the order is correct.
        recompute_blobs_on_backward = set(
            [str(b) for b in recompute_blobs_on_backward])
        for op in cell_net.Proto().op:
            if not recompute_blobs_on_backward.isdisjoint(set(op.output)):
                backward_cell_net.Proto().op.extend([op])
                assert set(op.output).issubset(recompute_blobs_on_backward), \
                       'Outputs {} are produced by the op but not marked for recompute: {}'.format(
                            set(op.output) - recompute_blobs_on_backward,
                            op
                       )
    else:
        recompute_blobs_on_backward = set()

    backward_cell_net.Proto().op.extend(backward_ops)
    # compute blobs used but not defined in the backward pass
    backward_ssa, backward_blob_versions = core.get_ssa(
        backward_cell_net.Proto())
    undefined = core.get_undefined_blobs(backward_ssa)

    # also add to the output list the intermediate outputs of fwd_step that
    # are used by backward.
    ssa, blob_versions = core.get_ssa(cell_net.Proto())
    scratches = [
        blob for (blob, ver) in blob_versions.items() if ver > 0
        and blob in undefined and blob not in cell_net.Proto().external_output
    ]
    backward_cell_net.Proto().external_input.extend(scratches)

    all_inputs = [i[1] for i in inputs] + [x[1] for x in initial_cell_inputs
                                           ] + references
    all_outputs = []

    cell_net.Proto().type = 'simple'
    backward_cell_net.Proto().type = 'simple'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format (blob_name, recurrent_states, offset).
    # At moment t we know that the corresponding data block is at
    # position t + offset in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to the external world
    # Format: (internal_blob, external_blob, offset)
    # A negative offset counts from the end,
    # a positive one from the beginning
    aliases = []

    # State blobs holding the recurrent inputs to the cell net
    recurrent_states = []

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))
        backward_links.append((cell_output + "_grad", states_grad, 1))

        backward_cell_net.Proto().external_input.append(
            str(cell_output) + "_grad")
        aliases.append((state, cell_output + "_all", 1))
        aliases.append((state, cell_output + "_last", -1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

        recurrent_input_grad = cell_input + "_grad"
        if not backward_blob_versions.get(recurrent_input_grad, 0):
            # If nobody writes to this recurrent input gradient, we still
            # need to make sure it gets into the states grad blob. We do
            # this via backward_links, which triggers an alias. This logic
            # is used, for example, in the SumOp case.
            backward_links.append(
                (backward_mapping[cell_input], states_grad, 0))
        else:
            backward_links.append((cell_input + "_grad", states_grad, 0))

    for reference in references:
        # Similar to above, in the case of a SumOp we need to write our parameter
        # gradient to an external blob. In this case we can be sure that
        # reference + "_grad" is a correct parameter name, as we know what the
        # RecurrentNetworkOp gradient schema looks like.
        reference_grad = reference + "_grad"
        if (reference in backward_mapping
                and reference_grad != str(backward_mapping[reference])):
            # We can use an Alias because after each timestep the
            # RNN op adds the value from reference_grad into an _acc blob,
            # which accumulates gradients for the corresponding parameter
            # across timesteps. Then at the end of the RNN op these two are
            # swapped and the reference_grad blob becomes a real blob instead
            # of being an alias
            backward_cell_net.Alias(backward_mapping[reference],
                                    reference_grad)

    for input_t, input_blob in inputs:
        forward_links.append((str(input_t), str(input_blob), 0))
        backward_links.append(
            (backward_mapping[str(input_t)], str(input_blob) + "_grad", 0))
    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_input)
    backward_cell_net.Proto().external_input.extend(
        cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Split into separate lists so we can pass them to C++,
    # where they are assembled back together
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    backward_link_internal, backward_link_external, backward_link_offset = \
        unpack_triple(backward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)

    params = [x for x in references if x in backward_mapping.keys()]
    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    global _workspace_seq
    results = net.RecurrentNetwork(
        all_inputs,
        all_outputs + [s("step_workspaces")],
        param=[all_inputs.index(p) for p in params],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(i) for i in recurrent_inputs
        ],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        backward_link_internal=[str(l) for l in backward_link_internal],
        backward_link_external=[str(l) for l in backward_link_external],
        backward_link_offset=backward_link_offset,
        step_net=str(cell_net.Proto()),
        backward_step_net=str(backward_cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=[str(b) for b in recompute_blobs_on_backward])
    # The last output is a list of step workspaces,
    # which is only needed internally for gradient propagation
    return results[:-1]
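
A small self-contained sketch of the recompute selection performed above: forward ops
of a toy cell net whose outputs intersect recompute_blobs_on_backward are the ones
copied into the backward step net, and such an op must not produce anything outside
the recompute set. The toy net and blob names are assumptions for illustration.

from caffe2.python import core

toy_cell = core.Net("toy_cell")
x = toy_cell.AddExternalInput("x")
h = toy_cell.Relu(x, "h")          # intermediate blob we choose to recompute
y = toy_cell.Mul([h, h], "y")      # not recomputed; stays forward-only

recompute = {str(h)}
recomputed_ops = [
    op for op in toy_cell.Proto().op
    if not recompute.isdisjoint(op.output)
]
# Mirrors the assert above: a recomputed op may only write recomputed blobs.
assert all(set(op.output).issubset(recompute) for op in recomputed_ops)
# recomputed_ops now holds just the Relu op, i.e. the ops that would be
# prepended to backward_cell_net before the gradient operators.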
Example #26
0
def recurrent_net(
        net, cell_net, inputs, initial_cell_inputs,
        links, timestep=None, scope=None, outputs_with_grads=(0,),
        recompute_blobs_on_backward=None, forward_only=False,
):
    '''
    net: the main net the operator should be added to

    cell_net: cell_net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one input
    is supported. It has to be in the format T x N x (D1...Dk), where T is the
    length of the sequence, N is the batch size and (D1...Dk) are the remaining
    dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    Format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary mapping cell_net input names at moment t+1 to
    output names at moment t. Currently we assume that each output becomes
    an input for the next timestep.

    timestep: name of the timestep blob to be used. If not provided, "timestep"
    is used.

    scope: internal blobs are going to be scoped in the format
    <scope_name>/<blob_name>
    If not provided, we generate a scope name automatically

    outputs_with_grads: position indices of output blobs which will receive
    error gradient (from outside the recurrent network) during backpropagation

    recompute_blobs_on_backward: a list of blobs that will be recomputed
                 for the backward pass, and thus need not be stored for
                 each forward timestep.

    forward_only: if True, only forward steps are executed
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    # Validate scoping
    for einp in cell_net.Proto().external_input:
        assert einp.startswith(CurrentNameScope()), \
            '''
            Cell net external inputs are not properly scoped, use
            AddScopedExternalInputs() when creating them
            '''

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references,
    # i.e. those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        core.BlobReference(b) for b in cell_net.Proto().external_input
        if b not in known_inputs]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    if not forward_only:
        backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
            cell_net.Proto().op, inner_outputs_map)
        backward_mapping = {str(k): v for k, v in viewitems(backward_mapping)}

        backward_cell_net = core.Net("RecurrentBackwardStep")
        del backward_cell_net.Proto().op[:]

        if recompute_blobs_on_backward is not None:
            # Insert operators to re-compute the specified blobs.
            # They are added in the same order as for the forward pass, thus
            # the order is correct.
            recompute_blobs_on_backward = {str(b) for b in
                                           recompute_blobs_on_backward}

            for op in cell_net.Proto().op:
                if not recompute_blobs_on_backward.isdisjoint(set(op.output)):
                    backward_cell_net.Proto().op.extend([op])
                    # This fires if the recomputed ops produce outputs other
                    # than the blobs declared for recomputation
                    assert set(op.output).issubset(recompute_blobs_on_backward)

        backward_cell_net.Proto().op.extend(backward_ops)
        # compute blobs used but not defined in the backward pass
        backward_ssa, backward_blob_versions = core.get_ssa(
            backward_cell_net.Proto())
        undefined = core.get_undefined_blobs(backward_ssa)

        # also add to the output list the intermediate outputs of fwd_step that
        # are used by backward.
        ssa, blob_versions = core.get_ssa(cell_net.Proto())
        scratches = [
            blob
            for blob, ver in viewitems(blob_versions)
            if (ver > 0 and
                blob in undefined and
                blob not in cell_net.Proto().external_output)
        ]
        backward_cell_net.Proto().external_input.extend(scratches)
        backward_cell_net.Proto().type = 'simple'
    else:
        backward_cell_net = None

    all_inputs = [i[1] for i in inputs] + [
        x[1] for x in initial_cell_inputs] + references
    all_outputs = []

    cell_net.Proto().type = 'rnn'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format (blob_name, recurrent_states, offset).
    # At moment t we know that the corresponding data block is at
    # position t + offset in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to the external world
    # Format: (internal_blob, external_blob, offset)
    # A negative offset counts from the end,
    # a positive one from the beginning
    aliases = []

    # State blobs holding the recurrent inputs to the cell net
    recurrent_states = []

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time.
        # Or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))

        aliases.append((state, cell_output + "_all", 1))
        aliases.append((state, cell_output + "_last", -1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

        if backward_cell_net is not None:
            backward_links.append((cell_output + "_grad", states_grad, 1))
            backward_cell_net.Proto().external_input.append(
                str(cell_output) + "_grad")

            recurrent_input_grad = cell_input + "_grad"
            if not backward_blob_versions.get(recurrent_input_grad, 0):
                # If nobody writes to this recurrent input gradient, we still
                # need to make sure it gets into the states grad blob. We do
                # this via backward_links, which triggers an alias. This logic
                # is used, for example, in the SumOp case.
                backward_links.append(
                    (backward_mapping[cell_input], states_grad, 0))
            else:
                backward_links.append((cell_input + "_grad", states_grad, 0))

    for input_t, input_blob in inputs:
        forward_links.append((str(input_t), str(input_blob), 0))

    if backward_cell_net is not None:
        for input_t, input_blob in inputs:
            backward_links.append((
                backward_mapping[str(input_t)], str(input_blob) + "_grad", 0
            ))
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_input)
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Split into separate lists so we can pass them to C++,
    # where they are assembled back together
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)

    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    # Make sure that recurrent gradients accumulate with internal gradients
    # (if a blob in the backward_cell_net receives gradient from both an
    # external connection as well as from within the backward_cell_net,
    # those gradients need to be added together, rather than one overwriting
    # the other)
    if backward_cell_net is not None:
        proto = backward_cell_net.Proto()
        operators = []
        while len(proto.op) > 0:
            op = proto.op[-1]
            proto.op.remove(op)
            operators.append(op)
        for op in operators[::-1]:
            proto.op.extend([op])
            for j, output_blob in enumerate(op.output):
                if output_blob in proto.external_input:
                    # An in-place operation won't cause issues because it takes
                    # the existing value of the blob into account
                    if output_blob in op.input:
                        continue
                    output_blob = core.BlobReference(output_blob)
                    accum_blob = output_blob + "_accum"
                    proto.op[-1].output[j] = str(accum_blob)
                    backward_cell_net.Sum(
                        [output_blob, accum_blob],
                        [output_blob],
                    )

    backward_args = {}
    if backward_cell_net is not None:
        backward_mapping_keys = set(viewkeys(backward_mapping))
        backward_link_internal, backward_link_external, backward_link_offset = \
            unpack_triple(backward_links)
        params = [x for x in references if x in backward_mapping_keys]
        param_grads = [
            str(backward_mapping[x])
            for x in references
            if x in backward_mapping_keys
        ]
        if recompute_blobs_on_backward is None:
            recompute_blobs_on_backward = set()
        backward_args = {
            'param': [all_inputs.index(p) for p in params],
            'backward_link_internal': [str(l) for l in backward_link_internal],
            'backward_link_external': [str(l) for l in backward_link_external],
            'backward_link_offset': backward_link_offset,
            'backward_step_net': str(backward_cell_net.Proto()),
            'outputs_with_grads': outputs_with_grads,
            'recompute_blobs_on_backward': [
                str(b) for b in recompute_blobs_on_backward
            ],
            'param_grads': param_grads,
        }

    results = net.RecurrentNetwork(
        all_inputs,
        all_outputs + [s("step_workspaces")],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(i) for i in recurrent_inputs
        ],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        step_net=str(cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        **backward_args
    )

    # Restore net type since 'rnn' is not recognized outside RNNs
    cell_net.Proto().type = 'simple'

    # The last output is a list of step workspaces,
    # which is only needed internally for gradient propagation
    return results[:-1]
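
Finally, a toy illustration of the gradient-accumulation rewrite above: when a
backward-step op writes a blob that also arrives as an external gradient input, the
write is redirected to an "_accum" blob and summed back in, instead of overwriting
the external contribution. The toy step net, the Copy op and the blob names are
illustrative assumptions, not taken from the examples.

from caffe2.python import core

bwd = core.Net("toy_backward_step")
bwd.Proto().external_input.extend(["h_grad", "dy"])  # h_grad arrives externally
bwd.Copy("dy", "h_grad")  # an internal op also writes h_grad (not in-place)

proto = bwd.Proto()
op = proto.op[-1]
for j, out in enumerate(op.output):
    if out in proto.external_input and out not in op.input:
        op.output[j] = out + "_accum"           # redirect the internal write
        bwd.Sum([out, out + "_accum"], [out])   # h_grad += h_grad_accum

# The step net now runs Copy(dy) -> h_grad_accum followed by
# Sum(h_grad, h_grad_accum) -> h_grad, accumulating rather than clobbering.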