Example 1
    def convert(self, model: "keras.models.Model") -> Graph:
        """convert(model, input_orders=None)

        Convert kerasmodel into WebDNN IR Graph. First, WebDNN try to convert backend TensorFlow graph by TensorFlowConverter.
        If TensorFlowConverter failed to convert, then KerasConverter converts model by itself

        Args:
            model (`keras.models.Model`): keras model

        .. example::

            model = keras.models.load_model("pre_trained_model.h5")
            graph = KerasConverter(batch_size=1).convert(model)

        Returns:
            (:class:`~webdnn.graph.graph.Graph`): WebDNN IR Graph
        """
        if not self._use_tensorflow_converter:
            return self._convert_fallback(model)

        else:
            # noinspection PyBroadException
            try:
                return TensorFlowConverter(
                    session=K.get_session(),
                    batch_size=self._batch_size).convert(
                        model.inputs, model.outputs)

            except Exception:
                self._use_tensorflow_converter = False
                console.debug(traceback.format_exc())
                console.debug(
                    "[KerasConverter] TensorFlowConverter failed to convert.")

        return self._convert_fallback(model)
Example 2
    def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
        """optimize(graph)

        Optimize the given graph. In the single call, this rule is applied multiple times until the graph will be not changed.

        args:
            graph(:class:`~webdnn.Graph`): Computational graph

        returns:
            (tuple of :class:`~webdnn.Graph` and bool): Optimized graph and flag whether the graph is changed or not.
        """
        if not all(self.flags()):
            return graph, False

        flag_retry = True
        flag_totally_changed = False

        while flag_retry:
            flag_retry = False

            for sub_rule in self.sub_rules:
                if not all(sub_rule.flags()):
                    continue

                graph, flag_changed = sub_rule.optimize(graph)
                if flag_changed:
                    console.debug(f"[OptimizeRule] apply: {sub_rule.__class__.__name__}")

                flag_retry |= flag_changed

            flag_totally_changed |= flag_retry

        return graph, flag_totally_changed
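
Below is a minimal sketch, not from the WebDNN source, of how a composite rule driving this fixed-point loop could be assembled. The sub-rule `RemoveIdentity` is hypothetical; it is assumed that the base `OptimizeRule.__init__` creates the `sub_rules` list iterated above, and the import paths are assumed from the `webdnn.graph` package layout.

from typing import Tuple

from webdnn.graph.graph import Graph
from webdnn.graph.optimize_rule import OptimizeRule


class RemoveIdentity(OptimizeRule):
    """Hypothetical sub-rule: must return (graph, True) only when it actually changed the graph."""

    def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
        flag_changed = False
        # ... detect and remove identity operators, setting flag_changed = True on success ...
        return graph, flag_changed


class MyOptimizeRule(OptimizeRule):
    def __init__(self):
        super(MyOptimizeRule, self).__init__()
        # `sub_rules` is the list iterated by the fixed-point loop above
        self.sub_rules.append(RemoveIdentity())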
Example 3
def _optimize_inplace(operators: List[Operator],
                      allocations_dict: AllocationDict):
    if not (flags.optimize.OPTIMIZE
            and flags.optimize.OPTIMIZE_MEMORY_ALLOCATION
            and flags.optimize.OPTIMIZE_INPLACE_OPERATION):
        console.debug('_optimize_inplace is skipped')
        return

    for op in operators:
        for attr in op.get_attribute(Inplace):
            v_in = attr.get_input()
            v_out = attr.get_output()

            if v_in.has_attribute(Input):
                continue

            if isinstance(v_in, ConstantVariable):
                continue

            if any(v_in.stride_dict[a] != v_out.stride_dict[a]
                   for a in v_out.order.axes if a in v_in.order.axes):
                continue

            _merge_allocation(allocations_dict, allocations_dict[v_in],
                              allocations_dict[v_out])
Example 4
def _optimize_inplace(operators: List[Operator], allocations_dict: AllocationDict):
    if not (flags.optimize.OPTIMIZE and flags.optimize.OPTIMIZE_MEMORY_ALLOCATION and flags.optimize.OPTIMIZE_INPLACE_OPERATION):
        console.debug('_optimize_inplace is skipped')
        return

    for op in operators:
        for attr in op.get_attribute(Inplace):  # type: Inplace
            _merge_allocation(allocations_dict, allocations_dict[attr.get_input()], allocations_dict[attr.get_output()])
Example 5
    def _convert_operator(self, proto: INodeProto):
        console.debug(
            f"-----------------------------------------------------------")
        console.debug(f"Type  : {proto.op_type}")
        console.debug(f"Input : {proto.input}")
        console.debug(f"Output: {proto.output}")
        for name, val in attribute_dict(proto).items():
            console.debug(f"Attr  : {name} = {val}")

        super(ONNXConverter, self)._convert_operator(proto)
Example 6
def dump_op(op: Operator):  # pragma: no cover
    parameters_sorted = [
        repr(key) + ': ' + str(op.parameters[key])
        for key in sorted(op.parameters.keys())
    ]
    console.debug(f"{op.__class__.__name__} : {op.name}")
    console.debug(f"    In  : {op.inputs}")
    console.debug(f"    Out : {op.outputs}")
    console.debug(
        f"    Attr: {', '.join(sorted(str(attr) for attr in op.attributes))}")
    console.debug(f"    Parameters: {{{', '.join(parameters_sorted)}}}")
Example 7
def validate_kernel_source(descriptor: GraphDescriptor):
    # FIXME: WebGPU supports multiple shader languages, but this test assumes the language is Metal.

    source = descriptor.concat_kernel_sources()

    if os.name != 'posix':
        # os.name is 'posix' on macOS, and the xcrun command exists only on macOS
        console.warning(
            "[WebGPUDescriptorGenerator] 'xcrun' command is only available on macOS. Validation of generated "
            "source code in the WebGPU backend is skipped.")
        return

    with tmp.TemporaryDirectory() as tmpdir:
        source_path = path.join(tmpdir, "kernel.metal")
        lib_path = path.join(tmpdir, "kernel.air")

        with open(source_path, "w+") as f:
            f.write(source)

        try:
            result = subprocess.run([
                "xcrun", "-sdk", "macosx", "metal", source_path, "-o", lib_path
            ],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

            if result.returncode == 0:
                if result.stderr == b"":
                    console.debug(
                        "[WebGPUDescriptorGenerator] Generated kernel source is valid."
                    )

                else:
                    console.warning(
                        "[WebGPUDescriptorGenerator] In validating kernel source, warnings are generated."
                    )
                    console.stderr(result.stderr.decode("utf-8"))

            else:
                console.error(
                    "[WebGPUDescriptorGenerator] Generated kernel source is invalid."
                )
                console.stderr(result.stderr.decode("utf-8"))
                exit(result.returncode)

        except FileNotFoundError:
            console.warning(
                "[WebGPUDescriptorGenerator] 'xcrun' command is not found. validation of generated source code in webgpu backend is "
                "skipped.")
            return
Example 8
    def generate(cls, graph: Graph, **kwargs):
        if flags.DEBUG:
            traverse.dump(graph)

        memory_layout = allocate(graph)
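        # the sizes below are element counts; the x4 assumes float32 (4 bytes per element)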

        console.debug(
            f"[FallbackDescriptorGenerator] memory_layout total size: {memory_layout.total_size * 4}"
        )
        console.debug(
            f"[FallbackDescriptorGenerator] memory_layout static size: {memory_layout.static_size * 4}"
        )
        console.debug(
            f"[FallbackDescriptorGenerator] memory_layout dynamic size: {memory_layout.dynamic_size * 4}"
        )

        constant_encoder = ConstantEncoder.get_encoder(
            kwargs.get("constant_encoder_name", None))
        constants_bytes = constant_encoder.encode(memory_layout)

        console.debug(
            f"[FallbackDescriptorGenerator] constants encoded size: {len(constants_bytes)}"
        )

        descriptor = GraphDescriptor(kernels=cls.generate_kernels(
            graph, memory_layout),
                                     memory_layout=memory_layout,
                                     inputs=graph.inputs,
                                     outputs=graph.outputs,
                                     constants_encoding=constant_encoder.name,
                                     licenses=graph.licenses)

        return GraphExecutionData(graph, descriptor, constants_bytes)
Example 9
    def generate(cls, graph: Graph, **kwargs):
        graph, _ = WebGPUOptimizeRule().optimize(graph)
        if flags.DEBUG:
            traverse.dump(graph)

        memory_layout = allocate(graph)
        console.debug(f"[WebGPUDescriptorGenerator] memory_layout total size: {memory_layout.total_size * 4}[B]")
        console.debug(f"[WebGPUDescriptorGenerator] memory_layout static size: {memory_layout.static_size * 4}[B]")
        console.debug(f"[WebGPUDescriptorGenerator] memory_layout dynamic size: {memory_layout.dynamic_size * 4}[B]")

        constant_encoder = ConstantEncoder.get_encoder(kwargs.get("constant_encoder_name", None))
        constants_bytes = constant_encoder.encode(memory_layout)

        console.debug(f"[WebGPUDescriptorGenerator] constants encoded size: {len(constants_bytes)}[B]")

        kernels = cls.generate_kernels(graph, memory_layout)

        descriptor = GraphDescriptor(
            kernels=kernels,
            memory_layout=memory_layout,
            inputs=graph.inputs,
            outputs=graph.outputs,
            constants_encoding=constant_encoder.name,
            licenses=graph.licenses
        )

        if flags.optimize.VALIDATE_GENERATED_SOURCE:
            validate_kernel_source(descriptor)

        return GraphExecutionData(graph, descriptor, constants_bytes)
Example 10
    def convert(self, chainer_computational_graph: chainer.computational_graph.
                ComputationalGraph, input_c_vars: List[chainer.Variable],
                output_c_vars: List[chainer.Variable]) -> Graph:
        # In Chainer v2, variables are represented as Variable and VariableNode objects, and
        # graph information such as edge connections is contained in the variable node.
        # Therefore, all Chainer variables must be normalized into variable nodes.
        input_c_vars = [_to_variable_node(v) for v in input_c_vars]
        output_c_vars = [_to_variable_node(v) for v in output_c_vars]

        # Append InputVariable attribute to input variables
        input_n_vars = []
        for c_var in input_c_vars:
            n_var = self._convert_var(c_var)
            n_var.attributes.add(Input(n_var))
            input_n_vars.append(n_var)

        self._convert_weight_vars(chainer_computational_graph)

        pending_c_oprs = [
            c_opr for c_opr in chainer_computational_graph.nodes
            if isinstance(c_opr, chainer.Function)
        ]

        while len(pending_c_oprs) > 0:
            for c_opr in pending_c_oprs:
                if all(((self.has_variable(_to_variable_node(c_var)))
                        for c_var in c_opr.inputs)):
                    # All input variables of `c_opr` are converted, so `c_opr` itself can be converted.
                    self.convert_operator(c_opr)
                    pending_c_oprs.remove(c_opr)
                    break  # restart iteration over pending_c_oprs, which was just mutated
            else:
                console.debug(pending_c_oprs)
                raise ValueError("Inputs to functions cannot be resolved.")

        # Append OutputVariable attribute to output variables
        output_n_vars = []
        for c_var in output_c_vars:
            if not self.has_variable(c_var):
                raise ValueError("Output variable is not generated by graph.")
            n_var = self.get_variable(c_var)
            n_var.attributes.add(Output)
            output_n_vars.append(n_var)

        # Convert variable order into typical one in Chainer
        self._transpose_vars()

        return Graph(input_n_vars, output_n_vars)
Example 11
def _convert_batch_normalization_function(
    converter: ChainerConverter, c_op: chainer.functions.normalization.
    batch_normalization.BatchNormalizationFunction):
    x = converter.get_variable(c_op.inputs[0])
    gamma = converter.get_variable(c_op.inputs[1])
    beta = converter.get_variable(c_op.inputs[2])

    if len(c_op.inputs) == 5:
        # noinspection PyUnresolvedReferences
        mean_data = converter.get_variable(c_op.inputs[3]).data
        # noinspection PyUnresolvedReferences
        variance_data = converter.get_variable(c_op.inputs[4]).data

    elif len(c_op.inputs) == 3:
        variance_data = c_op.running_var
        mean_data = c_op.running_mean

    else:
        raise ValueError(
            "BatchNormalizationFunction must have 3 or 5 inputs.")
    console.debug(variance_data)

    # Simplify scale and bias
    #
    # from:
    #   y = (x - mean) / sqrt(var + eps) * gamma + beta
    #
    # to:
    #   y = x * gamma_div_std + beta_scaled
    #
    #   gamma_div_std = gamma / sqrt(var + eps)
    #   beta_scaled   = beta - mean * gamma_div_std

    # noinspection PyUnresolvedReferences
    gamma_div_std = gamma.data / np.sqrt(variance_data + c_op.eps)
    # noinspection PyUnresolvedReferences
    beta_scaled = beta.data - mean_data * gamma_div_std

    scale_opr = AxiswiseScale(None, axis=Axis.C)
    gamma_div_std_const = ConstantVariable(gamma_div_std, OrderC)
    scale_out, = scale_opr(x, gamma_div_std_const)

    offset_opr = AxiswiseBias(None, axis=Axis.C)
    beta_scaled_const = ConstantVariable(beta_scaled, OrderC)
    offset_out, = offset_opr(scale_out, beta_scaled_const)

    converter.set_variable(c_op.outputs[0](), offset_out)
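
A quick numpy check of the scale/bias simplification derived in the comments above; this is a standalone illustration, not part of WebDNN.

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 4))                      # batch of 2 samples, 4 channels
gamma, beta = rng.standard_normal(4), rng.standard_normal(4)
mean, var, eps = rng.standard_normal(4), rng.random(4) + 0.1, 2e-5

y_reference = (x - mean) / np.sqrt(var + eps) * gamma + beta

gamma_div_std = gamma / np.sqrt(var + eps)
beta_scaled = beta - mean * gamma_div_std
y_folded = x * gamma_div_std + beta_scaled

assert np.allclose(y_reference, y_folded)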
Example 12
def _optimize_inplace(operators: Sequence[Operator],
                      allocations_dict: AllocationDict):
    if not (flags.optimize.OPTIMIZE
            and flags.optimize.OPTIMIZE_MEMORY_ALLOCATION
            and flags.optimize.OPTIMIZE_INPLACE_OPERATION):
        console.debug('_optimize_inplace is skipped')
        return

    for op in operators:
        for attr in op.get_attribute(Inplace):  # type: Inplace
            a1 = allocations_dict[attr.get_input()]
            a2 = allocations_dict[attr.get_output()]
            if not Placeholder.check_resolved(
                    a1.size) or not Placeholder.check_resolved(a2.size):
                continue

            _merge_allocation(allocations_dict, a1, a2)
Example 13
    def convert(self, model: "keras.models.Model") -> Graph:
        """convert(model, input_orders=None)

        Convert kerasmodel into WebDNN IR Graph. First, WebDNN try to convert backend TensorFlow graph by TensorFlowConverter.
        If TensorFlowConverter failed to convert, then KerasConverter converts model by itself

        Args:
            model (`keras.models.Model`): keras model

        .. admonition:: example

            Convert pre-trained keras ResNet model.

            .. code::

                import keras
                from webdnn.frontend.keras import KerasConverter

                model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet')
                graph = KerasConverter(batch_size=1).convert(model)

        Returns:
            (:class:`~webdnn.graph.graph.Graph`): WebDNN IR Graph
        """
        if not self._use_tensorflow_converter:
            return self._convert_fallback(model)

        else:
            # noinspection PyBroadException
            try:
                return TensorFlowConverter(
                    session=K.get_session(),
                    batch_size=self._batch_size).convert(
                        model.inputs, model.outputs)

            except Exception:
                self._use_tensorflow_converter = False
                console.debug(traceback.format_exc())
                console.debug(
                    "[KerasConverter] TensorFlowConverter failed to convert.")

        return self._convert_fallback(model)
Example 14
    def generate(cls, graph: Graph, **kwargs):
        graph, _ = WebassemblyOptimizeRule().optimize(graph)
        if flags.DEBUG:
            traverse.dump(graph)

        memory_layout = Allocator.allocate(graph)

        console.debug(
            f"[WebassemblyDescriptorGenerator] memory_layout total size: {memory_layout.total_size * 4}"
        )
        console.debug(
            f"[WebassemblyDescriptorGenerator] memory_layout static size: {memory_layout.static_size * 4}"
        )
        console.debug(
            f"[WebassemblyDescriptorGenerator] memory_layout dynamic size: {memory_layout.dynamic_size * 4}"
        )

        constant_encoder = ConstantEncoder.get_encoder(
            kwargs.get("constant_encoder_name", None))
        constants_bytes = constant_encoder.encode(memory_layout)

        console.debug(
            f"[WebassemblyDescriptorGenerator] constants encoded size: {len(constants_bytes)}"
        )

        kernels = cls.generate_kernels(graph, memory_layout)

        heap_block_size = 16 * 1024 * 1024
        if isinstance(memory_layout.dynamic_size, int):
            dynamic_size_byte_int = memory_layout.dynamic_size * 4
        else:
            dynamic_size_byte_int = kwargs.get("dynamic_allocation_size",
                                               heap_block_size)
        total_size_byte = memory_layout.static_size * 4 + dynamic_size_byte_int

        # heap required for the computation: total size rounded up to a whole block, plus one extra block
        required_heap = (
            (total_size_byte + heap_block_size - 1) // heap_block_size +
            1) * heap_block_size

        descriptor = GraphDescriptor(kernels=kernels,
                                     memory_layout=memory_layout,
                                     inputs=graph.inputs,
                                     outputs=graph.outputs,
                                     constants_encoding=constant_encoder.name,
                                     required_heap=required_heap,
                                     licenses=graph.licenses)

        return GraphExecutionData(graph, descriptor, constants_bytes)
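
A standalone check, not from the WebDNN source, of the heap-size formula above: the total byte size is rounded up to a whole 16 MiB block, and one spare block is added.

heap_block_size = 16 * 1024 * 1024


def required_heap(total_size_byte: int) -> int:
    return ((total_size_byte + heap_block_size - 1) // heap_block_size + 1) * heap_block_size


assert required_heap(1) == 2 * heap_block_size                   # 1 B -> 1 block + 1 spare
assert required_heap(16 * 1024 * 1024) == 2 * heap_block_size    # exactly 1 block -> 1 block + 1 spare
assert required_heap(16 * 1024 * 1024 + 1) == 3 * heap_block_size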
Example 15
def _listup_operations(inputs: Sequence[T_NODE], outputs: Sequence[T_NODE]):
    def get_prev_nodes(node: T_NODE) -> Sequence[T_NODE]:
        if node in inputs:
            return []

        elif isinstance(node, tf.Tensor):
            return [node.op]

        else:
            return node.inputs

    result = []  # type: List[tf.Operation]
    stack = [(node, None)
             for node in outputs]  # type: List[Tuple[T_NODE, T_NODE]]
    dependency_count = {}  # type: Dict[T_NODE, int]

    while len(stack) > 0:
        node_from, node_to = stack.pop()

        if node_from not in dependency_count:
            stack.append((node_from, node_to))

            prev_nodes = get_prev_nodes(node_from)
            dependency_count[node_from] = 0
            for prev_node in prev_nodes:
                if dependency_count.get(prev_node, 1) > 0:
                    dependency_count[node_from] += 1
                    stack.append((prev_node, node_from))

        elif dependency_count[node_from] == 0:
            if isinstance(node_from, tf.Operation):
                result.append(node_from)

            if node_to is not None:
                dependency_count[node_to] -= 1

        else:
            console.debug(
                "[TensorFlowConverter] Cycle is detected in computation graph")
            console.debug("cycle starting node:")
            console.debug(node_from)

            raise CyclicGraphError(
                "[TensorFlowConverter] Cycles are detected, but TensorFlowConverter cannot convert cyclic graph"
            )

    return result
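
The traversal above is an iterative, dependency-counting topological sort with cycle detection. The same ordering can be sketched as a recursive depth-first search over a plain dict-based DAG; this re-formulation is illustrative only, not WebDNN code.

from typing import Dict, List


def listup_operations_sketch(graph: Dict[str, List[str]], outputs: List[str]) -> List[str]:
    """Return the nodes of `graph` (node -> prerequisite nodes) in executable order."""
    result: List[str] = []
    state: Dict[str, str] = {}  # node -> "visiting" | "done"

    def visit(node: str):
        if state.get(node) == "done":
            return
        if state.get(node) == "visiting":
            raise ValueError(f"Cycle detected at {node!r}")
        state[node] = "visiting"
        for prev in graph.get(node, []):
            visit(prev)
        state[node] = "done"
        result.append(node)

    for out in outputs:
        visit(out)
    return result


# x -> matmul -> add -> out
g = {"matmul": ["x"], "add": ["matmul"], "out": ["add"]}
assert listup_operations_sketch(g, ["out"]) == ["x", "matmul", "add", "out"]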
Example 16
def _optimize_buffer_reuse(allocations_dict: AllocationDict):
    """
    Optimize memory size by reusing buffer if available

    Algorithm:

    Consider five variables with the following sizes and lifetimes.

        Size and Lifetime)

          var |size| Lifetime (t=0 -> ...)
          ----+----+---------------
            a |  5 | [0, 2)
            b |  4 | [2, 4)
            c |  2 | [3, 5)
            d |  1 | [6, 8)
            e |  3 | [0, 5)
          ----+----+---------------

    In this case, we want to obtain the following optimized allocation:

         ---------> address
    time
     |    aaaaa_e
     |    aaaaa_e
     |    bbbb__e
     |    bbbbcce
     |    ____cce
     V    d______
          d______

    First, construct "Merge Offset Table".

        table = {                           reduced_size = {
            a: {},                              a: {},
            b: { a: 0 },                        b: { a: 4 },
            c: { a: 0, b: 4 },                  c: { a: 2, b: 0 }
            d: { a: 0, b: 0, c: 0, e: 0 },      d: { a: 1, b: 1, c: 1, e: 1 },
            e: { a: 5, b: 4, c: 2, d: 0 }       e: { a: 0, b: 0, c: 0, d: 1 }
        }                                   }

    `table[x][y]` is the offset used when variable `x` is merged into variable `y`. For example, when `b` is merged into
    (= reuses the memory allocated for) `a`, the offset is `0` because the two do not exist at the same time. However, when `c` is
    merged into `b`, the offset is `4` because they exist at the same time (t=3).

    Next, for each mergeable pair, calculate the size that would be saved if the two variables were merged. For example, if `b` is
    merged into `a`, the reduced size is `4`.

    Then merge the pair with the largest reduced size. In this case that pair is `a` and `b`. After merging, update the table:

        table = {                           reduced_size = {
            ab: {},                             ab: {},
            c: { ab: 4 },                       c: { ab: 1 }
            d: { ab: 0, c: 0, e: 0 },           d: { ab: 1, c: 1, e: 1 },
            e: { ab: 5, c: 2, d: 0 }            e: { ab: 0, c: 0, d: 1 }
        }                                   }

    Iterate this procedure until all variables are merged into a single allocation.

    Merge `d` into `ab` with offset `0`:

        table = {                           reduced_size = {
            abd: {},                            abd: {},
            c: { abd: 4 },                      c: { abd: 1 },
            e: { abd: 5, c: 2 }                 e: { abd: 0, c: 0 }
        }                                   }

    Merge `c` into `abd` with offset `4`:

        table = {                           reduced_size = {
            abcd: {},                           abcd: {},
            e: { abcd: 5 }                      e: { abcd: 0 }
        }                                   }

    Merge `e` into `abcd` with offset `5`:

        table = {                           reduced_size = {
            abcde: {}                           abcde: {}
        }                                   }

    Finish.

    Time complexity:
        Build Table: O(N^2)
        Iteration: O(N) times
            update table: O(N)

        Total: O(N^2)
    """
    if not (flags.optimize.OPTIMIZE
            and flags.optimize.OPTIMIZE_MEMORY_ALLOCATION):
        console.debug('_optimize_buffer_reuse is skipped')
        return

    allocations = list(
        set(
            filter(
                lambda x: (Placeholder.check_resolved(x.size)
                           and Placeholder.check_resolved(x.offset)),
                allocations_dict.values())))
    allocations = sorted(allocations, key=lambda a: a.size, reverse=True)

    # Construct offset table
    offset_table = {a2: {} for a2 in allocations}
    for i1, a1 in enumerate(allocations):
        for i2, a2 in enumerate(allocations[i1 + 1:]):
            # align offset as 16-byte alignment
            offset_table[a2][a1] = 0 if (
                a1.end <= a2.begin or a2.end <= a1.begin) else _align(a1.size)

    # Merge
    merge_tree = {}  # type: Dict[Allocation, Tuple[Allocation, int]]
    while len(offset_table) > 1:
        if len(offset_table) % 10 == 0:
            console.debug(
                f"Memory allocation optimization: {(1-len(offset_table)/len(allocations)) * 100:4.1f}% complete."
            )

        # Get max score pair
        max_score = -1
        max_a1 = None
        max_a2 = None
        for a2, a1s in offset_table.items():
            for a1, offset in a1s.items():
                score = max(min(a1.size - offset, a2.size), 0)
                if max_score < score:
                    max_score = score
                    max_a1 = a1
                    max_a2 = a2

        # Merge
        a1 = max_a1
        a2 = max_a2
        offset12 = offset_table[a2][a1]
        merge_tree[a2] = (a1, offset12)

        # Update offset table
        for a3, offset32 in offset_table[a2].items():
            if a1 in offset_table[a3]:
                # +-------+
                # |       V
                # a2->a3->a1
                #
                # condition
                # - min(a1.size - offset12, a2.size) > min(a1.size - offset13, a3.size)
                # - a2.size < a3.size < a1.size
                #
                # ==========================================================
                # case 1) offset13 < offset12 + a2.size
                #
                # before)
                # a1      |a1...................|
                # a2      <-offset12---->|a2...|
                # a3      <-offset13--------->|a3............|
                #                             <-offset23->|a2...|
                #
                # after)
                # a1      |a1...................|
                # a3      <-offset13------------>|a3............|
                #
                # ==========================================================
                #
                # case 2) offset13 > offset12 + a2.size
                #
                # before)
                # a1      |a1...................|
                # a2      <-offset12---->|a2...|
                # a3      <-offset13------------->|a3............|
                #                                 <-offset23->|a2...|
                #
                # after)
                # a1      |a1...................|
                # a3      <-offset13------------->|a3............|
                #
                offset_table[a3][a1] = max(offset_table[a3][a1],
                                           _align(offset12 + a2.size))

            elif a3 in offset_table[a1]:
                # +-------+
                # |       V
                # a2->a1->a3
                #
                # condition
                # - min(a1.size - offset12, a2.size) > min(a1.size - offset13, a3.size)
                # - a2.size < a1.size < a3.size
                #
                # ==========================================================
                # case 1) offset32 <= offset31 + offset12
                #
                # before)
                #         <-offset32----------->|a2...|
                # a3      |a3..........|
                # a1      <-offset31--->|a1........|
                # a2                    <-offset12->|a2...|
                #
                # after) nothing changed
                # a3      |a3..........|
                # a1      <-offset31--->|a1........||a2...|
                #
                # ==========================================================
                # case 2) offset32 > offset31 + offset12
                #
                # before)
                #              <-offset32------------------->|a2...........................|
                # a3           |a3.........................|
                # a1           <-offset31->|a1........................................|
                # a2                        <-offset12->|a2...........................|
                #
                # after) offset31 = offset32 - offset12
                # a3      |a3.........................|
                # a1      <------offset31->|a1........................................|
                #
                offset_table[a1][a3] = max(offset_table[a1][a3],
                                           _align(offset32 - offset12))

        del offset_table[a2]
        for a3, a4s in offset_table.items():
            if a3 == a1:
                continue

            if a2 in a4s:
                if a1 in a4s:
                    # +-------+
                    # |       V
                    # a3->a2->a1

                    a4s[a1] = max(a4s[a1], offset12 + a4s[a2])
                    del a4s[a2]

                else:
                    raise NotImplementedError
                    # # a3->a2->a1
                    #
                    # a4s[a1] = offset12 + a4s[a2]
                    # del a4s[a2]

    console.debug(f"Memory allocation optimization: 100.0% complete.")

    if len(offset_table) > 0:
        # Shift allocation block to 0-offset.
        list(offset_table.keys())[0].offset = 0

    for a2, (a1, offset) in merge_tree.items():
        while a1 in merge_tree:
            a1, offset2 = merge_tree[a1]
            offset += offset2
        a2.offset = offset
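
A standalone sketch, not WebDNN code, of the "merge offset table" described in the docstring, using its five toy variables: two allocations can share memory at offset 0 when their lifetimes are disjoint; otherwise the merged one is placed after the host at an offset equal to the host's size (alignment is ignored here for clarity).

from typing import Dict, Tuple

# name -> (size, (begin, end))  -- sizes and lifetimes from the docstring
allocs: Dict[str, Tuple[int, Tuple[int, int]]] = {
    "a": (5, (0, 2)), "b": (4, (2, 4)), "c": (2, (3, 5)),
    "d": (1, (6, 8)), "e": (3, (0, 5)),
}


def merge_offset(x: str, y: str) -> int:
    """Offset used when `x` is merged into `y`."""
    (_, (bx, ex)), (sy, (by, ey)) = allocs[x], allocs[y]
    return 0 if ex <= by or ey <= bx else sy


assert merge_offset("b", "a") == 0  # disjoint lifetimes: b fully reuses a's memory
assert merge_offset("c", "b") == 4  # both alive at t=3: c is placed after b
assert merge_offset("e", "a") == 5  # both alive during [0, 2): e is placed after a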
Example 17
from webdnn.graph.variable import Variable
from webdnn.graph.variables.attributes.input import Input
from webdnn.graph.variables.attributes.output import Output
from webdnn.graph.variables.constant_variable import ConstantVariable
from webdnn.util import console

FLAG_KERAS_INSTALLED = False

try:
    import keras
    import keras.backend as K
    import tensorflow as tf

    if not "2." <= keras.__version__ < "3.":
        console.debug(
            f"WebDNN supports Keras v2.*.*. Currently, keras {keras.__version__} is installed."
        )
    FLAG_KERAS_INSTALLED = True

except ImportError:
    console.debug("Keras and TensorFlow are not completely installed.")


def get_default_order(tf_tensor: "tf.Tensor"):
    if len(tf_tensor.shape) == 2:
        return OrderNC

    elif len(tf_tensor.shape) == 3:
        return OrderNTC
Example 18
    def convert(self, chainer_computational_graph:
                "chainer.computational_graph.ComputationalGraph",
                input_c_vars: List["chainer.Variable"],
                output_c_vars: List["chainer.Variable"]) -> Graph:
        """convert(chainer_computational_graph, input_c_vars, output_c_vars)

        Convert chainer computational graph into WebDNN IR.

        Instead of using this method directly, you should use
        :func:`convert_from_inout_vars<webdnn.frontend.chainer.ChainerConverter.convert_from_inout_vars>`.

        Args:
            chainer_computational_graph(chainer.computational_graph.ComputationalGraph): chainer computational graph
            input_c_vars(list of chainer.Variable): input chainer variables
            output_c_vars(list of chainer.Variable): output chainer variables

        Returns:
            (:class:`~webdnn.Graph`): WebDNN Graph
        """
        # In Chainer v2, variables are represented as Variable and VariableNode objects, and
        # graph information such as edge connections is contained in the variable node.
        # Therefore, all Chainer variables must be normalized into variable nodes.
        input_c_vars = [_to_variable_node(v) for v in input_c_vars]
        output_c_vars = [_to_variable_node(v) for v in output_c_vars]

        # Append InputVariable attribute to input variables
        input_n_vars = []
        for c_var in input_c_vars:
            n_var = self._convert_var(c_var)
            n_var.attributes.add(Input(n_var))
            input_n_vars.append(n_var)

        self._convert_weight_vars(chainer_computational_graph)

        pending_c_oprs = [
            c_opr for c_opr in chainer_computational_graph.nodes
            if isinstance(c_opr, chainer.Function)
        ]

        while len(pending_c_oprs) > 0:
            for c_opr in pending_c_oprs:
                if all(((self.has_variable(_to_variable_node(c_var)))
                        for c_var in c_opr.inputs)):
                    # All input variables of `c_opr` are converted, so `c_opr` itself can be converted.
                    self._convert_operator(c_opr)
                    pending_c_oprs.remove(c_opr)
                    break  # restart iteration over pending_c_oprs, which was just mutated
            else:
                console.debug(pending_c_oprs)
                raise ValueError("Inputs to functions cannot be resolved.")

        # Append OutputVariable attribute to output variables
        output_n_vars = []
        for c_var in output_c_vars:
            if not self.has_variable(c_var):
                raise ValueError("Output variable is not generated by graph.")
            n_var = self.get_variable(c_var)
            n_var.attributes.add(Output)
            output_n_vars.append(n_var)

        graph = Graph(input_n_vars, output_n_vars)
        # Convert variable order into typical one in Chainer
        self._transpose_vars(graph)

        return graph
Example 19
try:
    import chainer
    import chainer.computational_graph

    if chainer.__version__ >= "2.":
        chainer_v2 = True
        # noinspection PyUnresolvedReferences
        VariableNode = chainer.variable.VariableNode
    else:
        chainer_v2 = False
        VariableNode = chainer.variable.Variable

    FLAG_CHAINER_INSTALLED = True

except ImportError:
    console.debug("Chainer is not completely installed.")


def _to_variable_node(
    chainer_variable: Union["chainer.Variable",
                            "VariableNode"]) -> "VariableNode":
    if chainer_v2 and not isinstance(chainer_variable, VariableNode):
        # noinspection PyUnresolvedReferences
        return chainer_variable.node
    else:
        return chainer_variable


class ChainerConverter(Converter["chainer.Function"]):
    """ChainerConverter()
Example 20
    def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
        if not (flags.optimize.OPTIMIZE and flags.optimize.CONCAT_SCALAR_OPERATION):
            return graph, False

        flag_changed = False

        matches = search_sub_structure(graph, [ScalarOperation, Variable, ScalarOperation])
        while len(matches) > 0:
            match = matches[0]
            op1 = match[0]  # type: Operator
            op2 = match[2]  # type: Operator

            y1 = op1.outputs["y"]
            y2 = op2.outputs["y"]

            if isinstance(op1, ScalarAffine):
                if isinstance(op2, ScalarAffine):
                    op1.scale = op1.scale * op2.scale
                    op1.bias = op1.bias * op2.scale + op2.bias
                    op2.remove_all()
                    op1.replace_output(y1, y2)

                elif isinstance(op2, ScalarAdd):
                    op1.bias += op2.value
                    op2.remove_all()
                    op1.replace_output(y1, y2)

                elif isinstance(op2, ScalarMul):
                    op1.scale *= op2.value
                    op1.bias *= op2.value
                    op2.remove_all()
                    op1.replace_output(y1, y2)

                else:
                    console.debug(f"[ConcatScalarOperation] unhandled pair: {type(op1)} and {type(op2)}")

            elif isinstance(op1, ScalarAdd):
                if isinstance(op2, ScalarAffine):
                    op2.bias += op1.value * op2.scale
                    x = op1.inputs["x0"]
                    op1.remove_all()
                    x.replace(y1)

                elif isinstance(op2, ScalarAdd):
                    op1.parameters["value"] += op2.value
                    op2.remove_all()
                    op1.replace_output(y1, y2)

                elif isinstance(op2, ScalarMul):
                    x = op1.inputs["x0"]
                    new_op = ScalarAffine(None, scale=op2.value, bias=op1.value * op2.value)
                    new_y, = new_op(x)
                    op1.remove_all()
                    op2.remove_all()
                    y2.replace(new_y)

                else:
                    console.debug(f"[ConcatScalarOperation] unhandled pair: {type(op1)} and {type(op2)}")

            elif isinstance(op1, ScalarMul):
                if isinstance(op2, ScalarAffine):
                    op2.scale *= op1.value
                    x = op1.inputs["x0"]
                    op1.remove_all()
                    x.replace(y1)

                elif isinstance(op2, ScalarAdd):
                    x = op1.inputs["x0"]
                    new_op = ScalarAffine(None, scale=op1.value, bias=op2.value)
                    new_y, = new_op(x)
                    op1.remove_all()
                    op2.remove_all()
                    y2.replace(new_y)

                elif isinstance(op2, ScalarMul):
                    op1.parameters["value"] *= op2.value
                    op2.remove_all()
                    op1.replace_output(y1, y2)

                else:
                    console.debug(f"[ConcatScalarOperation] unhandled pair: {type(op1)} and {type(op2)}")

            flag_changed = True
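            # note: the re-search below uses the narrower [ScalarAffine, Variable, ScalarAffine]
            # pattern, which also lets the loop terminate if an unhandled ScalarOperation pair remains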
            matches = search_sub_structure(graph, [ScalarAffine, Variable, ScalarAffine])

        return graph, flag_changed
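
A numpy check, standalone and not part of WebDNN, of the affine-composition rule applied above when two ScalarAffine operators are fused: applying scale1/bias1 and then scale2/bias2 equals a single affine with scale = scale1 * scale2 and bias = bias1 * scale2 + bias2.

import numpy as np

x = np.linspace(-2.0, 2.0, 5)
scale1, bias1 = 3.0, 0.5
scale2, bias2 = -2.0, 4.0

y_two_ops = (x * scale1 + bias1) * scale2 + bias2
y_fused = x * (scale1 * scale2) + (bias1 * scale2 + bias2)

assert np.allclose(y_two_ops, y_fused)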
Example 21
from webdnn.graph.graph import Graph
from webdnn.graph.order import Order
from webdnn.graph.variable import Variable
from webdnn.graph.variables.attributes.input import Input
from webdnn.graph.variables.attributes.output import Output
from webdnn.graph.variables.constant_variable import ConstantVariable
from webdnn.util import console

FLAG_ONNX_INSTALLED = False
try:
    import onnx

    FLAG_ONNX_INSTALLED = True

except ImportError:
    console.debug("ONNX is not completely installed.")


def attribute_dict(proto: INodeProto) -> Dict[str, IAttributeProto]:
    return {attr.name: attr for attr in proto.attribute}


class ONNXConverter(Converter["onnx.NodeProto"]):
    """ONNXConverter()

    Converter for `Open Neural Network Exchange (ONNX) <http://onnx.ai/>`_.

    To use this converter, you need to install ONNX python module. see `ONNX github repository <https://github.com/onnx/onnx>`_.
    """
Example 22
def dump(graph: Graph):
    indent = ""
    for op in listup_operators(graph):
        parameters_sorted = [repr(key) + ': ' + str(op.parameters[key]) for key in sorted(op.parameters.keys())]
        console.debug(f"---------------------------------------------------------------------------")
        console.debug(f"{indent}{op.__class__.__name__} : {op.name}")
        console.debug(f"{indent}    In  : {op.inputs}")
        console.debug(f"{indent}    Out : {op.outputs}")
        console.debug(f"{indent}    Attr: {sorted([attr.__class__.__name__ for attr in op.attributes])}")
        console.debug(f"{indent}    Parameters: {{{', '.join(parameters_sorted)}}}")
Example 23
def dump(graph: Graph):
    for op in listup_operators(graph):
        console.debug(
            f"---------------------------------------------------------------------------"
        )
        dump_op(op)
Example 24
def _optimize_buffer_reuse(allocations_dict: AllocationDict):
    """
    Optimize memory size by reusing buffer if available

    Algorithm:

    Consider five variables with the following sizes and lifetimes.

        Size and Lifetime)

          var |size| Lifetime (t=0 -> ...)
          ----+----+---------------
            a |  5 | [0, 2)
            b |  4 | [2, 4)
            c |  2 | [3, 5)
            d |  1 | [6, 8)
            e |  3 | [0, 5)
          ----+----+---------------

    In this case, we want to obtain the following optimized allocation:

         ---------> address
    time
     |    aaaaa_e
     |    aaaaa_e
     |    bbbb__e
     |    bbbbcce
     |    ____cce
     V    d______
          d______

    First, construct "Merge Offset Table".

    table = {
        a: {},
        b: { a: 0 },
        c: { a: 0, b: 4, e: 3 },
        d: { a: 0, b: 0, c: 0, e: 0 },
        e: { a: 5, b: 4 }
    }

    `table[x][y]` is the offset used when variable `x` is merged into variable `y`. For example, when `b` is merged into
    (= reuses the memory allocated for) `a`, the offset is `0` because the two do not exist at the same time. However, when `c` is
    merged into `b`, the offset is `4` because they exist at the same time (t=3).

    Next, for each mergeable pair, calculate the size that would be saved if the two variables were merged. For example, if `b` is
    merged into `a`, the reduced size is `4`.

    Then merge the pair with the largest reduced size. In this case that pair is `a` and `b`. After merging, update the table:

    table = {
        ab: {},
        c: { ab: 4, e: 3 },
        d: { ab: 0, c: 0, e: 0 },
        e: { ab: 5 }
    }

    Iterate this procedure until all variables are merged into a single allocation.

    Merge `d` into `ab` with offset `0`:

        table = {
            abd: {},
            c: { abd: 4, e: 3 },
            e: { abd: 5 }
        }

    Merge `c` into `abd` with offset `4`:

        table = {
            abcd: {},
            e: { abcd: 5 }
        }

    Merge `e` into `abcd` with offset `5`:

        table = {
            abcde: {},
        }

    Finish.

    Time complexity:
        Build Table: O(N^2)
        Iteration: O(N) times
            update table: O(N)

        Total: O(N^2)
    """
    if not flags.optimize.OPTIMIZE:
        console.debug('_optimize_buffer_reuse is skipped')
        return

    allocations = list(
        set(
            filter(lambda x: Placeholder.check_resolved(x),
                   allocations_dict.values())))
    allocations = sorted(allocations, key=lambda a: a.size, reverse=True)

    # Construct offset table
    offset_table = {a2: {} for a2 in allocations}
    for i1, a1 in enumerate(allocations):
        for i2, a2 in enumerate(allocations[i1 + 1:]):
            # align offset as 16-byte alignment
            offset_table[a2][a1] = 0 if (
                a1.end <= a2.begin or a2.end <= a1.begin) else _align(a1.size)

    # Merge
    merge_tree = {}  # type: Dict[Allocation, Tuple[Allocation, int]]
    while len(offset_table) > 1:
        # Get max score pair
        max_score = -1
        max_a1 = None
        max_a2 = None
        for a2, a1s in offset_table.items():
            for a1, offset in a1s.items():
                score = max(min(a1.size - offset, a2.size), 0)
                if max_score < score:
                    max_score = score
                    max_a1 = a1
                    max_a2 = a2

        # Merge
        a1 = max_a1
        a2 = max_a2
        offset12 = offset_table[a2][a1]
        merge_tree[a2] = (a1, offset12)

        # Update offset table
        for a3, offset32 in offset_table[a2].items():
            if a1 in offset_table[a3]:
                # a2->a3->a1
                # |       |
                # +-------+
                #
                # a1      |/////a1///////|
                # a2      <----offset12-->|//a2//|
                #             <-offset32->
                # a3          |////a3////|
                offset_table[a3][a1] = max(offset_table[a3][a1],
                                           _align(offset12 + a2.size))

        del offset_table[a2]
        for a3, a4s in offset_table.items():
            if a3 == a1:
                continue

            if a2 in a4s:
                if a1 in a4s:
                    # a3->a2->a1
                    # |       |
                    # +-------+

                    a4s[a1] = max(a4s[a1], offset12 + a4s[a2])
                    del a4s[a2]

                else:
                    a4s[a1] = offset12 + a4s[a2]
                    del a4s[a2]

    # Update all allocation offset value
    list(offset_table.keys())[0].offset = 0

    for a2, (a1, offset) in merge_tree.items():
        while a1 in merge_tree:
            a1, offset2 = merge_tree[a1]
            offset += offset2
        a2.offset = offset
Example 25
def dump(graph: Graph):  # pragma: no cover
    for op in listup_operators(graph):
        console.debug(
            f"---------------------------------------------------------------------------"
        )
        dump_op(op)
Example 26
def _split_axis(v: Variable, axis: Axis, graph):
    """
    Split a variable along the specified axis.
    """
    s1 = v.shape_dict[axis] // 2
    s2 = v.shape_dict[axis] - s1

    if isinstance(v, ConstantVariable):
        v_datum = np.split(v.data, [s1], v.order.axes_dict[axis])
        v1 = ConstantVariable(v_datum[0], v.order)
        v2 = ConstantVariable(v_datum[1], v.order)

    else:
        v1 = Variable([s1 if a == axis else v.shape_dict[a] for a in v.order.axes], v.order)
        v2 = Variable([s2 if a == axis else v.shape_dict[a] for a in v.order.axes], v.order)

    ops = list(v.input_to)
    if v.output_from is not None:
        ops += [v.output_from]

    for op in ops:
        if all(isinstance(w, ConstantVariable) for w in op.inputs.values()):
            op.fold_constance()

        elif isinstance(op, SplitAxis):
            _split_splitaxis(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Concat):
            _split_concat(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Im2Col):
            _split_im2col(graph, op, v, [v1, v2], axis)

        elif isinstance(op, PartialIm2Col):
            _split_partial_im2col(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Reshape):
            _split_reshape(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Sgemm):
            _split_sgemm(graph, op, v, [v1, v2], axis)

        elif Tensorwise.check_splittable(op, axis):
            _split_tensorwise(graph, op, v, [v1, v2], axis)

        else:
            console.debug("-------------------------------------------------")
            console.debug(f"{v}")
            console.debug(f"  original order: {v.order}")
            console.debug(f"  original shape: {v.shape}")
            console.debug(f"")
            console.debug(f"  split axis: {axis}")
            console.debug(f"")
            console.debug(f"  related operators:")
            for related_op in ops:
                console.debug(f"  {related_op}")
            console.debug(f"")

            with open("cg-failed.dot", "w") as f:
                f.write(traverse.dump_dot(graph))

            raise NotImplementedError(f"Variable is too large to handle in WebGL backend: {v}")
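
A tiny numpy illustration, not WebDNN code, of the halving performed above: a constant with extent 5 along the split axis yields two pieces with extents 2 and 3, mirroring the np.split call in the ConstantVariable branch.

import numpy as np

data = np.arange(5 * 3).reshape(5, 3)  # axis 0 plays the role of `axis`
s1 = data.shape[0] // 2                # s1 = 2, s2 = 5 - s1 = 3
first, second = np.split(data, [s1], axis=0)
assert first.shape == (2, 3) and second.shape == (3, 3)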
Example 27
def _choose_split_axis(v: Variable) -> Axis:
    """
    For a too-large texture `v`, choose the axis along which splitting `v` best reduces the texture size.

    Args:
        v: Variable, whose size is too large (= this variable has :code:`SplitTarget` attribute)

    Returns:
        axis
    """

    ops = list(v.input_to)
    if v.output_from is not None:
        ops += [v.output_from]

    splittable_axes = list(v.order.axes)
    for op in ops:
        _op_splittable_axes = _listup_splittable_axis(
            v, op) + [attr.axis for attr in op.get_attribute(Tensorwise)]
        for a in list(splittable_axes):
            if a not in _op_splittable_axes:
                splittable_axes.remove(a)

    if len(splittable_axes) == 0:
        raise ValueError("No axis is splittable")

    # Calculate the size of a side of texture which will be changed when each axis is split
    #
    # ex) OrderNC, N=512, C=2048, texture(width=2048, height=512)
    #     => If axis `N` is split, then height will be changed => N: 512 (=height)
    #        If axis `C` is split, then width will be changed => C: 2048 (=width)
    #
    # ex) OrderNCHW, N=1, C=512, H=13, W=13, texture(width=2048, height=43)
    #     => TexW == W*H*(partial of C) texture width consists of axis W, H and C.
    #        TexH == (partial of C)*N   texture height consists of axis C and N.
    #     => N cannot be split => N: -1
    #        C is related both width and height. In this case, use large one. => C: 2048
    #        H is included in width =>  H: 2048
    #        W is also included in width =>  W: 2048

    axis_corresponding_texture_size = AxisKeyDict()
    element_per_pixel = ChannelMode.elements_per_pixel(v)
    tex_h, tex_w = TextureShape.get(v)
    tex_w = (tex_w + element_per_pixel - 1) // element_per_pixel
    for a in v.order.axes:
        if v.shape_dict[a] == 1:
            # This axis cannot be split
            axis_corresponding_texture_size[a] = -1

        elif v.stride_dict[a] >= tex_w * element_per_pixel:
            axis_corresponding_texture_size[a] = tex_h

        elif v.stride_dict[a] * v.shape_dict[a] >= tex_w * element_per_pixel:
            axis_corresponding_texture_size[a] = max(tex_h, tex_w)

        else:
            axis_corresponding_texture_size[a] = tex_w

    splittable_axes.sort(key=lambda a: axis_corresponding_texture_size[a],
                         reverse=True)
    target_axis = splittable_axes[0]

    console.debug(
        f"==========================================================================="
    )
    console.debug(f"{v}")
    console.debug(f"  original order: {v.order}")
    console.debug(f"  original shape: {v.shape}")
    console.debug(f"   texture shape: {TextureShape.get(v)}")
    console.debug(f"")
    console.debug(f"  splittable axis: {splittable_axes}")
    console.debug(f"  split axis: {target_axis}")
    console.debug(f"")
    console.debug(f"  related operators:")
    for related_op in ops:
        console.debug(
            f"---------------------------------------------------------------------------"
        )
        traverse.dump_op(related_op)
    console.debug(f"")

    if axis_corresponding_texture_size[target_axis] <= 0:
        raise NotImplementedError(
            f"Variable is too large to handle in WebGL backend: {v}")

    return target_axis
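
A worked sketch, not WebDNN code, of the OrderNC case from the comment above: for a row-major N x C tensor mapped onto a texture of height N and width C (one element per pixel assumed), an axis whose stride spans a full texture row corresponds to the height, and the innermost axis corresponds to the width.

N, C = 512, 2048
tex_h, tex_w = N, C
shape = {"N": N, "C": C}
strides = {"N": C, "C": 1}  # row-major strides

size = {}
for axis, stride in strides.items():
    if stride >= tex_w:
        size[axis] = tex_h                 # splitting this axis changes the texture height
    elif stride * shape[axis] >= tex_w:
        size[axis] = max(tex_h, tex_w)     # axis spans both sides; take the larger
    else:
        size[axis] = tex_w                 # splitting this axis changes the texture width

assert size == {"N": 512, "C": 2048}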
Example 28
from webdnn.graph.graph import Graph
from webdnn.graph.order import Order, OrderNC, OrderNTC, OrderNHWC, OrderC
from webdnn.graph.placeholder import Placeholder
from webdnn.graph.variable import Variable
from webdnn.graph.variables.constant_variable import ConstantVariable
from webdnn.optimizer.tensorflow_frontend_optimize_rule import TensorFlowFrontendOptimizeRule
from webdnn.util import console
from webdnn.util import flags

FLAG_TF_INSTALLED = True

try:
    import tensorflow as tf

except ImportError:
    console.debug("TensorFlow is not completely installed.")
    FLAG_TF_INSTALLED = False


def get_default_order(ndim: int):
    if ndim == 1:
        return OrderC

    elif ndim == 2:
        return OrderNC

    elif ndim == 3:
        return OrderNTC

    elif ndim == 4:
        return OrderNHWC