Example #1
class CPUTransformer(ExecutionGraphTransformer):
    """
    Transformer for executing graphs on a CPU, backed by numpy.

    Given a list of ops you want to compute the results of, this transformer
    will compile the graph required to compute those results and exposes an
    evaluate method to execute the compiled graph.
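
    A minimal usage sketch (assuming ngraph's top-level API, e.g.
    ``ngt.make_transformer`` and ``transformer.computation``; these names
    come from the broader ngraph API, not this file):

        import ngraph as ng
        import ngraph.transformers as ngt

        x = ng.placeholder(())                  # scalar input
        transformer = ngt.make_transformer()    # default factory is the CPU transformer
        plus_one = transformer.computation(x + 1, x)
        print(plus_one(1))                      # runs the compiled graph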
    """

    transformer_name = "cpu"
    default_rtol = 1e-05
    default_atol = 1e-08

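    # Probe for the optional Intel MLSL (Machine Learning Scaling Library)
    # bindings; when present, initialize_module() sets up an MLSL object and
    # close() tears it down again.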
    import imp
    try:
        imp.find_module('mlsl')
        use_mlsl = True
    except ImportError:
        use_mlsl = False

    def __init__(self, **kwargs):
        super(CPUTransformer, self).__init__(**kwargs)
        self.device_computation = None
        self.conv_engine = CPUConvEngine()
        self.init_code = CPUCodeGenerator(self)
        self.allocate_storage_code = CPUCodeGenerator(self)
        self.allocate_code = CPUCodeGenerator(self)
        self.code = CPUCodeGenerator(self)
        self.globals = PyModule(prefix="op")
        self.initialize_module(self.globals)
        self.n_computations = 0
        self.use_pinned_mem = False
        self.rng_seed = None

        self.exop_codegen_pools = CPUCodeGenerator(self)
        self.exop_codegen_tensor = CPUCodeGenerator(self)
        self.exop_codegen_tensor_view = CPUCodeGenerator(self)
        self.exop_codegen = CPUCodeGenerator(self)
        self.exop_codegen_define_length = 0
        self.prefix = ''

        # from ngraph.transformers.passes.exnviz import ExVizPass
        # from ngraph.transformers.passes.verify import VerifyPass
        # from ngraph.transformers.passes.visualizemem import VisualizeMemPass
        # from ngraph.transformers.passes.dumpgraphpass import DumpGraphPass

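        # Build the pass pipeline in execution order: optional MKL-DNN fusion
        # first, then layout/shaping/pruning, and finally SSA conversion,
        # liveness analysis, and memory layout at the end of this method.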
        self.graph_passes = []
        if self.mkldnn.enabled:
            self.graph_passes.append(CPUFusion())
        self.graph_passes += [
            # ExVizPass(view=True, filename="initial"),
            CPUTensorLayout(),
            SimplePrune(),
            RequiredTensorShaping(),
            CPUTensorShaping(),
            DeadCodeEliminationPass(),
        ]

        add_layout_conversion = AddLayoutConversions(None)
        if self.mkldnn.enabled:
            self.graph_passes.append(
                MklCreateOpDescriptors(mkldnn=self.mkldnn))
            # Run dead-code elimination after each MKL-DNN pass to clean up
            # ops the pass has replaced.
            self.graph_passes.append(DeadCodeEliminationPass())
            self.graph_passes.append(
                MklAddLayoutConversions(mkldnn=self.mkldnn,
                                        layoutpass=add_layout_conversion))
            self.graph_passes.append(DeadCodeEliminationPass())
        self.graph_passes += [
            SSAConversion(),
            IndexElision(),
            # DCE here eliminates return values. Need to figure out why.
            # DeadCodeEliminationPass(),
            LivenessPass(),
            MemOptimizePass(),
            LivenessPass(),
            MemLayoutPass()
        ]
        # DumpGraphPass(filename=graph_name+'.txt').do_pass(computation_decl)

        # VisualizeMemPass(filename=mem_name+'.html').do_pass(computation_decl)
        # ExVizPass(view=False, filename=graph_name).do_pass(computation_decl)

    def finish_allocate_computation(self, computation):
        self.exop_codegen.endl(2)

    def start_define_computation(self, computation_decl):
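        # Emits one Python class per computation. Illustrative shape of the
        # generated source (a sketch, not verbatim output):
        #
        #   class some_computation(HetrLocals, ConvLocals):
        #       def __init__(self, **kwargs):
        #           super(some_computation, self).__init__(**kwargs)
        #           # ...one allocation per exop...
        #       def __call__(self):
        #           # ...one statement per exop...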
        self.exop_codegen.append("class {}(HetrLocals, ConvLocals):",
                                 computation_decl.computation_op.name)
        with indenting(self.exop_codegen):
            self.exop_codegen.append("def __init__(self, **kwargs):")
            with indenting(self.exop_codegen):
                if is_tracing_enabled():
                    self.exop_codegen.append("""
self.__profiler_start__ = list()
self.__profiler_stop__  = list()
""")
                self.exop_codegen.append('super({}, self).__init__(**kwargs)',
                                         computation_decl.computation_op.name)
                for exop in computation_decl.exop_block:
                    output_decl = exop.output_decls[0] if len(
                        exop.output_decls) > 0 else None
                    # TODO better way to deal with multiple values
                    self.exop_codegen.exop = exop
                    self.exop_codegen.allocate_op(exop.op, output_decl,
                                                  *exop.input_decls)

            self.exop_codegen.endl()

        self.exop_codegen.indent(1)
        self.exop_codegen.append("def __call__(self):")
        self.exop_codegen.indent(1)
        self.exop_codegen_define_length = self.exop_codegen.code_length

    def generate_exop(self, exop):
        value = exop.output_decls[0] if len(exop.output_decls) > 0 else None
        # TODO better way to deal with multiple values
        self.exop_codegen.exop = exop
        self.exop_codegen.generate_op_pre(exop.op)
        self.exop_codegen.generate_op(exop.op, value, *exop.input_decls)
        self.exop_codegen.generate_op_post(exop.op)

    def finish_define_computation(self, computation_decl):
        if self.exop_codegen_define_length == self.exop_codegen.code_length:
            self.exop_codegen.append('pass')
        self.exop_codegen.indent(-2)

    def finish_load_computation(self, computation_decl):
        device_computation = computation_decl.device_computation
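        # memory_footprint() and persistent_size() are presumably byte
        # counts; dividing by 4 yields element counts for the float32
        # pools allocated below.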
        temp_pool_size = computation_decl.exop_block.memory_footprint() // 4
        persistent_pool_size = (
            computation_decl.exop_block.persistent_size() // 4)
        self.exop_codegen_pools.append(
            "{}_temporary_pool = np.empty({}, dtype=np.dtype('{}'))",
            computation_decl.computation_op.name, temp_pool_size, 'float32')
        self.exop_codegen_pools.append(
            "{}_persistent_pool = np.empty({}, dtype=np.dtype('{}'))",
            computation_decl.computation_op.name, persistent_pool_size,
            'float32')

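        # Stitch the generated sections into one module source (pools,
        # tensors, tensor views, then the computation class) and compile
        # everything in a single pass.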
        code = '#---------------------------------------------\n'
        code += '# memory pool\n'
        code += '#---------------------------------------------\n'
        code += self.exop_codegen_pools.take_code()
        code += '\n\n#---------------------------------------------\n'
        code += '# tensor\n'
        code += '#---------------------------------------------\n'
        code += self.exop_codegen_tensor.take_code()
        code += '\n\n#---------------------------------------------\n'
        code += '# tensor view\n'
        code += '#---------------------------------------------\n'
        code += self.exop_codegen_tensor_view.take_code()
        code += '\n\n#---------------------------------------------\n'
        code += '# code\n'
        code += '#---------------------------------------------\n'
        code += self.exop_codegen.take_code()
        self.globals.compile(code)
        cls = self.globals[computation_decl.computation_op.name]
        executor = cls(
            conv_params=device_computation.conv_params,
            pool_params=device_computation.pool_params,
            conv_slices=device_computation.conv_slices,
            pool_slices=device_computation.pool_slices,
            send_nodes=device_computation.send_nodes,
            recv_nodes=device_computation.recv_nodes,
            scatter_send_nodes=device_computation.scatter_send_nodes,
            scatter_recv_nodes=device_computation.scatter_recv_nodes,
            gather_send_nodes=device_computation.gather_send_nodes,
            gather_recv_nodes=device_computation.gather_recv_nodes,
            allreduce_nodes=device_computation.allreduce_nodes,
            broadcast_send_nodes=device_computation.broadcast_send_nodes,
            broadcast_recv_nodes=device_computation.broadcast_recv_nodes)
        return executor

    def make_device_tensor(self, computation, tensor_decl):
        """
        Make a DeviceTensor.

        Arguments:
            computation: The computation the tensor is allocated for.
            tensor_decl: A TensorDecl.

        Returns: A DeviceTensor.
        """
        return CPUDeviceTensor(self, computation, tensor_decl)

    def initialize_module(self, module):
        module.execute("""from __future__ import print_function
from builtins import print
import os
import numpy as np
import ctypes as ct
import numpy.ctypeslib as npct
import itertools as itt
from monotonic import monotonic as monotonic
try:
    import mlsl
    import ctypes
except ImportError:
    pass
from ngraph.op_graph import axes
from ngraph.transformers.cpu.cpuengine import fprop_lut, update_lut
from ngraph.transformers.cpu.cpuengine import Mkldnn
from ngraph.transformers.cpu.cpuengine import ConvLocals
from ngraph.transformers.cpu.hetr import HetrLocals
from ngraph.transformers.cpu.ctc import ctc_cpu
        """)

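        # Load the native MKL-DNN engine from the package root (two levels
        # above this file) and expose the handle to the generated module.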
        mkldnn_path = os.path.join(os.path.dirname(__file__), "..", "..")
        mkldnn_engine_path = os.path.join(mkldnn_path, 'mkldnn_engine.so')
        module.execute("mkldnn = Mkldnn('{}')".format(mkldnn_engine_path))
        module.execute("mkldnn.open()")
        self.mkldnn = module['mkldnn']
        if self.use_mlsl:
            module.execute("mlsl_obj = mlsl.MLSL()")
            module.execute("mlsl_obj.init()")

    def transform_allocate_ops(self, all_ops):
        def tensor_description_value(x):
            if isinstance(x, TensorDescription):
                return self.get_tensor_description_tensor_view(x)
            return x

        for op in all_ops:
            out = tensor_description_value(op.forwarded.tensor_description())
            call_info = (tensor_description_value(_) for _ in op.call_info())
            self.compute_code.allocate_op(op, out, *call_info)

    def finish_transform_allocate(self):
        pass

    def allocate_storage(self):
        pass

    def close(self):
        if self.code is not None:
            try:
                if self.globals.get('mkldnn', None) is not None:
                    self.globals.execute('mkldnn.close()')
                if self.globals.get('mlsl_obj', None) is not None:
                    for device_buffer in self.device_buffers:
                        self.globals.execute(
                            "mlsl_obj.free({}.__array_interface__['data'][0])".
                            format(device_buffer.ref_str))

                    self.globals.execute('mlsl_obj.finalize()')
            except TypeError:
                pass
        self.code = None

    def consume(self, buf_index, hostlist, devlist):
        """
        Currently used for Aeon data loading; actual device buffer
        allocation still needs to be wired up here.
        """
        assert 0 <= buf_index < 2, 'Can only double buffer'
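        # np.rollaxis moves axis 0 to the last position so the host batch
        # matches the expected device-side layout; the device buffer is
        # allocated lazily on first use.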
        hb = np.rollaxis(hostlist[buf_index], 0, hostlist[buf_index].ndim)
        if devlist[buf_index] is None:
            devlist[buf_index] = np.empty_like(hb)
        devlist[buf_index][:] = hb

    def make_computation(self, computation):
        return CPUDeviceComputation(self, computation)