def f(local_opt):
    """Register ``local_opt`` with ``gpu_optimizer2`` and return it.

    ``kwargs``, ``tracks`` and ``tags`` are not defined in this block; they
    are read from the surrounding scope.

    Parameters
    ----------
    local_opt
        Either an ``OptimizationDatabase`` instance, which is registered
        as-is, or a callable that is first wrapped with
        ``local_optimizer(tracks)``.

    Returns
    -------
    The ``local_opt`` argument itself, unchanged.
    """
    # Giving ``pop`` a default avoids a KeyError when ``kwargs`` carries other
    # entries but no "name"; a missing or falsy name falls back to the
    # callable's own ``__name__``.
    name = (kwargs and kwargs.pop("name", None)) or local_opt.__name__
    if isinstance(local_opt, OptimizationDatabase):
        opt = local_opt
    else:
        opt = local_optimizer(tracks)(local_opt)
    # The registration always carries the "fast_run" and "gpuarray" tags,
    # followed by whatever extra tags the surrounding scope supplies.
    gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
    return local_opt
def f(maker):
    """Build a local optimizer around ``maker`` for nodes whose op is an ``OP``.

    ``OP`` and ``cuda_only`` are not defined in this block; they are read from
    the surrounding scope.

    Parameters
    ----------
    maker
        Callable invoked as
        ``maker(node.op, context_name, node.inputs, node.outputs)``. Its
        result may be an ``Op`` (which is then applied to the node inputs), a
        tuple/list of outputs, or any other single object, which is treated
        as one variable exposing ``transfer("cpu")``.

    Returns
    -------
    The result of ``local_optimizer(OP)(local_opt)``, where ``local_opt`` is
    the wrapper defined below, renamed after ``maker``.
    """

    def _transfer_to_cpu(var):
        # Used when ``maker`` returns a bare variable (assumed to be on the GPU).
        return var.transfer("cpu")

    def local_opt(fgraph, node):
        """Return the replacement outputs for ``node``, or ``False`` for none."""
        if not isinstance(node.op, OP):
            return False

        # Either one of our inputs is on the gpu or
        # all of our clients are on the gpu.
        # TODO: Maybe set context_name with infer_context_name()?
        context_name = None
        should_lift = False

        # A single host_from_gpu input is enough to trigger the replacement.
        for inp in node.inputs:
            if inp.owner and inp.owner.op == host_from_gpu and move_to_gpu(inp):
                context_name = inp.owner.inputs[0].type.context_name
                should_lift = True
                break

        if not should_lift:
            # Otherwise replace only if *all* clients are on the GPU
            # (and there is at least one client).
            consumers = [
                client for out in node.outputs for client in fgraph.clients[out]
            ]
            should_lift = len(consumers) != 0
            for consumer, _position in consumers:
                if consumer == "output" or not isinstance(consumer.op, GpuFromHost):
                    should_lift = False
            # TODO: check that the clients want the same context?
            if should_lift:
                # All clients are GpuFromHost and we have at least one.
                context_name = consumers[0][0].op.context_name

        # Bail out in the same order the conditions were originally chained.
        if not should_lift:
            return False
        if cuda_only and get_context(context_name).kind != b"cuda":
            return False
        if any("complex" in getattr(inp, "dtype", "") for inp in node.inputs):
            return False

        # Tag the inputs with the context in case the context was derived
        # from the outputs.
        for inp in node.inputs:
            inp.tag.context_name = context_name

        new_op = maker(node.op, context_name, node.inputs, node.outputs)

        # This is needed as sometimes new_op inherits from OP.
        if new_op and new_op != node.op:
            if isinstance(new_op, Op):
                gpu_outputs = new_op(*node.inputs, return_list=True)
                to_host = safe_to_cpu
            elif isinstance(new_op, (tuple, list)):
                gpu_outputs = new_op
                to_host = safe_to_cpu
            else:
                # Suppose it is a variable on the GPU.
                gpu_outputs = [new_op]
                to_host = _transfer_to_cpu

            # Copy stack traces onto the gpu outputs and also onto the
            # outputs brought back to the host.
            host_outputs = []
            for old_out, gpu_out in zip(node.outputs, gpu_outputs):
                copy_stack_trace(old_out, gpu_out)
                host_out = to_host(gpu_out)
                host_outputs.append(host_out)
                copy_stack_trace(old_out, host_out)
            return host_outputs

        return False

    local_opt.__name__ = maker.__name__
    return local_optimizer(OP)(local_opt)
mode=mode) return unpad_dims(ret_padded, img, 2, nd) pool_db.register( "local_gpua_pool_dnn_alternative", op_lifter([Pool])(local_gpua_pool_dnn_alternative), "gpuarray", "fast_compile", "fast_run", "cudnn", position=0, ) pool_db2.register( "local_gpua_pool_dnn_alternative", local_optimizer([Pool])(local_gpua_pool_dnn_alternative), "gpuarray", "fast_compile", "fast_run", "cudnn", position=0, ) def local_gpua_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs): if not dnn_available(ctx_name): return if not op.ignore_border: return inp, out, out_grad, ws, stride, pad = inputs nd = op.ndim