def train(config):
    '''
    Training function for EWM generator model.

    Each epoch alternates between two phases:
      1. OTS: updates the dual variable ``psi`` with Adam while recording the
         latent batches and the indices of the matched target samples.
      2. FIT: replays the recorded batches and trains the generator ``G``
         towards the recorded matches.

    Args:
        config (dict): experiment configuration. Keys read in this function:
            'gpu', 'psi_lr', 'tess_var', 'batch_size', 'num_epochs',
            'mem_size', 'z_dim' and 'l_dim'. 'early_end' is overwritten and
            the dict is replaced by the return value of ``utils.directories``.

    Returns:
        None. Checkpoints and training history are written through ``utils``.
    '''
    # Create python version of cpp operation
    # (Credit: Chen, arXiv:1906.03471, GitHub: https://github.com/chen0706/EWM)
    from torch.utils.cpp_extension import load
    # load() returns the compiled module itself. A follow-up `import my_ops`
    # is redundant and can fail, because the JIT build is not guaranteed to
    # be importable by name.
    my_ops = load(name="my_ops",
                  sources=["W1_extension/my_ops.cpp",
                           "W1_extension/my_ops_kernel.cu"],
                  verbose=False)

    # Set up GPU device ordinal - if this fails, use CUDA_LAUNCH_BLOCKING environment param...
    device = torch.device(config['gpu'])

    # Get model kwargs
    ewm_kwargs = setup_model.ewm_kwargs(config)

    # Setup model on GPU
    G = ewm_G(**ewm_kwargs).to(device)
    G.weights_init()
    print(G)
    input('Press any key to launch')

    # Setup model optimizer
    model_params = {'g_params': G.parameters()}
    G_optim = utils.get_optim(config, model_params)

    # Set up full_dataloader (single batch)
    dataloader = utils.get_dataloader(config)  # Full Dataloader
    dset_size = len(dataloader)

    # Flatten the dataloader into a Tensor of shape [dset_size, l_dim]
    dataloader = dataloader.view(dset_size, -1).to(device)

    # Set up psi optimizer: psi is a leaf tensor created directly on the device
    psi = torch.zeros(dset_size, device=device, requires_grad=True)
    psi_optim = torch.optim.Adam([psi], lr=config['psi_lr'])

    # Set up directories for saving training stats and outputs
    config = utils.directories(config)

    # Set up dict for saving checkpoints
    checkpoint_kwargs = {'G': G, 'G_optim': G_optim}

    # Variance argument for the tessellation vectors
    tess_var = config['tess_var']**0.5

    # Compute the stopping criterion using set of test vectors
    # and computing the 'ideal' loss between the test/target.
    print(line(60))
    print("Computing stopping criterion")
    print(line(60))
    stop_criterion = []
    test_loader = utils.get_test_loader(config)
    for test_vecs in test_loader:
        # Flatten by the actual batch length so that a short final batch
        # does not break the reshape.
        test_vecs = test_vecs.view(test_vecs.shape[0], -1).to(device)

        # Add Gaussian noise to test_vectors ('Perfect' generator model)
        t1 = tess_var*torch.randn(test_vecs.shape[0], test_vecs.shape[1]).to(device)
        test_vecs += t1

        # Add Gaussian noise to target data
        t2 = tess_var*torch.randn(dataloader.shape[0], dataloader.shape[1]).to(device)
        test_target = dataloader + t2

        # Compute the stop score
        stop_score = my_ops.l1_t(test_vecs, test_target)
        stop_loss = -torch.mean(stop_score)
        stop_criterion.append(stop_loss.cpu().detach().numpy())
    del test_loader

    # Set stopping criterion variables
    stop_min, stop_mean, stop_max = np.min(stop_criterion), np.mean(stop_criterion), np.max(stop_criterion)
    print(line(60))
    print('Stop Criterion: min: {}, mean: {}, max: {}'.format(round(stop_min, 3),
                                                              round(stop_mean, 3),
                                                              round(stop_max, 3)))
    print(line(60))

    # Set up stats logging
    hist_dict = {'hist_min': [], 'hist_max': [], 'ot_loss': []}
    losses = {'ot_loss': [], 'fit_loss': []}
    history = {'dset_size': dset_size, 'epoch': 0, 'iter': 0,
               'losses': losses, 'hist_dict': hist_dict}
    config['early_end'] = (200, 320)  # Empirical stopping criterion from EWM author
    stop_counter = 0

    # Running total of every OT loss appended so far. It replaces repeated
    # np.mean() calls over the whole (ever-growing) history list.
    ot_loss_total = 0.0

    # Set up progress bar for terminal output and enumeration
    epoch_bar = tqdm(range(config['num_epochs']))

    # Training Loop
    for epoch in epoch_bar:
        history['epoch'] = epoch

        # Set up memory lists:
        #   - mu: simple feed-forward distribution
        #   - transfer: transfer plan given by lists of indices
        # Rule-of-thumb: do not save the tensors themselves: instead, save the
        # data as a list and convert it to a tensor as needed.
        mu = [0] * config['mem_size']
        transfer = [0] * config['mem_size']
        mem_idx = 0

        # Compute the Optimal Transport Solver
        for ots_iter in range(1, dset_size//2):
            history['iter'] = ots_iter
            psi_optim.zero_grad()

            # Generate samples from feed-forward distribution
            z_batch = torch.randn(config['batch_size'], config['z_dim']).to(device)
            y_fake = G(z_batch)  # [B, dset_size]

            # Add Gaussian noise to the output of the generator function and
            # to the data with tessellation vectors
            t1 = tess_var*torch.randn(y_fake.shape[0], y_fake.shape[1]).to(device)
            t2 = tess_var*torch.randn(dataloader.shape[0], dataloader.shape[1]).to(device)
            y_fake += t1
            # Out-of-place on purpose: adding and then subtracting the noise
            # in place lets rounding error accumulate in the target data.
            noisy_target = dataloader + t2

            # Compute the W1 distance between the model output and the target distribution
            score = my_ops.l1_t(y_fake, noisy_target) - psi
            phi, hit = torch.max(score, 1)

            # Standard loss computation
            # This loss defines the sample mean of the marginal distribution
            # of the dataset. This is the only computation that generalizes.
            loss = -torch.mean(psi[hit])

            # Backprop
            loss.backward()
            psi_optim.step()

            # Update memory tensors
            mu[mem_idx] = z_batch.data.cpu().numpy().tolist()
            transfer[mem_idx] = hit.data.cpu().numpy().tolist()
            mem_idx = (mem_idx + 1) % config['mem_size']

            # Update losses
            loss_value = loss.item()
            history['losses']['ot_loss'].append(loss_value)
            ot_loss_total += loss_value
            avg_ot_loss = ot_loss_total / len(history['losses']['ot_loss'])

            if (ots_iter % 500 == 0):
                print('OTS Iteration {} | Epoch {} | Avg Loss Value: {}'.format(ots_iter, epoch,
                                                                                round(avg_ot_loss, 3)))

            # NOTE(review): the original file was whitespace-mangled; the
            # `break` is assumed to belong to the stop-band check - confirm.
            if ots_iter > (dset_size//3):
                if stop_min <= avg_ot_loss <= stop_max:
                    stop_counter += 1
                    break

        # Compute the Optimal Fitting Transport Plan
        for fit_iter in range(config['mem_size']):
            G_optim.zero_grad()

            # Retrieve stored batch of generated samples
            z_batch = torch.tensor(mu[fit_iter]).to(device)
            y_fake = G(z_batch)  # G'(z)

            # Get Transfer plan from OTS: T(G_{t-1}(z))
            t_plan = torch.tensor(transfer[fit_iter]).to(device)
            y0_hit = dataloader[t_plan].to(device)

            # Compute Wasserstein distance between G and T
            G_loss = torch.mean(torch.abs(y0_hit - y_fake)) * config['l_dim']

            # Backprop
            G_loss.backward()

            # Gradient descent
            G_optim.step()

            # Update losses
            history['losses']['fit_loss'].append(G_loss.item())

            # Check if best loss value and save checkpoint
            if 'best_loss' not in history:
                history.update({'best_loss': G_loss.item()})

            # Only a drop of at least 30% counts as a new best
            best = G_loss.item() < (history['best_loss'] * 0.70)
            if best:
                history['best_loss'] = G_loss.item()
                checkpoint = utils.get_checkpoint(history['epoch'], checkpoint_kwargs, config)
                utils.save_checkpoint(checkpoint, config)

            if (fit_iter % 500 == 0):
                avg_loss = np.mean(history['losses']['fit_loss'])
                print('FIT Iteration {} | Epoch {} | Avg Loss Value: {}'.format(fit_iter, epoch,
                                                                                round(avg_loss, 3)))

    # Save a checkpoint at end of training
    checkpoint = utils.get_checkpoint(history['epoch'], checkpoint_kwargs, config)
    utils.save_checkpoint(checkpoint, config)

    # Save training data to csv's after training end
    utils.save_train_hist(history, config, times=None, histogram=history['hist_dict'])
    print("Stop Counter Triggered {} Times".format(stop_counter))

    # For Aiur
    print("I see you have an appetite for destruction.")
    print("And you have learned to use your illusion.")
    print("But I find your lack of control disturbing.")
# Parse the installed torch version, dropping the local '+cpu' build tag.
# str.rstrip('+cpu') would strip any trailing run of the characters
# '+', 'c', 'p', 'u' rather than the literal suffix, so slice it off instead.
if torch.__version__.endswith('+cpu'):
    torch_version = version.parse(torch.__version__[:-len('+cpu')])
else:
    torch_version = version.parse(torch.__version__)

try:
    __version__ = get_distribution(__name__).version
except DistributionNotFound:
    # package is not installed
    pass

# Try to JIT-load the native STE ops; NATIVE_STE_BACKEND_LOADED records the outcome.
if config.JIT_ENABLED:
    extensions_dir = os.path.join(pkg_dir, 'csrc')
    # glob already returns paths prefixed with extensions_dir; joining the
    # prefix a second time would duplicate it when pkg_dir is relative.
    sources = glob.glob(os.path.join(extensions_dir, '*.cpp'))
    try:
        cpp_extension.load(
            name='autograd_ste_ops',
            sources=sources,
            is_python_module=False,
            verbose=config.VERBOSE)
        NATIVE_STE_BACKEND_LOADED = True
    except Exception:
        # Best effort: warn and continue without the native backend.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        warnings.warn("Brevitas' native STE backend is enabled but couldn't be loaded")
        NATIVE_STE_BACKEND_LOADED = False
else:
    NATIVE_STE_BACKEND_LOADED = False
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 1 07:59:09 2018

@author: nsde
"""

#%%
import torch
from torch.utils.cpp_extension import load

#%%
# Only build when executed as a script; importing this file compiles nothing.
if __name__ == '__main__':
    # _dir = get_dir(__file__)

    # Compile cpu source
    cpab_cpu = load('cpab_cpu', ['CPAB_ops.cpp'], verbose=True)

    # Compile gpu source
    cpab_gpu = load(
        'cpab_gpu',
        ['CPAB_ops_cuda.cpp', 'CPAB_ops_cuda_kernel.cu'],
        verbose=True,
    )
import torch from torch.utils.cpp_extension import load cd = load(name="cd", sources=[ "./chamfer_distance/chamfer_distance.cpp", "./chamfer_distance/chamfer_distance.cu" ]) class ChamferDistanceFunction(torch.autograd.Function): @staticmethod def forward(ctx, xyz1, xyz2): batchsize, n, _ = xyz1.size() _, m, _ = xyz2.size() xyz1 = xyz1.contiguous() xyz2 = xyz2.contiguous() dist1 = torch.zeros(batchsize, n) dist2 = torch.zeros(batchsize, m) idx1 = torch.zeros(batchsize, n, dtype=torch.int) idx2 = torch.zeros(batchsize, m, dtype=torch.int) if not xyz1.is_cuda: cd.forward(xyz1, xyz2, dist1, dist2, idx1, idx2) else: dist1 = dist1.cuda() dist2 = dist2.cuda() idx1 = idx1.cuda() idx2 = idx2.cuda() cd.forward_cuda(xyz1, xyz2, dist1, dist2, idx1, idx2)
def load_cpp(name, files, path):
    """JIT-build the extension ``name`` from ``files`` located under ``path``.

    The build is placed in the ``qrnn`` directory under ``Config().model``,
    which is created first if it does not exist. Returns whatever
    ``cpp_extension.load`` returns.
    """
    os.makedirs(Config().model / 'qrnn', exist_ok=True)

    source_paths = []
    for filename in files:
        source_paths.append(path / filename)

    build_dir = Config().model / 'qrnn'
    return cpp_extension.load(name=name,
                              sources=source_paths,
                              build_directory=build_dir)
# Ref: https://github.com/chrdiller/pyTorchChamferDistance import os, torch, torch.nn as nn from torch.utils.cpp_extension import load basedir = os.path.dirname(__file__) cd = load(name="cd", sources=[ os.path.join(basedir, "chamfer_distance.cpp"), os.path.join(basedir, "chamfer_distance.cu") ]) class ChamferDistanceFunction(torch.autograd.Function): @staticmethod def forward(ctx, xyz1, xyz2): batchsize, n, _ = xyz1.size() _, m, _ = xyz2.size() xyz1 = xyz1.contiguous() xyz2 = xyz2.contiguous() dist1 = torch.zeros(batchsize, n) dist2 = torch.zeros(batchsize, m) idx1 = torch.zeros(batchsize, n, dtype=torch.int) idx2 = torch.zeros(batchsize, m, dtype=torch.int) if not xyz1.is_cuda: cd.forward(xyz1, xyz2, dist1, dist2, idx1, idx2) else: dist1 = dist1.cuda() dist2 = dist2.cuda() idx1 = idx1.cuda()
import os import torch from torch import nn from torch.nn import functional as F from torch.autograd import Function from torch.utils.cpp_extension import load module_path = os.path.dirname(__file__) fused = load( "fused", sources=[ os.path.join(module_path, "fused_bias_act.cpp"), os.path.join(module_path, "fused_bias_act_kernel.cu"), ], ) class FusedLeakyReLUFunctionBackward(Function): @staticmethod def forward(ctx, grad_output, out, bias, negative_slope, scale): ctx.save_for_backward(out) ctx.negative_slope = negative_slope ctx.scale = scale empty = grad_output.new_empty(0) grad_input = fused.fused_bias_act(grad_output, empty, out, 3, 1, negative_slope, scale) dim = [0]
import os chamfer_found = importlib.find_loader("chamfer_6D") is not None if not chamfer_found: ## Cool trick from https://github.com/chrdiller print("Jitting Chamfer 6D") cur_path = os.path.dirname(os.path.abspath(__file__)) build_path = cur_path.replace('chamfer6D', 'tmp') os.makedirs(build_path, exist_ok=True) from torch.utils.cpp_extension import load chamfer_6D = load(name="chamfer_6D", sources=[ "/".join( os.path.abspath(__file__).split('/')[:-1] + ["chamfer_cuda.cpp"]), "/".join( os.path.abspath(__file__).split('/')[:-1] + ["chamfer6D.cu"]), ], build_directory=build_path) print("Loaded JIT 6D CUDA chamfer distance") else: import chamfer_6D print("Loaded compiled 6D CUDA chamfer distance") # Chamfer's distance module @thibaultgroueix # GPU tensors only class chamfer_6DFunction(Function): @staticmethod
# CXX=g++-4.9 CC=gcc-4.9 python jit.py
from torch.utils.cpp_extension import load
import os

# Directory containing this script. Both the sources and the build output are
# anchored here so the build does not depend on the current working directory.
cur_dir = os.path.abspath(os.path.dirname(__file__))

gpu_flow = load(
    "gpu_flow",
    [os.path.join(cur_dir, src) for src in ("gpu_flow.cpp", "gpu_flow_kernel.cu")],
    build_directory=cur_dir,
    verbose=True,
    extra_cuda_cflags=[
        "-arch=sm_52", "--ptxas-options=-v", "-c", "--compiler-options",
        "'-fPIC'"
    ],  # sm_35, sm_61
)

# Print the generated module's documentation as a quick check that it loaded.
help(gpu_flow)
import os import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd import Function # import neural_renderer.cuda.rasterize as rasterize_cuda from torch.utils.cpp_extension import load rasterize_cuda = load(name='rasterize_cuda', sources=[os.path.join(os.path.dirname(__file__), 'cuda/rasterize_cuda.cpp'), os.path.join(os.path.dirname(__file__), 'cuda/rasterize_cuda_kernel.cu')]) DEFAULT_IMAGE_SIZE = 256 DEFAULT_ANTI_ALIASING = True DEFAULT_NEAR = 0.1 DEFAULT_FAR = 100 DEFAULT_EPS = 1e-4 DEFAULT_BACKGROUND_COLOR = (0, 0, 0) class RasterizeFunction(Function): ''' Definition of differentiable rasterize operation Some parts of the code are implemented in CUDA Currently implemented only for cuda Tensors ''' @staticmethod def forward(ctx, faces, textures, image_size, near, far, eps, background_color, return_rgb=False, return_alpha=False, return_depth=False): ''' Forward pass
import torch from torch.autograd import Function from torch.nn.modules.module import Module from torch.autograd import Variable import os from torch.autograd.function import once_differentiable torch_ver = torch.__version__[:3] if torch_ver=="0.4": from torch.utils.cpp_extension import load build_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),'../cppcuda/build/')) print('compiling/loading roi_align') roialign = load(name='roialign',sources=['lib/cppcuda/roi_align_binding.cpp', 'lib/cppcuda/roi_align_forward_cuda.cu', 'lib/cppcuda/roi_align_backward_cuda.cu'], build_directory=build_path,verbose=True) else: import cppcuda_cffi.roialign as roialign class RoIAlignFunction(Function): # def __init__(ctx, pooled_height, pooled_width, spatial_scale, sampling_ratio): # ctx.pooled_width = int(pooled_width) # ctx.pooled_height = int(pooled_height) # ctx.spatial_scale = float(spatial_scale) # ctx.sampling_ratio = int(sampling_ratio) # ctx.features_size = None # ctx.rois=None @staticmethod
from torch.utils.cpp_extension import load

# JIT-compile the extension from the C++ binding and its CUDA kernel.
# Source paths are relative to the current working directory.
calc_assoc_cuda = load(
    name='calc_assoc_cuda',
    sources=['calc_assoc_cuda.cpp', 'calc_assoc_cuda_kernel.cu'],
    verbose=True,
)

# Show what the compiled module exposes.
help(calc_assoc_cuda)
import torch.autograd as autograd import torch.cuda.comm as comm import torch.nn.functional as F from torch.autograd.function import once_differentiable from torch.utils.cpp_extension import load import os, time import functools from torch.autograd import Variable curr_dir = os.path.dirname(os.path.abspath(__file__)) _src_path = os.path.join(curr_dir, "src") _build_path = os.path.join(curr_dir, "build") os.makedirs(_build_path, exist_ok=True) rcca = load(name="rcca", extra_cflags=["-O3"], build_directory=_build_path, verbose=True, sources = [os.path.join(_src_path, f) for f in [ "lib_cffi.cpp", "ca.cu" ]], extra_cuda_cflags=["--expt-extended-lambda"]) def _check_contiguous(*args): if not all([mod is None or mod.is_contiguous() for mod in args]): raise ValueError("Non-contiguous input") class CA_Weight(autograd.Function): @staticmethod def forward(ctx, t, f): # Save context n, c, h, w = t.size() size = (n, h+w-1, h, w)
#Copyright (c) Facebook, Inc. and its affiliates. #All rights reserved. #This source code is licensed under the license found in the #LICENSE file in the root directory of this source tree. import torch from torch.utils.cpp_extension import load cd = load(name="cd", sources=[ "../third_party_code/chamfer_distance.cpp", "../third_party_code/chamfer_distance.cu" ]) class ChamferDistanceFunction(torch.autograd.Function): @staticmethod def forward(ctx, xyz1, xyz2): batchsize, n, _ = xyz1.size() _, m, _ = xyz2.size() xyz1 = xyz1.contiguous() xyz2 = xyz2.contiguous() dist1 = torch.zeros(batchsize, n) dist2 = torch.zeros(batchsize, m) idx1 = torch.zeros(batchsize, n, dtype=torch.int) idx2 = torch.zeros(batchsize, m, dtype=torch.int) dist1 = dist1.cuda() dist2 = dist2.cuda() idx1 = idx1.cuda() idx2 = idx2.cuda()
from torch.nn import functional as F from torch.autograd import Function from torch.utils.cpp_extension import load module_path = os.path.dirname(__file__) gpu_name = "".join(torch.cuda.get_device_name().split(" ")) build_dir = os.path.join( module_path, ".build_cache_{}_PT{}_cu{}_gpu{}".format(socket.gethostname(), torch.__version__, torch.version.cuda, gpu_name)) if not os.path.exists(build_dir): os.makedirs(build_dir) upfirdn2d_op = load( "upfirdn2d", sources=[ os.path.join(module_path, "upfirdn2d.cpp"), os.path.join(module_path, "upfirdn2d_kernel.cu"), ], build_directory=build_dir, ) class UpFirDn2dBackward(Function): @staticmethod def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size): up_x, up_y = up down_x, down_y = down g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1)
from os import path import torch import torch.autograd as autograd import torch.cuda.comm as comm from torch.autograd.function import once_differentiable from torch.utils.cpp_extension import load from torch import nn from torch.nn import functional as F _src_path = path.dirname(path.abspath(__file__)) _ext = load( name="incenter_match_build", extra_cflags=["-O3"], sources=[path.join(_src_path, f) for f in [ "vanilla.cpp", "vanilla.cu", ]], extra_cuda_cflags=[ "--expt-extended-lambda -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__" ], ) def _check_contiguous(*args): if not all([mod is None or mod.is_contiguous() for mod in args]): raise ValueError("Non-contiguous input") # weight : N,S,H,W class Vanilla_Weight(autograd.Function): @staticmethod
from os import path import torch.autograd as autograd import torch.cuda.comm as comm from torch.autograd.function import once_differentiable from torch.utils.cpp_extension import load _src_path = path.join(path.dirname(path.abspath(__file__)), "src") _backend = load(name="inplace_abn", extra_cflags=["-O3"], sources=[path.join(_src_path, f) for f in [ "inplace_abn.cpp", "inplace_abn_cpu.cpp", "inplace_abn_cuda.cu" ]], extra_cuda_cflags=["--expt-extended-lambda"]) # Activation names ACT_LEAKY_RELU = "leaky_relu" ACT_ELU = "elu" ACT_NONE = "none" def _check(fn, *args, **kwargs): success = fn(*args, **kwargs) if not success: raise RuntimeError("CUDA Error encountered in {}".format(fn)) def _broadcast_shape(x): out_size = []
from torch.utils.cpp_extension import load

# JIT-compile the single-file C++ extension (path relative to the working
# directory) and bind the resulting module.
lltm_cpp = load("lltm_cpp", ["lltm.cpp"], verbose=True)

# Show what the compiled module exposes.
help(lltm_cpp)
import os import torch from torch.nn import functional as F from torch.autograd import Function from torch.utils.cpp_extension import load module_path = os.path.dirname(__file__) upfirdn2d_op = load( "upfirdn2d", sources=[ os.path.join(module_path, "upfirdn2d.cpp"), os.path.join(module_path, "upfirdn2d_kernel.cu"), ], ) class UpFirDn2dBackward(Function): @staticmethod def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size): up_x, up_y = up down_x, down_y = down g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) grad_input = upfirdn2d_op.upfirdn2d( grad_output, grad_kernel,
# tf.backends.cudnn.enabled = False #tf.backends.cudnn.benchmark = True MULTIPLIER = tf.cuda.device_count() else: MULTIPLIER = 1 #if (hvd.rank() == 0): if (True): f_out = open("./PREDICT.OUT", "w") f_out.close() FREEZE_MODEL = tf.load("./freeze_model.pytorch", map_location=device) """Load coordinates, sym_coordinates, energy, force, type, n_atoms and parameters""" script_path = sys.path[0] if (device != tf.device('cpu')): comput_descrpt_and_deriv = load(name="test_from_cpp", sources=[script_path + "/comput_descrpt_deriv.cu"], verbose=True) else: comput_descrpt_and_deriv = load(name="test_from_cpp", sources=[script_path + "/comput_descrpt_deriv.cpp", script_path + "/../../c/Utilities.cpp"], verbose=True, extra_cflags=["-fopenmp", "-O2"]) parameters_from_bin = FREEZE_MODEL['parameters'] parameters_from_file = Parameters() read_parameters_flag = read_parameters(parameters_from_file) parameters_from_file_adapt_bin = Parameters() read_parameters_flag = read_parameters(parameters_from_file_adapt_bin) parameters = parameters_from_file print("All parameters:") print(parameters) COORD_Reshape_tf, SYM_COORD_Reshape_tf, ENERGY_tf, FORCE_Reshape_tf, N_ATOMS_tf, TYPE_Reshape_tf, NEI_IDX_Reshape_tf, \ NEI_COORD_Reshape_tf, FRAME_IDX_tf, SYM_COORD_DX_Reshape_tf, SYM_COORD_DY_Reshape_tf, SYM_COORD_DZ_Reshape_tf, \ N_ATOMS_ORI_tf, NEI_TYPE_Reshape_tf= read_and_init_bin_file(parameters_from_file, default_dtype=default_dtype, is_predict=1)
import os import torch from torch import nn from torch.autograd import Function from torch.utils.cpp_extension import load module_path = os.path.dirname(__file__) fused = load( 'fused', sources=[ os.path.join(module_path, 'fused_bias_act.cpp'), os.path.join(module_path, 'fused_bias_act_kernel.cu'), ], ) class FusedLeakyReLUFunctionBackward(Function): @staticmethod def forward(ctx, grad_output, out, negative_slope, scale): ctx.save_for_backward(out) ctx.negative_slope = negative_slope ctx.scale = scale empty = grad_output.new_empty(0) grad_input = fused.fused_bias_act(grad_output, empty, out, 3, 1, negative_slope, scale) dim = [0]
import warnings import os from torch.utils.cpp_extension import load warnings.warn("Unable to load pointops_cuda cpp extension.") pointops_cuda_src = os.path.join(os.path.dirname(__file__), "../src") pointops_cuda = load('pointops_cuda', [ pointops_cuda_src + '/pointops_api.cpp', pointops_cuda_src + '/ballquery/ballquery_cuda.cpp', pointops_cuda_src + '/ballquery/ballquery_cuda_kernel.cu', pointops_cuda_src + '/knnquery/knnquery_cuda.cpp', pointops_cuda_src + '/knnquery/knnquery_cuda_kernel.cu', pointops_cuda_src + '/knnquery_heap/knnquery_heap_cuda.cpp', pointops_cuda_src + '/knnquery_heap/knnquery_heap_cuda_kernel.cu', pointops_cuda_src + '/grouping/grouping_cuda.cpp', pointops_cuda_src + '/grouping/grouping_cuda_kernel.cu', pointops_cuda_src + '/grouping_int/grouping_int_cuda.cpp', pointops_cuda_src + '/grouping_int/grouping_int_cuda_kernel.cu', pointops_cuda_src + '/interpolation/interpolation_cuda.cpp', pointops_cuda_src + '/interpolation/interpolation_cuda_kernel.cu', pointops_cuda_src + '/sampling/sampling_cuda.cpp', pointops_cuda_src + '/sampling/sampling_cuda_kernel.cu', pointops_cuda_src + '/labelstat/labelstat_cuda.cpp', pointops_cuda_src + '/labelstat/labelstat_cuda_kernel.cu', pointops_cuda_src + '/featuredistribute/featuredistribute_cuda.cpp', pointops_cuda_src + '/featuredistribute/featuredistribute_cuda_kernel.cu' ], build_directory=pointops_cuda_src, verbose=False) class FurthestSampling(Function): @staticmethod def forward(ctx, xyz, m):
def train_MNIST(config):
    """Train an MLP generator on MNIST with alternating OTS / FIT phases.

    Each epoch first runs the optimal-transport solver (OTS), which updates
    the dual variable ``psi`` and records latent batches together with the
    indices of their matched MNIST samples, and then the fitting phase (FIT),
    which trains the generator towards the recorded matches.

    Args:
        config (dict): experiment configuration. Keys read in this function:
            'dist', 'gpu', 'data_root', 'num_epochs', 'mem_size', 'z_dim',
            'random', 'fixed' and 'save_root'. The keys 'dset_size',
            'imsize', 'out_dim', 'batch_size', 'sample_size' and 'early_end'
            are overwritten here, and the dict is then replaced by the
            return value of ``utils.update_with_labels``.

    Returns:
        None. Samples and statistics are written through ``utils``.
    """
    # Create python version of cpp operation
    if config['dist'] == 'W1':
        print("Building C++ extension for W1 (requires PyTorch >= 1.0.0)...")
        from torch.utils.cpp_extension import load
        # load() returns the compiled module itself. A follow-up
        # `import my_ops` is redundant and can fail, because the JIT build is
        # not guaranteed to be importable by name.
        my_ops = load(name="my_ops",
                      sources=[
                          "W1_extension/my_ops.cpp",
                          "W1_extension/my_ops_kernel.cu"
                      ],
                      verbose=False)
        print("Building complete")

    # Centralize stats logging
    times, losses, hist_dict, checkpt = utils.centralized_logs()

    # Select device
    device = torch.device(config['gpu'])

    # Get the MNIST training data as a single batch of data
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, ), (0.5, ))])
    dataset = dset.MNIST(root=config['data_root'],
                         train=True,
                         download=True,
                         transform=transform)

    def get_data(dataset):
        # One batch spanning the whole dataset; return its images only.
        full_dataloader = torch.utils.data.DataLoader(dataset,
                                                      batch_size=len(dataset))
        for y_batch, l_batch in full_dataloader:
            return y_batch

    y_t = get_data(dataset)
    n_dim = len(y_t)
    mnist_data = y_t.view(n_dim, -1).to(device)

    # Update config dict with MNIST params
    config.update({
        'dset_size': n_dim,
        'imsize': 28,
        'out_dim': 784,
        'batch_size': 64,
        'sample_size': 16,
        'early_end': (200, 320)
    })

    # Set MLP architecture
    G_arch = utils.get_mlp_arch(config)

    # Create G_model
    G_mlp = utils.get_model(G_arch, device)

    # Get optimizer
    opt_G = utils.get_optim(G_arch, G_mlp.parameters(), MNIST=True)
    print(G_mlp)

    # Initialize G_model weights
    def initialize_weights(net):
        for m in net.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.02)
                if hasattr(m, "bias") and m.bias is not None:
                    m.bias.data.zero_()

    initialize_weights(G_mlp)

    # Create labels for experiment
    labels = utils.create_labels(config)

    # Update config with labels and save locations
    config = utils.update_with_labels(config, labels)

    # Setup psi optimizer
    psi = torch.zeros(n_dim, requires_grad=True, device=device)
    opt_psi = torch.optim.Adam([psi], lr=1e-1)

    # Set fixed noise vector for testing.
    # Tensor.to() is not in-place, so its result has to be kept.
    z_fixed = utils.get_z(config, device, sample=False)
    z_fixed = z_fixed.resize_((config['batch_size'], config['z_dim'])).to(device)

    # Training loop
    for epoch in range(config['num_epochs']):
        epoch_start_time = time.time()

        # Save list of losses for end-training determination
        loss_memory = []

        # Set up memory tensors: simple feed-forward distribution, transfer plan
        mu = torch.zeros(config['mem_size'], config['batch_size'],
                         config['z_dim'])
        transfer = torch.zeros(config['mem_size'],
                               config['batch_size'],
                               dtype=torch.long)
        mem_idx = 0

        # Compute Optimal Transport Solver (OTS)
        ot_start = time.time()
        for ots_iter in range(1, 20001):
            opt_psi.zero_grad()

            # Generate samples from feed-forward distribution
            z_batch = utils.get_z(config, device, sample=False)
            z_batch = z_batch.resize_(
                (config['batch_size'], config['z_dim'])).to(device)
            y_fake = G_mlp(z_batch)  # [B, n_dim]

            # Compute cost between sample batch and target distribution
            if (config['dist'] == 'W1'):
                score = -my_ops.l1_t(y_fake, mnist_data) - psi
            else:
                score = torch.matmul(
                    y_fake, mnist_data.t()) - psi  # score: [B, N], psi: [N]
            phi, hit = torch.max(score, 1)

            # Wasserstein distance computation: d(x,y)^p
            if (config['dist'] == 'W1'):
                loss_primal = torch.mean(
                    torch.abs(y_fake - mnist_data[hit])) * config['out_dim']
            else:
                loss_primal = torch.mean(
                    (y_fake - mnist_data[hit])**2) * config['out_dim']

            # Loss computation
            loss = -torch.mean(psi[hit])

            # Backprop
            loss.backward()

            # Gradient ascent
            opt_psi.step()

            # Append losses to dict
            losses['ot_loss'].append(loss.item())
            losses['w2_estim'].append(loss_primal.item())

            # Update memory tensors
            mu[mem_idx] = z_batch
            transfer[mem_idx] = hit
            mem_idx = (mem_idx + 1) % config['mem_size']

            if (ots_iter % 500 == 0):
                print('OTS Iteration {} | Epoch {}'.format(ots_iter, epoch))

            if (ots_iter % 2000 == 0):
                # Display histogram stats
                hist_dict, stop = utils.update_histogram(
                    transfer, n_dim, epoch, ots_iter, config, losses, hist_dict)

                # Empirical stopping criterion
                if stop:
                    break

        # Compute OTS time and append
        ot_end = time.time()
        times['ot_time'].append(ot_end - ot_start)

        # Compute Fitting Optimal Transport Plan (FIT)
        fit_start = time.time()
        for fit_iter in range(config['mem_size']):
            opt_G.zero_grad()

            # Get stored batch of generated samples
            z_batch = mu[fit_iter].to(device)
            y_fake = G_mlp(z_batch)  # G'(z)

            # Get Transfer plan from OTS: T(G_{t-1}(z))
            y0_hit = mnist_data[transfer[fit_iter].to(device)]

            # Compute Wasserstein distance between G and T
            if (config['dist'] == 'W1'):
                loss_g = torch.mean(
                    torch.abs(y0_hit - y_fake)) * config['out_dim']
            else:
                loss_g = torch.mean((y0_hit - y_fake)**2) * config['out_dim']

            # Backprop
            loss_g.backward()

            # Gradient descent
            opt_G.step()

            # Append losses to dict
            losses['g_loss'].append(loss_g.item())
            loss_memory.append(loss_g.item())

            if (fit_iter % 500 == 0):
                print(
                    'Fit_iter: {} | Epoch: {} | Loss: {:.2f} | Best Loss: {:.2f}'
                    .format(fit_iter, epoch, loss_g, checkpt['best']))

            # Get random sample from G
            if (fit_iter % 1000 == 0):
                z_rand = utils.get_z(config, device, sample=True)
                z_rand = z_rand.resize_(
                    (config['sample_size'], config['z_dim'])).to(device)
                sample = G_mlp(z_rand).view(-1, 1, config['imsize'],
                                            config['imsize'])
                utils.save_sample(sample, epoch, fit_iter, config['random'])

            # Check if loss is changing - stop training if no change
            if (len(loss_memory) > (config['mem_size'] // 2)):
                memory_mean = mean(loss_memory)
                if ((loss_g <= (memory_mean*.999)) and
                        (loss_g >= (memory_mean*.995))):
                    break

        # Compute FIT time
        fit_end = time.time()
        times['fit_time'].append(fit_end - fit_start)

        # Compute epoch time
        times['epoch_times'].append(time.time() - epoch_start_time)

        # Output to terminal
        print('Best loss: {}'.format(checkpt['best']))
        print('Epoch_time: {}'.format(time.time() - epoch_start_time))
        print('Num epochs: {}'.format(epoch))
        print("FIT loss: {:.2f}".format(np.mean(losses['g_loss'])))

        # Save fixed sample at end of training epoch
        sample = G_mlp(z_fixed).view(-1, 1, config['imsize'], config['imsize'])
        utils.save_sample(sample, epoch, 0, config['fixed'])

    # Save training data to csv after training completion
    utils.save_stats(times, losses, hist_dict, G_arch, config['save_root'])
import math import torch import torch.nn as nn from torch.utils.cpp_extension import load tr_cuda = load('tr_cuda', ['kernels/tr_cuda.cpp', 'kernels/tr_cuda_kernel.cu']) def hese(number): ''' Applies HESE encoding on a number. Returns the power-of-two exponents in the encoding. ''' char_number = bin(number).split('b')[1] if bin(number)[0] == '-': sign = -1 else: sign = 1 char_number = '0' + char_number + '0' char_number = char_number[::-1] exponents = [] for i in range(len(char_number) - 1): b1 = char_number[i] b2 = char_number[i + 1] if b1 == b2: continue if b1 == '0': exponents.append(-sign * 2**i) else: exponents.append(sign * 2**i)
import torch.autograd
from torch.autograd import Function
from torch.utils.cpp_extension import load
import os

# Source paths are resolved against the process working directory.
base_path = os.getcwd()

# JIT-compile the forward kernel, backward kernel and C++ binding.
line_variance_parallel = load(
    name="line_variance_parallel",
    sources=[
        os.path.join(
            base_path,
            "layers/DefGrid/variance_function_atom/line_distance_func_parallel/"
            + source_name)
        for source_name in (
            "variance_line_distance_for.cu",
            "variance_line_distance_back.cu",
            "variance_line_distance.cpp",
        )
    ],
    verbose=True)

# Use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

############################################
eps = 1e-8
debug = False
############################################
Just be sure to import torch first, as this will resolve some symbols that the dynamic linker must see; """ import math import torch import time """ method 1: Building with setuptools # Our module! from build.lib import lltm_cpp """ """ JIT Compiling Extensions: just in time, JIT """ from torch.utils.cpp_extension import load lltm_cpp = load( name="lltm_cpp", sources=["lltm.cpp"], #verbose = False verbose=True) class LLTMFunction(torch.autograd.Function): @staticmethod def forward(ctx, input, weights, bias, old_h, old_cell): outputs = lltm_cpp.forward(input, weights, bias, old_h, old_cell) new_h, new_cell = outputs[:2] variables = outputs[1:] + [weights] ctx.save_for_backward(*variables) return new_h, new_cell @staticmethod
import torch from torch.autograd import Function from torch.nn import Module from torch.utils.cpp_extension import load lib = load( name="depthflowprojection_cuda", sources=[ "DAIN/helper/DepthFlowProjection/depthflowprojection_cuda.cc", "DAIN/helper/DepthFlowProjection/depthflowprojection_cuda_kernel.cu" ], verbose=True, ) class DepthFlowProjectionLayer(Function): def __init__(self, requires_grad): super(DepthFlowProjectionLayer, self).__init__() @staticmethod def forward(ctx, input1, input2, requires_grad): assert input1.is_contiguous() assert input2.is_contiguous() fillhole = 1 if requires_grad == False else 0 if input1.is_cuda: count = (torch.cuda.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()) output = torch.cuda.FloatTensor().resize_(input1.size()).zero_() err = lib.DepthFlowProjectionLayer_gpu_forward(
from os import path
import torch
import torch.distributed as dist
import torch.autograd as autograd
import torch.cuda.comm as comm
from torch.autograd.function import once_differentiable
from torch.utils.cpp_extension import load

# The extension sources sit in a "src" directory next to this module.
_src_path = path.join(path.dirname(path.abspath(__file__)), "src")

# Compile and load the "inplace_abn" extension at import time.
_backend = load(
    name="inplace_abn",
    extra_cflags=["-O3"],
    sources=[
        path.join(_src_path, source_name)
        for source_name in (
            "inplace_abn.cpp",
            "inplace_abn_cpu.cpp",
            "inplace_abn_cuda.cu",
            "inplace_abn_cuda_half.cu",
        )
    ],
    extra_cuda_cflags=["--expt-extended-lambda"],
)

# Activation names
ACT_RELU = "relu"
ACT_LEAKY_RELU = "leaky_relu"
ACT_ELU = "elu"
ACT_NONE = "none"


def _check(fn, *args, **kwargs):
    """Call ``fn`` with the given arguments and fail loudly on a falsy result.

    Args:
        fn: Callable to invoke; its return value is used as a success flag.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Raises:
        RuntimeError: If ``fn`` returns a falsy value.
    """
    if fn(*args, **kwargs):
        return
    raise RuntimeError("CUDA Error encountered in {}".format(fn))
from pathlib import Path
import os
import sys

# Directory containing this module (and nppig.cpp) and the build workspace.
_srcdir = Path(__file__).resolve().parent
_build_dir = Path.home() / "tmp"

from torch.utils.cpp_extension import load, verify_ninja_availability

try:
    verify_ninja_availability()
except RuntimeError:
    # ninja was not found on PATH: also search the directory holding the
    # running interpreter. Only RuntimeError (what the check raises) is
    # handled, so KeyboardInterrupt/SystemExit are no longer swallowed.
    os.environ['PATH'] = (
        str(Path(sys.executable).parent) + os.pathsep + os.environ['PATH']
    )

print("Compiling npp extension")
if (_build_dir / "lock").exists():
    # A leftover lock file makes the build wait for it to disappear.
    print("Warning: found %s, compilation may hang here" %
          (_build_dir / "lock"))

# A caller-supplied build directory is not created by load(); make sure it
# exists so the build does not fail on a fresh machine.
_build_dir.mkdir(parents=True, exist_ok=True)

nppig_cpp = load(verbose=False,
                 name="nppig_cpp",
                 sources=[_srcdir / "nppig.cpp"],
                 extra_ldflags=['-lnppc', '-lnppig'],
                 with_cuda=True,
                 build_directory=_build_dir)
print("done")
from os import path
import torch.autograd as autograd
import torch.cuda.comm as comm
from torch.autograd.function import once_differentiable
from torch.utils.cpp_extension import load

# The extension sources sit in a "src" directory next to this module.
_src_path = path.join(path.dirname(path.abspath(__file__)), "src")

# Compile and load the "inplace_abn" extension at import time.
_backend = load(
    name="inplace_abn",
    # Previously: extra_cflags=["-O3"]
    extra_cflags=["-O3 -D_GLIBCXX_USE_CXX11_ABI=0"],
    sources=[
        path.join(_src_path, source_name)
        for source_name in (
            "inplace_abn.cpp",
            "inplace_abn_cpu.cpp",
            "inplace_abn_cuda.cu",
        )
    ],
    extra_cuda_cflags=["--expt-extended-lambda"],
)

# Activation names
ACT_RELU = "relu"
ACT_LEAKY_RELU = "leaky_relu"
ACT_ELU = "elu"
ACT_NONE = "none"


def _check(fn, *args, **kwargs):
    """Call ``fn`` with the given arguments and fail loudly on a falsy result.

    Args:
        fn: Callable to invoke; its return value is used as a success flag.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Raises:
        RuntimeError: If ``fn`` returns a falsy value.
    """
    if fn(*args, **kwargs):
        return
    raise RuntimeError("CUDA Error encountered in {}".format(fn))
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast from torch.utils.cpp_extension import load from lib.extensions.syncbn.comm import SyncMaster torch_ver = torch.__version__[:3] print('compiling/loading syncbn') build_path = '/tmp/bulid/syncbn' if not os.path.exists(build_path): os.makedirs(build_path) syncbn = load(name='syncbn', sources=[ 'lib/extensions/syncbn/src/operator.cpp', 'lib/extensions/syncbn/src/syncbn_kernel.cu' ], build_directory=build_path, verbose=True) def sum_square(input): r"""Calculate sum of elements and sum of squares for Batch Normalization""" return _sum_square.apply(input) class _sum_square(Function): @staticmethod def forward(ctx, input): ctx.save_for_backward(input) if input.is_cuda:
import torch from torch.utils.cpp_extension import load cd = load( name="cd", sources=[ "/home/user/point-normals-upsampling/pyTorchChamferDistance/chamfer_distance/chamfer_distance.cpp", "/home/user/point-normals-upsampling/pyTorchChamferDistance/chamfer_distance/chamfer_distance.cu" ]) class ChamferDistanceFunction(torch.autograd.Function): @staticmethod def forward(ctx, xyz1, xyz2): batchsize, n, _ = xyz1.size() _, m, _ = xyz2.size() xyz1 = xyz1.contiguous() xyz2 = xyz2.contiguous() dist1 = torch.zeros(batchsize, n) dist2 = torch.zeros(batchsize, m) idx1 = torch.zeros(batchsize, n, dtype=torch.int) idx2 = torch.zeros(batchsize, m, dtype=torch.int) if not xyz1.is_cuda: cd.forward(xyz1, xyz2, dist1, dist2, idx1, idx2) else: dist1 = dist1.cuda() dist2 = dist2.cuda() idx1 = idx1.cuda()
# Uses code from https://github.com/cooooorn/Pytorch-XNOR-Net from modules.base import * from torch.nn.parameter import Parameter from torch.utils.cpp_extension import load boolop_cuda = load(name="boolop_cuda", sources=[ "extensions/booleanOperations.cpp", "extensions/booleanOperationsCuda.cu" ]) from extensions import booleanOperations import cupy class AsType(nn.Module): def __init__(self, dtype): super(AsType, self).__init__() self.dtype = dtype def forward(self, x): return x.type(self.dtype) class ExtractBits(nn.Module): def __init__(self, dtype): nn.Module.__init__(self) self.dtype = dtype self.bitlength = torch.iinfo(dtype).bits self.mask = 2**torch.arange(self.bitlength, dtype=dtype) def forward(self, input):
from torch.utils.cpp_extension import load

# Compile and load the extension at import time from the .cpp and .cu sources
# (both paths are relative to the current working directory).
lltm_cuda = load(
    name='lltm_cuda',
    sources=['lltm_cuda.cpp', 'lltm_cuda_kernel.cu'],
    verbose=True,
)

# Show the loaded module's help text.
help(lltm_cuda)