def bench(batch_size: int, d: int, hw: int, num_iter: int):
    if not torch.cuda.is_available():
        print("GPU is not available")
        return
    device = torch.device('cuda:0')
    torch.set_grad_enabled(False)
    # BxDxHxWx2
    inp = torch.randn(batch_size, d, hw, hw, 2, device=device)

    # warmup (torch.fft/torch.ifft are the legacy complex-FFT functions,
    # removed in PyTorch 1.8 in favor of the torch.fft module)
    outp = torch.fft(inp, 3)
    inp_ = torch.ifft(outp, 3)

    # fft
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    with contexttimer.Timer() as t:
        for it in range(num_iter):
            outp = torch.fft(inp, 3)
    end.record()
    torch.cuda.synchronize()
    elapsed = start.elapsed_time(end) / 1e3
    tps = num_iter / elapsed
    fft_time_consume = elapsed

    del outp, inp
    outp = torch.randn(batch_size, d, hw, hw, 2, device=device)

    # ifft
    start.record()
    with contexttimer.Timer() as t:
        for it in range(num_iter):
            inp_ = torch.ifft(outp, 3)
    end.record()
    torch.cuda.synchronize()
    elapsed = start.elapsed_time(end) / 1e3
    itps = num_iter / elapsed
    ifft_time_consume = elapsed

    print(
        json.dumps({
            "TPS": tps,
            "fft_elapsed": fft_time_consume,
            "ITPS": itps,
            "ifft_elapsed": ifft_time_consume,
            "n": num_iter,
            "batch_size": batch_size,
            "D_size": d,
            "HW_size": hw,
        }))
def test_timer_print(self):
    def print_reversed(string):
        print(" ".join(reversed(string.split())))

    tests = [
        # (kwargs, expected_regex)
        ({'output': True}, r"took [0-9.]+ seconds"),
        ({'output': print_reversed}, r"seconds [0-9.]+ took"),
        ({'prefix': 'foo'}, r"foo took [0-9.]+ seconds"),
        ({'output': True, 'prefix': 'foo'}, r"foo took [0-9.]+ seconds"),
        ({'output': True, 'fmt': '{} seconds later...'}, r"[0-9.]+ seconds later..."),
    ]
    for kwargs, expected in tests:
        output = StringIO()
        with mock.patch('sys.stdout', new=output):
            with contexttimer.Timer(**kwargs):
                pass
        self.assertIsNotNone(output)
        self.assertRegexpMatches(output.getvalue(), expected)
def run_model(model, use_cuda, num_iter=50, use_profile=False):
    # warm up
    model()

    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    if use_profile:
        pr = cProfile.Profile()
        pr.enable()

    with contexttimer.Timer() as t:
        for it in range(num_iter):
            result = model()

    if use_profile:
        pr.disable()
        s = io.StringIO()
        sortby = SortKey.CUMULATIVE
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

    if use_cuda:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed / num_iter
    else:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed / num_iter
    return result, qps, time_consume
def benchmark_torch_jit(model: str, seq_len: int, batch_size: int, n: int,
                        num_threads: int):
    import transformers
    import contexttimer
    import torch.jit
    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)
    model = transformers.BertModel.from_pretrained(
        model)  # type: transformers.BertModel
    model.eval()
    cfg = model.config  # type: transformers.BertConfig

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)

    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        model(input_ids)
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads
        }))
def weightedsvd(W, A, rank, err, U=None, V=None):
    if U is None:
        U = np.random.randn(A.shape[0], rank).astype(A.dtype)
    if V is None:
        V = np.random.randn(A.shape[1], rank).astype(A.dtype)
    Wsqrt = np.sqrt(W)
    oldApprox = A.copy()
    with contexttimer.Timer() as timer:
        while True:
            for row_index in range(A.shape[0]):
                Wrow = np.diag(Wsqrt[row_index, :])
                U[row_index, :] = np.linalg.pinv(
                    Wrow @ V) @ (Wrow @ A[row_index, :])
            for row_index in range(A.shape[1]):
                Wrow = np.diag(Wsqrt[:, row_index])
                V[row_index, :] = np.linalg.pinv(
                    Wrow @ U) @ (Wrow @ A[:, row_index])
            newApprox = U @ V.T
            change = np.linalg.norm(oldApprox - newApprox)
            oldApprox = newApprox
            print("Change:", change, "Elapsed:", timer.elapsed)
            if change <= err:
                return U, V
def run_model(model, batch_size, seq_len, framework_name):
    # warmup
    model()
    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    with contexttimer.Timer() as t:
        for it in range(num_iter):
            model()
    end.record()
    torch.cuda.synchronize()
    torch_elapsed = start.elapsed_time(end) / 1e3
    ips = num_iter / torch_elapsed
    time_consume = torch_elapsed
    print(
        json.dumps({
            "IPS": ips,
            "elapsed": time_consume,
            "avg_elapsed": time_consume / num_iter,
            "iter": num_iter,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": framework_name,
        }))
def _get(self, endpoint):
    dev.debug("GET %s" % self._api_url(endpoint))
    # try to use the current bearer token; this could result in a 401 if the
    # token is expired
    with contexttimer.Timer() as timer:
        req = requests.get(self._api_url(endpoint),
                           headers={
                               'Authorization': 'Bearer {}'.format(self.token),
                               'Accept': 'application/json',
                           })
    log.info("GET {} - took {} sec".format(self._api_url(endpoint),
                                           timer.elapsed))

    if req.status_code == 401:
        # re-authenticate and repeat the request - failure here is final
        self._authenticate_for(req)
        req = requests.get(self._api_url(endpoint),
                           headers={
                               'Authorization': 'Bearer {}'.format(self.token),
                               'Accept': 'application/json',
                           })

    req.raise_for_status()
    data = req.json()
    dev.debug("GOT {}: {}".format(self._api_url(endpoint),
                                  pprint.pformat(data, indent=4)))
    return data
def prep(d, n_o, Xs):
    # Replicates LMC (runlmc.models.lmc) code minimally.
    with contexttimer.Timer() as exact:
        dists = scipy.spatial.distance.pdist(Xs.reshape(-1, 1))
        dists = scipy.spatial.distance.squareform(dists)
    with contexttimer.Timer() as apprx:
        grid, m = LMC._autogrid(Xs, lo=None, hi=None, m=None)
        grid_dists = grid - grid[0]
        interpolant = multi_interpolant(Xs, grid)
        interpolantT = interpolant.transpose().tocsr()
    print()
    print('preparation time (once per optimization)')
    print('    {:8.4f} sec exact - pairwise distances (for dense approaches)'
          .format(exact.elapsed))
    print('    {:8.4f} sec apprx - linear interpolation (for approximations)'
          .format(apprx.elapsed))
    return dists, grid_dists, interpolant, interpolantT
def check_grads(f, name):
    with contexttimer.Timer() as t:
        exact_kgrad = f(exact)
    ngrad = sum(map(len, exact_kgrad))
    print('    {} gradients # {}'.format(name, ngrad))
    print('    {:10.4f} sec exact per gradient'.format(t.elapsed / ngrad))
    tot_exact_time = t.elapsed
    with contexttimer.Timer() as t:
        apprx_kgrad = f(apprx)
    assert ngrad == sum(map(len, apprx_kgrad))
    print('    {:10.4f} sec apprx per gradient'.format(t.elapsed / ngrad))
    tot_apprx_time = t.elapsed
    exact_kgrad = np.hstack(exact_kgrad)
    apprx_kgrad = np.hstack(apprx_kgrad)
    err = exact_kgrad - apprx_kgrad
    print('    {:9.4e} avg grad error'.format(np.fabs(err).mean()))
    return err, tot_exact_time, tot_apprx_time, exact_kgrad
def _impl_(model_name: str,
           seq_len: int,
           batch_size: int,
           n: int,
           num_threads: int = 1):
    import multiprocessing
    import os
    temp_fn = "/tmp/temp_onnx.model"
    p = multiprocessing.Pool(1)
    vocab_size = p.apply(generate_onnx_model,
                         args=(model_name, temp_fn, seq_len, batch_size,
                               backend))
    p.close()
    import contexttimer
    import onnxruntime.backend
    import onnx
    import numpy
    import json
    if not onnxruntime.backend.supports_device(backend):
        raise RuntimeError(
            f"onnxruntime does not support {backend}, recompile it!")

    os.environ['OMP_NUM_THREADS'] = str(num_threads)
    os.environ['MKL_NUM_THREADS'] = str(num_threads)

    model = onnx.load_model(f=temp_fn)
    model = onnxruntime.backend.prepare(
        model=model,
        device=backend,
        graph_optimization_level=onnxruntime.GraphOptimizationLevel.
        ORT_ENABLE_ALL)
    input_ids = numpy.random.randint(low=0,
                                     high=vocab_size - 1,
                                     size=(batch_size, seq_len),
                                     dtype=numpy.int64)
    model.run(inputs=[input_ids])

    with contexttimer.Timer() as t:
        for _ in range(n):
            model.run(inputs=[input_ids])

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": f"onnx_rt_{backend}",
            "n_threads": num_threads
        }))
def log_probability_of_sentence(self, tokens: Sequence[str]):
    """
    Derived from _SampleModel in lm_1b_eval.py
    """
    if isinstance(tokens, str):
        raise ValueError(
            "Input to log_probability_of_sentence is a sequence of token strings,"
            " not a single string")

    # these don't matter when we are running the model in inference mode
    targets = np.zeros([_BATCH_SIZE, _NUM_TIMESTEPS], np.int32)
    weights = np.ones([_BATCH_SIZE, _NUM_TIMESTEPS], np.float32)

    # these contain information about the previous word
    # we initialize them with the beginning-of-sentence marker
    inputs = np.zeros([_BATCH_SIZE, _NUM_TIMESTEPS], np.int32)
    inputs[0, 0] = self._vocab.word_to_id(_START_SENTENCE_SYMBOL)
    char_ids_inputs = np.zeros(
        [_BATCH_SIZE, _NUM_TIMESTEPS, self._vocab.max_word_length], np.int32)
    char_ids_inputs[0, 0, :] = self._vocab.word_to_char_ids(
        _START_SENTENCE_SYMBOL)

    # we take the log probability of a token sequence to be the sum of the log-probs
    # of each of its tokens given the preceding context
    log_prob_sum = 0.0
    for token in tokens:
        with contexttimer.Timer() as token_timer:
            dist_over_next_words = self._session.run(
                self._name_to_node['softmax_out'],
                feed_dict={
                    self._name_to_node['char_inputs_in']: char_ids_inputs,
                    self._name_to_node['inputs_in']: inputs,
                    self._name_to_node['targets_in']: targets,
                    self._name_to_node['target_weights_in']: weights
                })
        token_idx = self._vocab.word_to_id(token)
        log_prob_sum += math.log(dist_over_next_words[0][token_idx])
        # prepare this word to be the context for the next word
        inputs[0, 0] = token_idx
        char_ids_inputs[0, 0, :] = self._vocab.word_to_char_ids(token)

    # restore original state so that future calls to log_probability_of_sentence
    # are not affected by past calls
    self._reset_state()
    return log_prob_sum
def load(*, graph_def_file: Path, checkpoint_file: Path,
         vocab: Union[Path, CharsVocabulary]) -> 'LM1B':
    with contexttimer.Timer() as loading_timer:
        resolved_vocab: CharsVocabulary
        if isinstance(vocab, CharsVocabulary):
            resolved_vocab = vocab
        else:
            resolved_vocab = CharsVocabulary(str(vocab), _MAX_WORD_LEN)

        ### copied from Tensorflow's model repo's lm_1b_eval.py
        with tf.Graph().as_default():
            logging.info('Recovering graph.')
            with tf.gfile.FastGFile(str(graph_def_file), 'r') as f:
                s = f.read()
                gd = tf.GraphDef()
                text_format.Merge(s, gd)

            tf.logging.info('Recovering Graph %s', graph_def_file)
            t = {}
            [
                t['states_init'], t['lstm/lstm_0/control_dependency'],
                t['lstm/lstm_1/control_dependency'], t['softmax_out'],
                t['class_ids_out'], t['class_weights_out'],
                t['log_perplexity_out'], t['inputs_in'], t['targets_in'],
                t['target_weights_in'], t['char_inputs_in'], t['all_embs'],
                t['softmax_weights'], t['global_step']
            ] = tf.import_graph_def(gd, {}, [
                'states_init', 'lstm/lstm_0/control_dependency:0',
                'lstm/lstm_1/control_dependency:0', 'softmax_out:0',
                'class_ids_out:0', 'class_weights_out:0',
                'log_perplexity_out:0', 'inputs_in:0', 'targets_in:0',
                'target_weights_in:0', 'char_inputs_in:0', 'all_embs_out:0',
                'Reshape_3:0', 'global_step:0'
            ], name='')

            logging.info('Recovering checkpoint %s\n', checkpoint_file)
            sess = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=True))
            sess.run('save/restore_all', {'save/Const:0': str(checkpoint_file)})
            sess.run(t['states_init'])
    logging.info("Loaded language model in %s seconds", loading_timer.elapsed)
    return LM1B(session=sess, name_to_node=t, vocab=resolved_vocab)
def _get(self, endpoint):
    dev.debug("GET %s" % self._api_url(endpoint))
    with contexttimer.Timer() as timer:
        req = requests.get(self._api_url(endpoint),
                           headers={
                               'Authorization': 'APIKey {}'.format(self.api_key_b64),
                               'Accept': 'application/json',
                           })
    log.info("GET {} - took {} sec".format(self._api_url(endpoint),
                                           timer.elapsed))

    req.raise_for_status()
    data = req.json()
    # dev.debug("GOT {}: {}".format(self._api_url(endpoint),
    #                               pprint.pformat(data, indent=4)))
    return data
def run_model(model, use_gpu, num_iter, batch_size, seq_len, framework_name,
              num_threads, enable_mem_opt, model_name):
    # warm up
    import torch
    import contexttimer
    import json
    model()
    if use_gpu:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    with contexttimer.Timer() as t:
        if enable_mem_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                batch_size,  # batch
                seq_len,  # seq_len
                model.config.num_attention_heads,
                model.config.hidden_size,
                model.config.num_hidden_layers,
                "GPU" if use_gpu else "CPU")
        for it in range(num_iter):
            model()

    if not use_gpu:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed
    else:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed

    print(
        json.dumps({
            "QPS": qps,
            "elapsed": time_consume,
            "n": num_iter,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": framework_name,
            "thread_num": num_threads,
            "model_name": model_name
        }))
def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
                        enable_random: bool, max_seq_len: int,
                        min_seq_len: int, num_threads: int, use_gpu: bool,
                        enable_mem_opt: bool):
    import transformers
    import contexttimer
    import torch.jit
    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)

    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        model(input_ids)
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads,
            "model_name": model_name
        }))
def forward(self, contrast_obj, fx_illu, fy_illu):
    # compute illumination
    with contexttimer.Timer() as timer:
        field, fx_illu, fy_illu, fz_illu = self._genIllumination(
            fx_illu, fy_illu)
        field[:, :] *= np.exp(1.0j * 2.0 * np.pi * fz_illu *
                              self.initial_z_position)
        field_layer_conj = af.constant(0.0,
                                       self.shape[0],
                                       self.shape[1],
                                       self.shape[2],
                                       dtype=af_complex_datatype)

        if (type(contrast_obj).__module__ == np.__name__):
            phasecontrast_obj_af = af.to_array(contrast_obj)
            flag_gpu_inout = False
        else:
            phasecontrast_obj_af = contrast_obj
            flag_gpu_inout = True

        # Binning
        obj_af = self._binObject(phasecontrast_obj_af)

        # Potentials to Transmittance
        obj_af = af.exp(1.0j * self.sigma * obj_af)

        for layer in range(self.shape[2]):
            field_layer_conj[:, :, layer] = af.conjg(field)
            field[:, :] *= obj_af[:, :, layer]
            if layer < self.shape[2] - 1:
                field = self._propagationInplace(
                    field, self.slice_separation[layer])

        cache = (obj_af, field_layer_conj, flag_gpu_inout)

        if self.focus_at_center:
            # propagate to volume center
            field = self._propagationInplace(field,
                                             self.distance_end_to_center,
                                             adjoint=True)
    return {'forward_scattered_field': field, 'cache': cache}
def bench_runlmc(num_runs, m, xss, yss, test_xss, test_yss, kgen, rgen,
                 slfmgen, indepgen, optimizer_opts, **kwargs):
    times, smses, nlpds = [], [], []
    for i in range(num_runs):
        ks = kgen()
        rs = rgen()
        slfm = slfmgen()
        indep = indepgen()
        lmc = LMC(xss,
                  yss,
                  kernels=ks,
                  ranks=rs,
                  slfm_kerns=slfm,
                  indep_gp=indep,
                  normalize=True,
                  m=m,
                  **kwargs)
        for i in range(lmc.nkernels['lmc']):
            print('LMC kernel', i, 'A matrix')
            print(eval('lmc.a{}'.format(i)).values)
            print('LMC kernel', i, 'kappa diag')
            print(eval('lmc.kappa{}'.format(i)).values)
        for i in range(lmc.nkernels['slfm']):
            i += lmc.nkernels['lmc']
            print('SLFM kernel', i, 'A matrix')
            print(eval('lmc.a{}'.format(i)).values)
        opt = AdaDelta(**optimizer_opts)
        with contexttimer.Timer() as t:
            lmc.optimize(optimizer=opt)
        times.append(t.elapsed)
        np.save(
            TMP + 'lmc-m{}-{}of{}-{}.npy'.format(m, i, num_runs,
                                                 sum(map(len, xss))),
            lmc.param_array)
        pred_yss, pred_vss = lmc.predict(test_xss)
        smses.append(smse(test_yss, pred_yss, yss))
        nlpds.append(nlpd(test_yss, pred_yss, pred_vss))
        print('time', times[-1], 'smse', smses[-1], 'nlpd', nlpds[-1])
    points = [times, smses, nlpds]
    stats = [(np.mean(x), np.std(x) / np.sqrt(len(x))) for x in points]
    return stats
def detect(self, test_sample: Sample) -> DetectionResult:
    """
    Performs anomaly detection with the model.

    :param alphai_watson.datasource.Sample test_sample: input sample to evaluate for anomaly
    :return alphai_watson.detective.DetectionResult: object containing detection verdict
    """
    logging.info("Running detector on {}".format(test_sample))
    test_data = test_sample.data.astype(np.float32)

    with contexttimer.Timer() as t:
        detection_array = self.model.run_discriminator(test_data)
    logging.info("Detection completed in {}".format(t.elapsed))

    return DetectionResult(
        data=detection_array,
        n_timesteps_in_chunk=test_sample.number_of_timesteps,
        original_sample_rate=test_sample.sample_rate)
def predict(body):
    """
    Predict train delays.

    :param body: the request body
    :return: prediction response object
    """
    with ct.Timer() as t:
        pred = clf.predict(body).tolist()

    resp = {
        'metadata': {
            'git_commit': get_git_commit(),
            'prediction_elapsed_time': t.elapsed,
            'model_sha256': get_model_hash()
        },
        'prediction': pred
    }
    return resp
def run_model(model, use_cuda, num_iter, batch_size, seq_len, framework_name,
              thread_num=1):
    # warm up
    import torch
    import contexttimer
    import json
    model()
    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    with contexttimer.Timer() as t:
        for it in range(num_iter):
            model()

    if not use_cuda:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed
    else:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed

    print(
        json.dumps({
            "QPS": qps,
            "elapsed": time_consume,
            "n": num_iter,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": framework_name,
            "thread_num": thread_num,
        }))
def run_model(model, use_cuda, num_iter=50, use_profile=False):
    # warm up
    model()

    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    with contexttimer.Timer() as t:
        for it in range(num_iter):
            result = model()

    if use_cuda:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed / num_iter
    else:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed / num_iter
    return result, qps, time_consume
def _get(self, endpoint):
    url = '{0}/v2/{1}'.format(self.url, endpoint)
    log.debug("GET {}".format(url))

    # Try to use previous bearer token
    with contexttimer.Timer() as timer:
        r = requests.get(url, auth=self.auth, verify=self.verify_ssl)
    log.info("GET {} - took {} sec".format(url, timer.elapsed))

    # If necessary, try to authenticate and try again
    if r.status_code == 401:
        self._authenticate_for(r)
        r = requests.get(url, auth=self.auth, verify=self.verify_ssl)

    data = r.json()
    if r.status_code != 200:
        raise RegistryError.from_data(data)

    log.debug("GOT {}: {}".format(url, pprint.pformat(data, indent=4)))
    return data
def forwardPredict(self, field=False):
    """
    Uses the current object in phase_obj_3d to predict the amplitude of the exit wave.
    Before calling, make sure the correct object is contained.
    """
    obj_gpu = af.to_array(self._x)  # self._x created in self.setScatteringMethod()
    with contexttimer.Timer() as timer:
        forward_scattered_predict = []
        for illu_idx in range(self.number_illum):
            fx_illu = self.fx_illu_list[illu_idx]
            fy_illu = self.fy_illu_list[illu_idx]
            fields = self._forwardMeasure(fx_illu, fy_illu, obj=obj_gpu)
            if field:
                forward_scattered_predict.append(
                    np.array(fields["forward_scattered_field"]))
            else:
                forward_scattered_predict.append(
                    np.abs(fields["forward_scattered_field"]))
            if self.number_illum > 1:
                print("illumination {:03d}/{:03d}.".format(
                    illu_idx, self.number_illum), end="\r")

        if len(forward_scattered_predict[0][0].shape) == 2:
            forward_scattered_predict = np.array(
                forward_scattered_predict).transpose(2, 3, 1, 0)
        elif len(forward_scattered_predict[0][0].shape) == 1:
            forward_scattered_predict = np.array(
                forward_scattered_predict).transpose(1, 2, 0)
    return forward_scattered_predict
def bench_runlmc(num_runs, m, xss, yss, test_xss, test_yss, kgen, rgen,
                 slfmgen, indepgen, optimizer_opts, **kwargs):
    times, smses, nlpds = [], [], []
    return_lmc = 'return_lmc' in kwargs
    if return_lmc:
        del kwargs['return_lmc']
    for i in range(num_runs):
        fk = FunctionalKernel(D=len(xss),
                              lmc_kernels=kgen(),
                              lmc_ranks=rgen(),
                              slfm_kernels=slfmgen(),
                              indep_gp=indepgen())
        lmc = InterpolatedLLGP(xss,
                               yss,
                               functional_kernel=fk,
                               normalize=True,
                               m=m,
                               **kwargs)
        opt = AdaDelta(**optimizer_opts)
        with contexttimer.Timer() as t:
            lmc.optimize(optimizer=opt)
        times.append(t.elapsed)
        np.save(
            TMP + 'lmc-m{}-{}of{}-{}.npy'.format(m, i, num_runs,
                                                 sum(map(len, xss))),
            lmc.param_array)
        pred_yss, pred_vss = lmc.predict(test_xss)
        smses.append(smse(test_yss, pred_yss, yss))
        nlpds.append(nlpd(test_yss, pred_yss, pred_vss))
        print('time', times[-1], 'smse', smses[-1], 'nlpd', nlpds[-1])
    points = [times, smses, nlpds]
    stats = [(np.mean(x), np.std(x) / np.sqrt(len(x))) for x in points]
    if return_lmc:
        return stats, lmc
    return stats
def run_variable_model(model, use_gpu, num_iter, max_seq_len, min_seq_len,
                       framework_name, num_threads, cfg):
    import torch
    import contexttimer
    import json
    import random
    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')

    request_list = []
    # make sure all benchmarking runtimes are using the same random distribution.
    random.seed(0)
    for i in range(num_iter):
        generated_seq_len = random.randint(min_seq_len, max_seq_len)
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(1, generated_seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        request_list.append(input_ids)

    # warm-up using the longest sequence
    # TODO(jiaruifang) We now recommend running a warm-up pass before inference.
    # In the future we will refactor the allocator so that warm-up is no longer needed.
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(1, max_seq_len),
                              dtype=torch.long,
                              device=test_device)
    model(input_ids)

    if enable_latency_plot:
        import time
        print(f"dump results to {framework_name}_latency_{num_threads}.txt")
        with open(f"{framework_name}_latency_{num_threads}.txt", "w") as of:
            result_list = []
            for request in request_list:
                if use_gpu:
                    start = torch.cuda.Event(enable_timing=True)
                    end = torch.cuda.Event(enable_timing=True)
                    start.record()

                with contexttimer.Timer() as t:
                    model(request)

                if not use_gpu:
                    qps = num_iter / t.elapsed
                    time_consume = t.elapsed
                else:
                    end.record()
                    torch.cuda.synchronize()
                    torch_elapsed = start.elapsed_time(end) / 1e3
                    qps = num_iter / torch_elapsed
                    time_consume = torch_elapsed
                print('.', end='', flush=True)
                result_list.append([len(request.view(-1)), time_consume])

            elapse = 0.
            result_list = sorted(result_list, key=lambda s: s[0])
            for item in result_list:
                of.write(f"{item[0]}, {item[1]}\n")
                elapse += item[1]
            print(f"elapsed {elapse} QPS {num_iter/elapse}")
    else:
        if use_gpu:
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()

        with contexttimer.Timer() as t:
            for request in request_list:
                model(request)

        if not use_gpu:
            qps = num_iter / t.elapsed
            time_consume = t.elapsed
        else:
            end.record()
            torch.cuda.synchronize()
            torch_elapsed = start.elapsed_time(end) / 1e3
            qps = num_iter / torch_elapsed
            time_consume = torch_elapsed

        print(
            json.dumps({
                "QPS": qps,
                "elapsed": time_consume,
                "n": num_iter,
                "max_seq_len": max_seq_len,
                "min_seq_len": min_seq_len,
                "framework": framework_name,
                "thread_num": num_threads,
            }))
def timer(*args, **kwargs):
    return contexttimer.Timer(*args, **kwargs, timer=time.perf_counter)
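# A minimal usage sketch for the wrapper above (not part of the original source):
# the returned contexttimer.Timer is used as an ordinary context manager and its
# `elapsed` attribute is read after the block exits. The time.sleep call is a
# hypothetical stand-in for real work.
import time

with timer() as t:  # `timer` is the wrapper defined above
    time.sleep(0.1)
print(f"block took {t.elapsed:.3f} s (wall clock via time.perf_counter)")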
import thread_tools
import time

print('imported from thread_tools: ', dir(thread_tools))

import contexttimer
from thread_tools import wait_loop_nogil

nloops = 500
with contexttimer.Timer(time.perf_counter) as pure_wall:
    with contexttimer.Timer(time.process_time) as pure_cpu:
        result = wait_loop_nogil(nloops)
print(f'pybind11 wall time {pure_wall.elapsed} and cpu time {pure_cpu.elapsed}')
# ### Create two functions, one to print thread and process ids, and one to run the wait_for loop
#
# * Important point -- the logging module is **threadsafe**
#
# * Submit 12 jobs queued on 3 threads

# %%
njobs = 12
nprocs = 3
thread_id_jobs = [(find_ids, [], {}) for i in range(nprocs)]
nloops = 5250
calc_jobs = [(wait_loop_nogil, [nloops], {}) for i in range(njobs)]
print(calc_jobs)

# %%
with contexttimer.Timer(time.perf_counter) as wall:
    with contexttimer.Timer(time.process_time) as cpu:
        with Parallel(n_jobs=nprocs, backend="threading") as parallel:
            # parallel(thread_id_jobs)
            results = parallel(calc_jobs)
        print(results)
print(f"wall time {wall.elapsed} and cpu time {cpu.elapsed}")

# %% [markdown]
# * Each job was run on a different thread but in the same process
#
# * Note that the cpu time is larger than the wall time, confirming that we've released the GIL.

# %% [markdown]
# ### Now repeat this holding the GIL
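# %% [markdown]
# * A rough sketch of that GIL-holding comparison, reusing the objects from the cells
#   above (assumption: `thread_tools` also exports a GIL-holding counterpart, called
#   `wait_loop_withgil` here; substitute whatever name the module actually provides).
#   With the GIL held, the worker threads serialize, so wall time should grow with the
#   number of jobs and cpu time should be close to wall time instead of exceeding it.

# %%
from thread_tools import wait_loop_withgil  # assumed name for the GIL-holding variant

gil_jobs = [(wait_loop_withgil, [nloops], {}) for i in range(njobs)]
with contexttimer.Timer(time.perf_counter) as wall:
    with contexttimer.Timer(time.process_time) as cpu:
        with Parallel(n_jobs=nprocs, backend="threading") as parallel:
            results = parallel(gil_jobs)
print(f"wall time {wall.elapsed} and cpu time {cpu.elapsed}")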
# enable synchronous mode
vrep.simxSynchronous(client_id, True)

position_history = []
e, body_pos = vrep.simxGetObjectPosition(client_id, body, -1,
                                         vrep.simx_opmode_buffer)
position_history.append(body_pos)

joint_pos_history = []
e, joint_0_pos = vrep.simxGetJointPosition(client_id, joint_0,
                                           vrep.simx_opmode_streaming)
e, joint_1_pos = vrep.simxGetJointPosition(client_id, joint_1,
                                           vrep.simx_opmode_streaming)
joint_pos_history.append([joint_0_pos, joint_1_pos])

with contexttimer.Timer() as timer:
    for i in range(4):
        e = vrep.simxSetJointTargetPosition(client_id, joint_0, 0.5,
                                            vrep.simx_opmode_streaming)
        for t in range(3):
            vrep.simxSynchronousTrigger(client_id)
            e, body_pos = vrep.simxGetObjectPosition(
                client_id, body, -1, vrep.simx_opmode_buffer)
            position_history.append(body_pos)
            e, joint_0_pos = vrep.simxGetJointPosition(
                client_id, joint_0, vrep.simx_opmode_buffer)
            e, joint_1_pos = vrep.simxGetJointPosition(
                client_id, joint_1, vrep.simx_opmode_buffer)
            joint_pos_history.append([joint_0_pos, joint_1_pos])
        e = vrep.simxSetJointTargetPosition(client_id, joint_1, 2.5,
                                            vrep.simx_opmode_streaming)
def compress(Xs, weights, bias, rank, elements_per_batch, snapshot_num=None):
    # Chosen Line Search Parameters
    alpha = 0.1
    beta = 0.5

    print("Setting up Problem")
    if snapshot_num is not None:
        f = np.load("snapshot_{}.npz".format(snapshot_num))
        U = f.get('arr_0')
        V = f.get('arr_1')
    else:
        # Seed the initial U, V with the SVD.
        Ufull, Sfull, Vfull = np.linalg.svd(weights, full_matrices=False)
        U = Ufull[:, :rank] @ np.diag(Sfull[:rank])
        V = Vfull[:rank, :]
        np.savez("snapshot_SVD.npz", U, V)

    for iteration in itertools.count(snapshot_num or 0):
        # Due to memory constraints, we must sub-sample the inputs.
        X = Xs[np.random.choice(Xs.shape[0], elements_per_batch)]
        print("Starting Iteration", iteration)
        gold = sigmoid(X @ weights + bias)
        current_guess = sigmoid(X @ U @ V + bias)
        diff = current_guess - gold
        current_value = np.linalg.norm(diff, 'fro')**2
        print("Current Status:", current_value)

        # applying the gradient of the sigmoid
        sig_gradient = (current_guess * (1 - current_guess)).ravel()

        with contexttimer.Timer() as tv:
            partialGradV = (sig_gradient * computeGradientAB(X @ U, V).T).T
        print("GradV:", tv.elapsed)
        print(np.linalg.norm(partialGradV, np.inf))

        with contexttimer.Timer() as tu:
            partialGradU = (sig_gradient * computeGradientABC(X, U, V).T).T
        print("GradU:", tu.elapsed)
        print(np.linalg.norm(partialGradU, np.inf))

        # stack into one large matrix
        stacked = np.concatenate([partialGradU, partialGradV], axis=1)

        with contexttimer.Timer() as tup:
            print("Starting LSTSQ")
            assert np.isfinite(diff).all()
            assert np.isfinite(stacked).all()
            update = splinalg.lstsq(stacked, -diff.ravel())[0]
            print("Finished LSTSQ")

        expected_value = np.linalg.norm(diff.ravel() + stacked @ update)**2
        expected_improvement = expected_value - current_value
        # This is negative, as we expect an improvement
        assert expected_improvement < 0
        print("Expected Improvement:", expected_improvement)

        t = 1
        while True:
            Uupdate = t * update[:U.size].reshape(U.shape)
            Vupdate = t * update[U.size:].reshape(V.shape)
            new_value = np.linalg.norm(
                sigmoid(X @ (U + Uupdate) @ (V + Vupdate) + bias) - gold,
                'fro')**2
            if new_value <= current_value + alpha * t * expected_improvement:
                print("Line search T:", t)
                print("New Value:", new_value)
                step_size = 1 / np.sqrt(iteration + 1)
                U += step_size * Uupdate
                V += step_size * Vupdate
                np.savez("snapshot_{}.npz".format(iteration), U, V)
                if np.linalg.norm(t * step_size * update) <= 1e-5:
                    return U, V
                break
            t *= beta
        print("Update:", tup.elapsed)