Example #1
import json

import contexttimer
import torch


def bench(batch_size: int, d: int, hw: int, num_iter: int):
    if not torch.cuda.is_available():
        print("GPU is not available")
        return

    device = torch.device('cuda:0')

    torch.set_grad_enabled(False)

    # B x D x H x W x 2 (trailing dim of size 2 holds real/imag parts for the pre-1.8 torch.fft API)
    inp = torch.randn(batch_size, d, hw, hw, 2, device=device)

    # warmup
    outp = torch.fft(inp, 3)
    inp_ = torch.ifft(outp, 3)

    # fft
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    with contexttimer.Timer() as t:
        for it in range(num_iter):
            outp = torch.fft(inp, 3)
    end.record()
    torch.cuda.synchronize()
    elapsed = start.elapsed_time(end) / 1e3
    tps = num_iter / elapsed
    fft_time_consume = elapsed

    del outp, inp

    outp = torch.randn(batch_size, d, hw, hw, 2, device=device)

    # ifft
    start.record()
    with contexttimer.Timer() as t:
        for it in range(num_iter):
            inp_ = torch.ifft(outp, 3)
    end.record()
    torch.cuda.synchronize()
    elapsed = start.elapsed_time(end) / 1e3
    itps = num_iter / elapsed
    ifft_time_consume = elapsed

    print(
        json.dumps({
            "TPS": tps,
            "fft_elapsed": fft_time_consume,
            "ITPS": itps,
            "ifft_elapsed": ifft_time_consume,
            "n": num_iter,
            "batch_size": batch_size,
            "D_size": d,
            "HW_size": hw,
        }))
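
The benchmark above relies on the pre-1.8 torch.fft(input, signal_ndim) and torch.ifft functions, which were removed in PyTorch 1.8. Below is a minimal sketch, assuming PyTorch >= 1.8, of an equivalent timed loop using the torch.fft module; tensor sizes and iteration count are illustrative.

import torch

device = torch.device('cuda:0')
# a complex dtype replaces the trailing real/imag dimension of size 2
inp = torch.randn(8, 16, 64, 64, dtype=torch.complex64, device=device)

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(100):
    outp = torch.fft.fftn(inp, dim=(-3, -2, -1))   # 3-D FFT over D, H, W
    inp_ = torch.fft.ifftn(outp, dim=(-3, -2, -1))
end.record()
torch.cuda.synchronize()
print("elapsed seconds:", start.elapsed_time(end) / 1e3)
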
Example #2
    def test_timer_print(self):
        def print_reversed(string):
            print(" ".join(reversed(string.split())))

        tests = [
            # (kwargs, expected_regex)
            ({
                'output': True
            }, r"took [0-9.]+ seconds"),
            ({
                'output': print_reversed
            }, r"seconds [0-9.]+ took"),
            ({
                'prefix': 'foo'
            }, r"foo took [0-9.]+ seconds"),
            ({
                'output': True,
                'prefix': 'foo'
            }, r"foo took [0-9.]+ seconds"),
            ({
                'output': True,
                'fmt': '{} seconds later...'
            }, r"[0-9.]+ seconds later..."),
        ]

        for kwargs, expected in tests:
            output = StringIO()
            with mock.patch('sys.stdout', new=output):
                with contexttimer.Timer(**kwargs):
                    pass

            self.assertIsNotNone(output)
            self.assertRegexpMatches(output.getvalue(), expected)
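
For reference, a minimal sketch of the Timer keyword arguments exercised by this test (output, prefix, fmt); the timed body is just a placeholder.

import contexttimer

with contexttimer.Timer(output=True, prefix="step 1"):
    pass   # prints something like "step 1 took 0.0001 seconds" on exit
with contexttimer.Timer(output=True, fmt="{} seconds later..."):
    pass   # prints something like "0.0001 seconds later..."
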
Example #3
import cProfile
import io
import pstats
from pstats import SortKey

import contexttimer
import torch


def run_model(model, use_cuda, num_iter=50, use_profile=False):
    # warm up
    model()
    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    if use_profile:
        pr = cProfile.Profile()
        pr.enable()

    with contexttimer.Timer() as t:
        for it in range(num_iter):
            result = model()

    if use_profile:
        pr.disable()
        s = io.StringIO()
        sortby = SortKey.CUMULATIVE
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

    if use_cuda:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed / num_iter
    else:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed / num_iter
    return result, qps, time_consume
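
A hedged usage sketch: run_model expects a zero-argument callable, so a model and its inputs are typically bound with a lambda. The toy model below is purely illustrative.

toy_model = torch.nn.Linear(128, 128)
inputs = torch.randn(32, 128)
result, qps, avg_latency = run_model(lambda: toy_model(inputs),
                                     use_cuda=False,
                                     num_iter=100)
print(qps, avg_latency)
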
Example #4
def benchmark_torch_jit(model: str, seq_len: int, batch_size: int, n: int,
                        num_threads: int):
    import transformers
    import contexttimer
    import torch.jit
    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)
    model = transformers.BertModel.from_pretrained(
        model)  # type: transformers.BertModel
    model.eval()
    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)

    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        model(input_ids)
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads
        }))
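
A hypothetical invocation; the checkpoint name is an assumption, and json is expected to be imported at module level for the print(json.dumps(...)) call above.

import json

benchmark_torch_jit(model="bert-base-uncased",
                    seq_len=128,
                    batch_size=1,
                    n=100,
                    num_threads=4)
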
Example #5
import contexttimer
import numpy as np


def weightedsvd(W, A, rank, err, U=None, V=None):
    if U is None:
        U = np.random.randn(A.shape[0], rank).astype(A.dtype)
    if V is None:
        V = np.random.randn(A.shape[1], rank).astype(A.dtype)
    Wsqrt = np.sqrt(W)
    oldApprox = A.copy()
    with contexttimer.Timer() as timer:
        while True:
            for row_index in range(A.shape[0]):
                Wrow = np.diag(Wsqrt[row_index, :])
                U[row_index, :] = np.linalg.pinv(
                    Wrow @ V) @ (Wrow @ A[row_index, :])
            for row_index in range(A.shape[1]):
                Wrow = np.diag(Wsqrt[:, row_index])
                V[row_index, :] = np.linalg.pinv(
                    Wrow @ U) @ (Wrow @ A[:, row_index])

            newApprox = U @ V.T
            change = np.linalg.norm(oldApprox - newApprox)
            oldApprox = newApprox

            print("Change:", change, "Elapsed:", timer.elapsed)

            if change <= err:
                return U, V
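
A small usage sketch with synthetic data; shapes, rank, and tolerance are chosen arbitrarily, and convergence may take a number of alternating sweeps.

A = np.random.randn(50, 40)
W = np.random.rand(50, 40)           # elementwise weights
U, V = weightedsvd(W, A, rank=5, err=1e-3)
print(np.linalg.norm(A - U @ V.T))   # residual of the rank-5 approximation
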
Example #6
def run_model(model, batch_size, seq_len, framework_name):
    # warmup
    model()

    # NOTE: `use_cuda` and `num_iter` are module-level globals in the original source;
    # the CUDA events below are also used unconditionally, so this variant assumes a GPU run.
    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

    start.record()
    with contexttimer.Timer() as t:
        for it in range(num_iter):
            model()
    end.record()
    torch.cuda.synchronize()
    torch_elapsed = start.elapsed_time(end) / 1e3
    ips = num_iter / torch_elapsed

    time_consume = torch_elapsed
    print(
        json.dumps({
            "IPS": ips,
            "elapsed": time_consume,
            "avg_elapsed": time_consume / num_iter,
            "iter": num_iter,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": framework_name,
        }))
Example #7
    def _get(self, endpoint):
        dev.debug("GET %s" % self._api_url(endpoint))

        # try to user current bearer token; this could result in a 401 if the
        # token is expired
        with contexttimer.Timer() as timer:
            req = requests.get(self._api_url(endpoint),
                               headers={
                                   'Authorization':
                                   'Bearer {}'.format(self.token),
                                   'Accept': 'application/json',
                               })
        log.info("GET {} - took {} sec".format(self._api_url(endpoint),
                                               timer.elapsed))

        if req.status_code == 401:
            # re-authenticate and repeat the request -  failure here is final
            self._authenticate_for(req)
            req = requests.get(self._api_url(endpoint),
                               headers={
                                   'Authorization':
                                   'Bearer {}'.format(self.token),
                                   'Accept': 'application/json',
                               })

        req.raise_for_status()

        data = req.json()
        dev.debug("GOT {}: {}".format(self._api_url(endpoint),
                                      pprint.pformat(data, indent=4)))
        return data
Example #8
def prep(d, n_o, Xs):
    # Replicates LMC (runlmc.models.lmc) code minimally.
    with contexttimer.Timer() as exact:
        dists = scipy.spatial.distance.pdist(Xs.reshape(-1, 1))
        dists = scipy.spatial.distance.squareform(dists)
    with contexttimer.Timer() as apprx:
        grid, m = LMC._autogrid(Xs, lo=None, hi=None, m=None)
        grid_dists = grid - grid[0]
        interpolant = multi_interpolant(Xs, grid)
        interpolantT = interpolant.transpose().tocsr()

    print()
    print('preparation time (once per optimization)')
    print('    {:8.4f} sec exact - pairwise distances (for dense approaches)'.
          format(exact.elapsed))
    print('    {:8.4f} sec apprx - linear interpolation (for approximations)'.
          format(apprx.elapsed))

    return dists, grid_dists, interpolant, interpolantT
Example #9
 def check_grads(f, name):
     with contexttimer.Timer() as t:
         exact_kgrad = f(exact)
     ngrad = sum(map(len, exact_kgrad))
     print('    {} gradients # {}'.format(name, ngrad))
     print('        {:10.4f} sec exact per gradient'.format(t.elapsed /
                                                            ngrad))
     tot_exact_time = t.elapsed
     with contexttimer.Timer() as t:
         apprx_kgrad = f(apprx)
     assert ngrad == sum(map(len, apprx_kgrad))
     print('        {:10.4f} sec apprx per gradient'.format(t.elapsed /
                                                            ngrad))
     tot_apprx_time = t.elapsed
     exact_kgrad = np.hstack(exact_kgrad)
     apprx_kgrad = np.hstack(apprx_kgrad)
     err = exact_kgrad - apprx_kgrad
     print('        {:9.4e} avg grad error'.format(np.fabs(err).mean()))
     return err, tot_exact_time, tot_apprx_time, exact_kgrad
Example #10
    def _impl_(model_name: str,
               seq_len: int,
               batch_size: int,
               n: int,
               num_threads: int = 1):
        import multiprocessing
        import os
        temp_fn = "/tmp/temp_onnx.model"
        p = multiprocessing.Pool(1)
        vocab_size = p.apply(generate_onnx_model,
                             args=(model_name, temp_fn, seq_len, batch_size,
                                   backend))
        p.close()
        import contexttimer
        import onnxruntime.backend
        import onnx
        import numpy
        import json
        if not onnxruntime.backend.supports_device(backend):
            raise RuntimeError(
                f"onnxruntime does not support {backend}, recompile it!")

        os.environ['OMP_NUM_THREADS'] = str(num_threads)
        os.environ['MKL_NUM_THREADS'] = str(num_threads)

        model = onnx.load_model(f=temp_fn)
        model = onnxruntime.backend.prepare(
            model=model,
            device=backend,
            graph_optimization_level=onnxruntime.GraphOptimizationLevel.
            ORT_ENABLE_ALL)
        input_ids = numpy.random.randint(low=0,
                                         high=vocab_size - 1,
                                         size=(batch_size, seq_len),
                                         dtype=numpy.int64)
        model.run(inputs=[input_ids])

        with contexttimer.Timer() as t:
            for _ in range(n):
                model.run(inputs=[input_ids])

        print(
            json.dumps({
                "QPS": n / t.elapsed,
                "elapsed": t.elapsed,
                "n": n,
                "batch_size": batch_size,
                "seq_len": seq_len,
                "framework": f"onnx_rt_{backend}",
                "n_threads": num_threads
            }))
Example #11
    def log_probability_of_sentence(self, tokens: Sequence[str]):
        """
        Derived from _SampleModel in lm_1b_eval.py
        """
        if isinstance(tokens, str):
            raise ValueError(
                "Input to log_probability_of_sentence is a sequence of token strings,"
                " not a single string")
        # these don't matter when we are running the model in inference mode
        targets = np.zeros([_BATCH_SIZE, _NUM_TIMESTEPS], np.int32)
        weights = np.ones([_BATCH_SIZE, _NUM_TIMESTEPS], np.float32)

        # these contain information about the previous word
        # we initialize them with the beginning-of-sentence marker
        inputs = np.zeros([_BATCH_SIZE, _NUM_TIMESTEPS], np.int32)
        inputs[0, 0] = self._vocab.word_to_id(_START_SENTENCE_SYMBOL)

        char_ids_inputs = np.zeros(
            [_BATCH_SIZE, _NUM_TIMESTEPS, self._vocab.max_word_length],
            np.int32)
        char_ids_inputs[0, 0, :] = self._vocab.word_to_char_ids(
            _START_SENTENCE_SYMBOL)

        # we take the log probability of a token sequence to be the sum of the log-probs
        # of each of its tokens given the preceding context
        log_prob_sum = 0.0
        for token in tokens:
            with contexttimer.Timer() as token_timer:
                dist_over_next_words = self._session.run(
                    self._name_to_node['softmax_out'],
                    feed_dict={
                        self._name_to_node['char_inputs_in']: char_ids_inputs,
                        self._name_to_node['inputs_in']: inputs,
                        self._name_to_node['targets_in']: targets,
                        self._name_to_node['target_weights_in']: weights
                    })
                token_idx = self._vocab.word_to_id(token)
                log_prob_sum += math.log(dist_over_next_words[0][token_idx])

                # prepare this word to be the context for the next word
                inputs[0, 0] = token_idx
                char_ids_inputs[0, 0, :] = self._vocab.word_to_char_ids(token)

        # restore original state so that future calls to log_probability_of_sentence
        # are not affected by past calls
        self._reset_state()

        return log_prob_sum
Example #12
    def load(*, graph_def_file: Path, checkpoint_file: Path,
             vocab: Union[Path, CharsVocabulary]) -> 'LM1B':
        with contexttimer.Timer() as loading_timer:
            resolved_vocab: CharsVocabulary
            if isinstance(vocab, CharsVocabulary):
                resolved_vocab = vocab
            else:
                resolved_vocab = CharsVocabulary(str(vocab), _MAX_WORD_LEN)

            ### copied from Tensorflow's model repo's lm_1b_eval.py
            with tf.Graph().as_default():
                logging.info('Recovering graph.')
                with tf.gfile.FastGFile(str(graph_def_file), 'r') as f:
                    s = f.read()
                    gd = tf.GraphDef()
                    text_format.Merge(s, gd)

                tf.logging.info('Recovering Graph %s', graph_def_file)
                t = {}
                [
                    t['states_init'], t['lstm/lstm_0/control_dependency'],
                    t['lstm/lstm_1/control_dependency'], t['softmax_out'],
                    t['class_ids_out'], t['class_weights_out'],
                    t['log_perplexity_out'], t['inputs_in'], t['targets_in'],
                    t['target_weights_in'], t['char_inputs_in'], t['all_embs'],
                    t['softmax_weights'], t['global_step']
                ] = tf.import_graph_def(gd, {}, [
                    'states_init', 'lstm/lstm_0/control_dependency:0',
                    'lstm/lstm_1/control_dependency:0', 'softmax_out:0',
                    'class_ids_out:0', 'class_weights_out:0',
                    'log_perplexity_out:0', 'inputs_in:0', 'targets_in:0',
                    'target_weights_in:0', 'char_inputs_in:0',
                    'all_embs_out:0', 'Reshape_3:0', 'global_step:0'
                ],
                                        name='')

                logging.info('Recovering checkpoint %s\n', checkpoint_file)
                sess = tf.Session(config=tf.ConfigProto(
                    allow_soft_placement=True, log_device_placement=True))
                sess.run('save/restore_all',
                         {'save/Const:0': str(checkpoint_file)})
                sess.run(t['states_init'])

        logging.info("Loaded language model in %s seconds",
                     loading_timer.elapsed)

        return LM1B(session=sess, name_to_node=t, vocab=resolved_vocab)
Example #13
 def _get(self, endpoint):
     dev.debug("GET %s" % self._api_url(endpoint))
     with contexttimer.Timer() as timer:
         req = requests.get(self._api_url(endpoint),
                            headers={
                                'Authorization':
                                'APIKey {}'.format(self.api_key_b64),
                                'Accept':
                                'application/json',
                            })
     log.info("GET {} - took {} sec".format(self._api_url(endpoint),
                                            timer.elapsed))
     req.raise_for_status()
     data = req.json()
     # dev.debug("GOT {}: {}".format(self._api_url(endpoint),
     #                               pprint.pformat(data, indent=4)))
     return data
Example #14
def run_model(model, use_gpu, num_iter, batch_size, seq_len, framework_name,
              num_threads, enable_mem_opt, model_name):
    # warm up
    import torch
    import contexttimer
    import json
    model()

    if use_gpu:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    with contexttimer.Timer() as t:
        if enable_mem_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                batch_size,  # batch
                seq_len,  # seq_len
                model.config.num_attention_heads,
                model.config.hidden_size,
                model.config.num_hidden_layers,
                "GPU" if use_gpu else "CPU")
        for it in range(num_iter):
            model()

    if not use_gpu:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed
    else:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed
    print(
        json.dumps({
            "QPS": qps,
            "elapsed": time_consume,
            "n": num_iter,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": framework_name,
            "thread_num": num_threads,
            "model_name": model_name
        }))
Example #15
def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
                        enable_random: bool, max_seq_len: int,
                        min_seq_len: int, num_threads: int, use_gpu: bool,
                        enable_mem_opt: bool):
    import transformers
    import contexttimer
    import torch.jit
    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)

    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        model(input_ids)
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads,
            "model_name": model_name
        }))
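
A hypothetical invocation; note that in this variant enable_random, max_seq_len, min_seq_len, use_gpu, and enable_mem_opt are accepted but not used in the function body, and json is assumed to be imported at module level.

benchmark_torch_jit("bert", seq_len=128, batch_size=1, n=100,
                    enable_random=False, max_seq_len=128, min_seq_len=16,
                    num_threads=4, use_gpu=False, enable_mem_opt=False)
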
Example #16
    def forward(self, contrast_obj, fx_illu, fy_illu):
        #compute illumination
        with contexttimer.Timer() as timer:
            field, fx_illu, fy_illu, fz_illu = self._genIllumination(
                fx_illu, fy_illu)
            field[:, :] *= np.exp(1.0j * 2.0 * np.pi * fz_illu *
                                  self.initial_z_position)
            field_layer_conj = af.constant(0.0,
                                           self.shape[0],
                                           self.shape[1],
                                           self.shape[2],
                                           dtype=af_complex_datatype)

            if (type(contrast_obj).__module__ == np.__name__):
                phasecontrast_obj_af = af.to_array(contrast_obj)
                flag_gpu_inout = False
            else:
                phasecontrast_obj_af = contrast_obj
                flag_gpu_inout = True

            #Binning
            obj_af = self._binObject(phasecontrast_obj_af)

            #Potentials to Transmittance
            obj_af = af.exp(1.0j * self.sigma * obj_af)

            for layer in range(self.shape[2]):
                field_layer_conj[:, :, layer] = af.conjg(field)
                field[:, :] *= obj_af[:, :, layer]
                if layer < self.shape[2] - 1:
                    field = self._propagationInplace(
                        field, self.slice_separation[layer])

            #propagate to volume center
            cache = (obj_af, field_layer_conj, flag_gpu_inout)

            if self.focus_at_center:
                #propagate to volume center
                field = self._propagationInplace(field,
                                                 self.distance_end_to_center,
                                                 adjoint=True)
        return {'forward_scattered_field': field, 'cache': cache}
Example #17
def bench_runlmc(num_runs, m, xss, yss, test_xss, test_yss, kgen, rgen,
                 slfmgen, indepgen, optimizer_opts, **kwargs):
    times, smses, nlpds = [], [], []
    for i in range(num_runs):
        ks = kgen()
        rs = rgen()
        slfm = slfmgen()
        indep = indepgen()
        lmc = LMC(xss,
                  yss,
                  kernels=ks,
                  ranks=rs,
                  slfm_kerns=slfm,
                  indep_gp=indep,
                  normalize=True,
                  m=m,
                  **kwargs)
        # use a separate loop variable so the run index `i` is not clobbered
        for k in range(lmc.nkernels['lmc']):
            print('LMC kernel', k, 'A matrix')
            print(eval('lmc.a{}'.format(k)).values)
            print('LMC kernel', k, 'kappa diag')
            print(eval('lmc.kappa{}'.format(k)).values)
        for k in range(lmc.nkernels['slfm']):
            k += lmc.nkernels['lmc']
            print('SLFM kernel', k, 'A matrix')
            print(eval('lmc.a{}'.format(k)).values)
        opt = AdaDelta(**optimizer_opts)
        with contexttimer.Timer() as t:
            lmc.optimize(optimizer=opt)
        times.append(t.elapsed)
        np.save(
            TMP +
            'lmc-m{}-{}of{}-{}.npy'.format(m, i, num_runs, sum(map(len, xss))),
            lmc.param_array)
        pred_yss, pred_vss = lmc.predict(test_xss)
        smses.append(smse(test_yss, pred_yss, yss))
        nlpds.append(nlpd(test_yss, pred_yss, pred_vss))
        print('time', times[-1], 'smse', smses[-1], 'nlpd', nlpds[-1])

    points = [times, smses, nlpds]
    stats = [(np.mean(x), np.std(x) / np.sqrt(len(x))) for x in points]
    return stats
Example #18
    def detect(self, test_sample: Sample) -> DetectionResult:
        """
        Performs the detection of the model

        :param alphai_watson.datasource.Sample test_sample: input sample to evaluate for anomaly
        :return alphai_watson.detective.DetectionResult: object containing detection verdict
        """

        logging.info("Running detector on {}".format(test_sample))

        test_data = test_sample.data.astype(np.float32)

        with contexttimer.Timer() as t:
            detection_array = self.model.run_discriminator(test_data)
        logging.info("Detection completed in {}".format(t.elapsed))

        return DetectionResult(
            data=detection_array,
            n_timesteps_in_chunk=test_sample.number_of_timesteps,
            original_sample_rate=test_sample.sample_rate)
Example #19
import contexttimer as ct


def predict(body):
    """
    Predict train delays.

    :param body: the request body
    :return: prediction response object
    """
    with ct.Timer() as t:
        pred = clf.predict(body).tolist()

    resp = {
        'metadata': {
            'git_commit': get_git_commit(),
            'prediction_elapsed_time': t.elapsed,
            'model_sha256': get_model_hash()
        },
        'prediction': pred
    }

    return resp
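
A hedged usage sketch; clf, get_git_commit, and get_model_hash are assumed to be defined at module level in the original service, and the feature row below is purely illustrative.

resp = predict([[5.1, 3.5, 1.4, 0.2]])   # hypothetical feature row
print(resp['metadata']['prediction_elapsed_time'])
print(resp['prediction'])
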
Example #20
def run_model(model,
              use_cuda,
              num_iter,
              batch_size,
              seq_len,
              framework_name,
              thread_num=1):
    # warm up
    import torch
    import contexttimer
    import json
    model()
    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    with contexttimer.Timer() as t:
        for it in range(num_iter):
            model()

    if not use_cuda:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed
    else:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed
    print(
        json.dumps({
            "QPS": qps,
            "elapsed": time_consume,
            "n": num_iter,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": framework_name,
            "thread_num": thread_num,
        }))
Example #21
def run_model(model, use_cuda, num_iter=50, use_profile=False):
    # warm up
    model()
    if use_cuda:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    with contexttimer.Timer() as t:
        for it in range(num_iter):
            result = model()

    if use_cuda:
        end.record()
        torch.cuda.synchronize()
        torch_elapsed = start.elapsed_time(end) / 1e3
        qps = num_iter / torch_elapsed
        time_consume = torch_elapsed / num_iter
    else:
        qps = num_iter / t.elapsed
        time_consume = t.elapsed / num_iter
    return result, qps, time_consume
Example #22
    def _get(self, endpoint):
        url = '{0}/v2/{1}'.format(self.url, endpoint)
        log.debug("GET {}".format(url))

        # Try to use previous bearer token
        with contexttimer.Timer() as timer:
            r = requests.get(url, auth=self.auth, verify=self.verify_ssl)

        log.info("GET {} - took {} sec".format(url, timer.elapsed))

        # If necessary, try to authenticate and try again
        if r.status_code == 401:
            self._authenticate_for(r)
            r = requests.get(url, auth=self.auth, verify=self.verify_ssl)

        data = r.json()

        if r.status_code != 200:
            raise RegistryError.from_data(data)

        log.debug("GOT {}: {}".format(url, pprint.pformat(data, indent=4)))
        return data
Example #23
    def forwardPredict(self, field=False):
        """
        Uses the current object in phase_obj_3d to predict the amplitude of the exit wave.
        Before calling, make sure the correct object is set.
        """
        obj_gpu = af.to_array(self._x) #self._x created in self.setScatteringMethod()
        with contexttimer.Timer() as timer:
            forward_scattered_predict= []
            for illu_idx in range(self.number_illum):
                fx_illu           = self.fx_illu_list[illu_idx]
                fy_illu           = self.fy_illu_list[illu_idx]               

                fields = self._forwardMeasure(fx_illu, fy_illu, obj = obj_gpu)
                if field:
                    forward_scattered_predict.append(np.array(fields["forward_scattered_field"]))
                else:
                    forward_scattered_predict.append(np.abs(fields["forward_scattered_field"]))
                if self.number_illum > 1:
                    print("illumination {:03d}/{:03d}.".format(illu_idx, self.number_illum), end="\r")                            
        if len(forward_scattered_predict[0][0].shape) == 2:
            forward_scattered_predict = np.array(forward_scattered_predict).transpose(2, 3, 1, 0)
        elif len(forward_scattered_predict[0][0].shape) == 1:
            forward_scattered_predict = np.array(forward_scattered_predict).transpose(1, 2, 0)
        return forward_scattered_predict
Example #24
def bench_runlmc(num_runs, m, xss, yss, test_xss, test_yss, kgen, rgen,
                 slfmgen, indepgen, optimizer_opts, **kwargs):
    times, smses, nlpds = [], [], []
    return_lmc = 'return_lmc' in kwargs
    if return_lmc:
        del kwargs['return_lmc']
    for i in range(num_runs):
        fk = FunctionalKernel(D=len(xss),
                              lmc_kernels=kgen(),
                              lmc_ranks=rgen(),
                              slfm_kernels=slfmgen(),
                              indep_gp=indepgen())
        lmc = InterpolatedLLGP(xss,
                               yss,
                               functional_kernel=fk,
                               normalize=True,
                               m=m,
                               **kwargs)
        opt = AdaDelta(**optimizer_opts)
        with contexttimer.Timer() as t:
            lmc.optimize(optimizer=opt)
        times.append(t.elapsed)
        np.save(
            TMP +
            'lmc-m{}-{}of{}-{}.npy'.format(m, i, num_runs, sum(map(len, xss))),
            lmc.param_array)
        pred_yss, pred_vss = lmc.predict(test_xss)
        smses.append(smse(test_yss, pred_yss, yss))
        nlpds.append(nlpd(test_yss, pred_yss, pred_vss))
        print('time', times[-1], 'smse', smses[-1], 'nlpd', nlpds[-1])

    points = [times, smses, nlpds]
    stats = [(np.mean(x), np.std(x) / np.sqrt(len(x))) for x in points]
    if return_lmc:
        return stats, lmc
    return stats
Example #25
def run_variable_model(model, use_gpu, num_iter, max_seq_len, min_seq_len,
                       framework_name, num_threads, cfg):
    import torch
    import contexttimer
    import json
    import random
    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')

    request_list = []
    # make sure all benchmarking runtimes are using the same random distribution.
    random.seed(0)
    for i in range(num_iter):
        generated_seq_len = random.randint(min_seq_len, max_seq_len)
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(1, generated_seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        request_list.append(input_ids)

    # warm-up using the longest sequence
    # TODO(jiaruifang) We now recommend running a warm-up pass before inference.
    # In the future we will refactor the allocator so that warm-up is no longer required.
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(1, max_seq_len),
                              dtype=torch.long,
                              device=test_device)
    model(input_ids)
    # NOTE: `enable_latency_plot` is a module-level flag in the original source.
    if enable_latency_plot:
        import time
        print(f"dump results to {framework_name}_latency_{num_threads}.txt")
        with open(f"{framework_name}_latency_{num_threads}.txt", "w") as of:
            result_list = []
            for request in request_list:
                if use_gpu:
                    start = torch.cuda.Event(enable_timing=True)
                    end = torch.cuda.Event(enable_timing=True)
                    start.record()

                with contexttimer.Timer() as t:
                    model(request)

                if not use_gpu:
                    qps = num_iter / t.elapsed
                    time_consume = t.elapsed
                else:
                    end.record()
                    torch.cuda.synchronize()
                    torch_elapsed = start.elapsed_time(end) / 1e3
                    qps = num_iter / torch_elapsed
                    time_consume = torch_elapsed
                print('.', end='', flush=True)
                result_list.append([len(request.view(-1)), time_consume])
            elapse = 0.
            result_list = sorted(result_list, key=lambda s: s[0])
            for item in result_list:
                of.write(f"{item[0]}, {item[1]}\n")
                elapse += item[1]
            print(f"elapsed {elapse}  QPS {num_iter/elapse}")
    else:
        if use_gpu:
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()

        with contexttimer.Timer() as t:
            for request in request_list:
                model(request)

        if not use_gpu:
            qps = num_iter / t.elapsed
            time_consume = t.elapsed
        else:
            end.record()
            torch.cuda.synchronize()
            torch_elapsed = start.elapsed_time(end) / 1e3
            qps = num_iter / torch_elapsed
            time_consume = torch_elapsed
        print(
            json.dumps({
                "QPS": qps,
                "elapsed": time_consume,
                "n": num_iter,
                "max_seq_len": max_seq_len,
                "min_seq_len": min_seq_len,
                "framework": framework_name,
                "thread_num": num_threads,
            }))
Example #26
import time

import contexttimer


def timer(*args, **kwargs):
    return contexttimer.Timer(*args, **kwargs, timer=time.perf_counter)
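
A short usage sketch of the wrapper: it forwards any Timer keyword arguments while fixing time.perf_counter as the clock.

with timer(output=True, prefix="busy loop") as t:
    sum(i * i for i in range(10**6))
print(t.elapsed)   # wall-clock seconds measured with time.perf_counter
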
Example #27
import thread_tools
import time
print('imported from thread_tools: ',dir(thread_tools))
import contexttimer
from thread_tools import wait_loop_nogil

nloops=500
with contexttimer.Timer(timer=time.perf_counter) as pure_wall:
    with contexttimer.Timer(timer=time.process_time) as pure_cpu:
        result=wait_loop_nogil(nloops)
print(f'pybind11 wall time {pure_wall.elapsed} and cpu time {pure_cpu.elapsed}')
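
For comparison, a minimal sketch of the same measurement around a pure-Python loop that holds the GIL; on a single thread, CPU time and wall time should come out roughly equal. wait_loop_py is a hypothetical stand-in for the pybind11 function.

def wait_loop_py(n):
    total = 0
    for i in range(n):
        total += i
    return total

with contexttimer.Timer(timer=time.perf_counter) as py_wall:
    with contexttimer.Timer(timer=time.process_time) as py_cpu:
        wait_loop_py(10**7)
print(f'pure python wall time {py_wall.elapsed} and cpu time {py_cpu.elapsed}')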



Example #28
# ### Create two functions, one to print thread and process ids, and one to run the wait_for loop
#
# * Important point -- the logging module is **threadsafe**
#
# * Submit 12 jobs queued on 3 processors

# %%
njobs = 12
nprocs = 3
thread_id_jobs = [(find_ids, [], {}) for i in range(nprocs)]
nloops = 5250
calc_jobs = [(wait_loop_nogil, [nloops], {}) for i in range(njobs)]
print(calc_jobs)

# %%
with contexttimer.Timer(timer=time.perf_counter) as wall:
    with contexttimer.Timer(timer=time.process_time) as cpu:
        with Parallel(n_jobs=nprocs, backend="threading") as parallel:
            # parallel(thread_id_jobs)
            results = parallel(calc_jobs)
        print(results)
print(f"wall time {wall.elapsed} and cpu time {cpu.elapsed}")

# %% [markdown]
# * Each job was run on a different thread but in the same process
#
# * Note that the cpu time is larger than the wall time, confirming that we've released the GIL.
#

# %% [markdown]
# ### Now repeat this holding the GIL
Example #29
    # enable synchronous mode
    vrep.simxSynchronous(client_id, True)

    position_history = []
    e, body_pos = vrep.simxGetObjectPosition(client_id, body, -1,
                                             vrep.simx_opmode_buffer)
    position_history.append(body_pos)

    joint_pos_history = []
    e, joint_0_pos = vrep.simxGetJointPosition(client_id, joint_0,
                                               vrep.simx_opmode_streaming)
    e, joint_1_pos = vrep.simxGetJointPosition(client_id, joint_1,
                                               vrep.simx_opmode_streaming)
    joint_pos_history.append([joint_0_pos, joint_1_pos])

    with contexttimer.Timer() as timer:
        for i in range(4):
            e = vrep.simxSetJointTargetPosition(client_id, joint_0, 0.5,
                                                vrep.simx_opmode_streaming)
            for t in range(3):
                vrep.simxSynchronousTrigger(client_id)
                e, body_pos = vrep.simxGetObjectPosition(
                    client_id, body, -1, vrep.simx_opmode_buffer)
                position_history.append(body_pos)
                e, joint_0_pos = vrep.simxGetJointPosition(
                    client_id, joint_0, vrep.simx_opmode_buffer)
                e, joint_1_pos = vrep.simxGetJointPosition(
                    client_id, joint_1, vrep.simx_opmode_buffer)
                joint_pos_history.append([joint_0_pos, joint_1_pos])

            e = vrep.simxSetJointTargetPosition(client_id, joint_1, 2.5,
Example #30
import itertools

import contexttimer
import numpy as np
import scipy.linalg as splinalg  # assumed alias; scipy.linalg.lstsq is used below


def compress(Xs, weights, bias, rank, elements_per_batch, snapshot_num=None):

    # Chosen Line Search Parameters
    alpha = 0.1
    beta = 0.5

    print("Setting up Problem")
    if snapshot_num is not None:
        f = np.load("snapshot_{}.npz".format(snapshot_num))
        U = f.get('arr_0')
        V = f.get('arr_1')
    else:
        # Seed the initial U, V with the SVD.
        Ufull, Sfull, Vfull = np.linalg.svd(weights, full_matrices=False)
        U = Ufull[:, :rank] @ np.diag(Sfull[:rank])
        V = Vfull[:rank, :]
        np.savez("snapshot_SVD.npz", U, V)

    for iteration in itertools.count(snapshot_num or 0):
        # Due to memory constraints, we must sub-sample the inputs.
        X = Xs[np.random.choice(Xs.shape[0], elements_per_batch)]
        print("Starting Iteration", iteration)
        gold = sigmoid(X @ weights + bias)
        current_guess = sigmoid(X @ U @ V + bias)
        diff = current_guess - gold
        current_value = np.linalg.norm(diff, 'fro')**2
        print("Current Status:", current_value)

        # applying the gradient of the sigmoid
        sig_gradient = (current_guess * (1 - current_guess)).ravel()

        with contexttimer.Timer() as tv:
            partialGradV = (sig_gradient * computeGradientAB(X @ U, V).T).T
        print("GradV:", tv.elapsed)
        print(np.linalg.norm(partialGradV, np.inf))
        with contexttimer.Timer() as tu:
            partialGradU = (sig_gradient * computeGradientABC(X, U, V).T).T
        print("GradU:", tu.elapsed)
        print(np.linalg.norm(partialGradU, np.inf))
        # stack into one large matrix
        stacked = np.concatenate([partialGradU, partialGradV], axis=1)

        with contexttimer.Timer() as tup:
            print("Starting LSTSQ")
            assert np.isfinite(diff).all()
            assert np.isfinite(stacked).all()
            update = splinalg.lstsq(stacked, -diff.ravel())[0]
            print("Finished LSTSQ")

            expected_value = np.linalg.norm(diff.ravel() + stacked @ update)**2
            expected_improvement = expected_value - current_value  # This is negative, as we expect an improvement
            assert expected_improvement < 0
            print("Expected Improvement:", expected_improvement)
            t = 1
            while True:
                Uupdate = t * update[:U.size].reshape(U.shape)
                Vupdate = t * update[U.size:].reshape(V.shape)
                new_value = np.linalg.norm(
                    sigmoid(X @ (U + Uupdate) @ (V + Vupdate) + bias) - gold,
                    'fro')**2

                if new_value <= current_value + alpha * t * expected_improvement:
                    print("Line search T:", t)
                    print("New Value:", new_value)
                    step_size = 1 / np.sqrt(iteration + 1)
                    U += step_size * Uupdate
                    V += step_size * Vupdate
                    np.savez("snapshot_{}.npz".format(iteration), U, V)
                    if np.linalg.norm(t * step_size * update) <= 1e-5:
                        return U, V
                    break
                t *= beta

        print("Update:", tup.elapsed)
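
The routine above assumes sigmoid, computeGradientAB, and computeGradientABC are defined elsewhere in the original module; a minimal sketch of the elementwise sigmoid under that assumption:

def sigmoid(x):
    # elementwise logistic function applied to a NumPy array
    return 1.0 / (1.0 + np.exp(-x))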