Example #1
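Profiles a fixed number of training minibatches with the repo-local torchprofiler module, accumulating per-layer forward/backward times and per-iteration timestamps. The function assumes that time and os are imported and that an AverageMeter helper and a global args namespace (with a verbose flag) are in scope.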
def profile_train(train_loader, model, criterion, optimizer):
    batch_time_meter = AverageMeter()
    data_time_meter = AverageMeter()
    NUM_STEPS_TO_PROFILE = 100  # profile 100 steps or minibatches

    # switch to train mode
    model.train()

    layer_timestamps = []
    data_times = []

    iteration_timestamps = []
    opt_step_timestamps = []
    data_timestamps = []
    
    start_time = time.time()
    for i, (input, target) in enumerate(train_loader):
        data_pid = os.getpid()
        data_time = time.time() - start_time
        data_time_meter.update(data_time)
        with torchprofiler.Profiling(model, module_whitelist=[]) as p:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(input)
            if isinstance(output, tuple):
                loss = sum((criterion(output_elem, target) for output_elem in output))
            else:
                loss = criterion(output, target)

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer_step_start = time.time()
            optimizer.step()

            end_time = time.time()
            iteration_time = end_time - start_time
            batch_time_meter.update(iteration_time)

            if i >= NUM_STEPS_TO_PROFILE:
                break
        p_str = str(p)
        layer_timestamps.append(p.processed_times())
        data_times.append(data_time)

        if args.verbose:
            print('End-to-end time: {batch_time.val:.3f} s ({batch_time.avg:.3f} s)'.format(
                  batch_time=batch_time_meter))

        iteration_timestamps.append({"start": start_time * 1000 * 1000,
                                     "duration": iteration_time * 1000 * 1000})
        opt_step_timestamps.append({"start": optimizer_step_start * 1000 * 1000,
                                    "duration": (end_time - optimizer_step_start) * 1000 * 1000, "pid": os.getpid()})
        data_timestamps.append({"start":  start_time * 1000 * 1000,
                                "duration": data_time * 1000 * 1000, "pid": data_pid})
        
        start_time = time.time()

    layer_times = []
    tot_accounted_time = 0.0
    if args.verbose:
        print("\n==========================================================")
        print("Layer Type    Forward Time (ms)    Backward Time (ms)")
        print("==========================================================")

    for i in range(len(layer_timestamps[0])):
        layer_type = str(layer_timestamps[0][i][0])
        layer_forward_time_sum = 0.0
        layer_backward_time_sum = 0.0
        for j in range(len(layer_timestamps)):
            layer_forward_time_sum += (layer_timestamps[j][i][2] / 1000)
            layer_backward_time_sum += (layer_timestamps[j][i][5] / 1000)
        layer_times.append((layer_type, layer_forward_time_sum / len(layer_timestamps),
                                    layer_backward_time_sum / len(layer_timestamps)))
        if args.verbose:
            print(layer_times[-1][0], layer_times[-1][1], layer_times[-1][2])
        tot_accounted_time += (layer_times[-1][1] + layer_times[-1][2])

    print()
    print("Total accounted time: %.3f ms" % tot_accounted_time)
    return layer_times, (sum(data_times) * 1000.0) / len(data_times)
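For context, a minimal driver for profile_train might look like the sketch below. The tiny model, the synthetic dataset, and the args namespace are hypothetical stand-ins, and the sketch assumes it lives in the same module as profile_train so that the repo-local torchprofiler and AverageMeter helpers (and the args global) resolve.

import argparse

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Hypothetical stand-in for the global `args` that profile_train reads.
args = argparse.Namespace(verbose=True)

model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10)).cuda()
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# 512 synthetic CIFAR-sized samples -> 16 minibatches, so this run simply
# exhausts the loader before reaching NUM_STEPS_TO_PROFILE.
dataset = TensorDataset(torch.randn(512, 3, 32, 32),
                        torch.randint(0, 10, (512,)))
train_loader = DataLoader(dataset, batch_size=32)

layer_times, avg_data_time_ms = profile_train(train_loader, model,
                                              criterion, optimizer)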
Example #2
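A trainer method (GNMT-style sequence-to-sequence) that wraps each iteration in torchprofiler.Profiling, brackets the forward pass, loss computation, backward pass, and optimizer step with wall-clock timestamps, optionally toggles CUPTI tracing around the loop, and writes the collected trace to a per-process JSON file via torchprofiler.generate_json.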
    def profile_feed_data(self, data_loader, log_dir, training=True):
        pid = os.getpid()
        stdlog = open(os.path.join(log_dir, "stdout.{}.txt".format(pid)), "a+")
        #summary = torchsummary.summary(self.model, , data_loader.batch_size, False)
        if training:
            assert self.optimizer is not None
        if self.math != "fp32":
            print("fp32 only supperted currently")
            return

        fflayer_timestamps = []
        bplayer_timestamps = []
        iteration_timestamps = []
        timestamp_blobs = {}
        timestamp_blobs["optimizer_step"] = []
        timestamp_blobs["ffpass"] = []
        timestamp_blobs["loss"] = []
        timestamp_blobs["bppass"] = []

        if self.cupti:
            start_cupti_tracing()
        for i, (src, tgt, _) in enumerate(data_loader):
            print("iteration {}".format(i))
            if i >= self.num_steps:
                break
            # do a train/evaluate iteration
            # stats = self.iterate(src, tgt, training=training)
            src, src_length = src
            tgt, tgt_length = tgt
            src_length = torch.LongTensor(src_length)
            tgt_length = torch.LongTensor(tgt_length)

            num_toks = {}
            num_toks['tgt'] = int(sum(tgt_length - 1))
            num_toks['src'] = int(sum(src_length))

            t0 = time.time()  # iteration start, before host-to-device copies

            if self.cuda:
                src = src.cuda()
                src_length = src_length.cuda()
                tgt = tgt.cuda()

            with torchprofiler.Profiling(self.model) as p:
                t1 = time.time()  # forward pass start
                if self.batch_first:
                    output = self.model(src, src_length, tgt[:, :-1])
                    tgt_labels = tgt[:, 1:]
                    T, B = output.size(1), output.size(0)
                else:
                    output = self.model(src, src_length, tgt[:-1])
                    tgt_labels = tgt[1:]
                    T, B = output.size(0), output.size(1)
                t2 = time.time()  # forward pass end / loss start
                loss = self.criterion(output.view(T * B, -1).float(),
                                      tgt_labels.contiguous().view(-1))
                loss_per_batch = loss.item()
                loss /= B
                t3 = time.time()  # loss end / backward pass start

                if not training:
                    continue

                #print(type(self.fp_optimizer))
                #self.fp_optimizer.step(loss, self.optimizer, True)
                self.model.zero_grad()
                loss.backward()
                t4 = time.time()  # backward end / optimizer step start
                if self.grad_clip != float('inf'):
                    clip_grad_norm_(self.model.parameters(), self.grad_clip)
                self.optimizer.step()
                t5 = time.time()  # optimizer step end (includes grad clipping)
            print("iteration time: {}".format(t5 - t0))

            fflayer_timestamps.append(p.processed_times_dir('forward'))
            bplayer_timestamps.append(p.processed_times_dir('backward'))
            timestamp_blobs["optimizer_step"].append({
                "start": t4 * 1000 * 1000,
                "duration": (t5 - t4) * 1000 * 1000,
                "in_critical_path": True,
                "pid": pid})
            timestamp_blobs["ffpass"].append({
                "start": t1 * 1000 * 1000,
                "duration": (t2 - t1) * 1000 * 1000,
                "in_critical_path": True,
                "pid": pid})
            timestamp_blobs["loss"].append({
                "start": t2 * 1000 * 1000,
                "duration": (t3 - t2) * 1000 * 1000,
                "in_critical_path": True,
                "pid": pid})
            timestamp_blobs["bppass"].append({
                "start": t3 * 1000 * 1000,
                "duration": (t4 - t3) * 1000 * 1000,
                "in_critical_path": True,
                "pid": pid})
            iteration_timestamps.append({
                "start": t1 * 1000 * 1000,
                "duration": (t5 - t1) * 1000 * 1000})

        '''
        layer_times = []

        for i in range(len(layer_timestamps[0])):
            layer_type = str(layer_timestamps[0][i][0])
            layer_forward_times = []
            layer_backward_times = []
            for j in range(len(layer_timestamps)):
                layer_forward_times.append(layer_timestamps[j][i][2])
                layer_backward_times.append(layer_timestamps[j][i][5])
            layer_times.append([layer_type, layer_forward_times, layer_backward_times])
        '''

        print("finish iterations")
        if self.cupti:
            end_cupti_tracing()
            print("end cupti tracing")

        torchprofiler.generate_json(
            0, self.num_steps - 1, fflayer_timestamps, bplayer_timestamps,
            iteration_timestamps, timestamp_blobs,
            output_filename=os.path.join(
                log_dir, "processed_time.{}.json".format(pid)))
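Every timestamp blob built above stores its start and duration in microseconds, so a small helper can summarize any one phase after the run; a minimal sketch:

def mean_duration_ms(blobs):
    # Durations above are recorded in microseconds (seconds * 1000 * 1000);
    # divide by 1000 to report milliseconds.
    return sum(b["duration"] for b in blobs) / (len(blobs) * 1000.0)

# e.g. mean_duration_ms(timestamp_blobs["optimizer_step"])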

Example #3
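A trainer method that profiles training or validation: it first builds a torchsummary summary of the model from one representative batch, then records per-layer forward/backward times for up to 100 steps, averages them across iterations, folds the averages back into the summary, and (when training) emits a profile graph via the repo-local create_graph helper.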
    def feed_data(self, data_loader, training=True):
        """
        Runs training or validation on batches from data_loader.

        :param data_loader: data loader
        :param training: if True runs training else runs validation
        """
        if training:
            assert self.optimizer is not None
            eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
            eval_iters = (eval_fractions * len(data_loader)).astype(int)
            eval_iters = set(eval_iters)

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses_per_token = AverageMeter()
        losses_per_sentence = AverageMeter()

        tot_tok_time = AverageMeter()
        src_tok_time = AverageMeter()
        tgt_tok_time = AverageMeter()

        batch_size = data_loader.batch_size
        layer_timestamps = []
        verbose = True

        module_whitelist = ["EmuBidirLSTM", "RecurrentAttention", "Classifier"]

        # Pull a single batch to build a representative model input for
        # torchsummary.
        src, tgt = next(iter(data_loader))
        (src, src_length) = src
        (tgt, tgt_length) = tgt
        src_length = torch.LongTensor(src_length).cuda()
        src = src.cuda()
        tgt = tgt.cuda()
        model_input = (src, src_length, tgt[:-1])
        summary = torchsummary.summary(model=self.model,
                                       module_whitelist=module_whitelist,
                                       model_input=model_input,
                                       verbose=True)

        end = time.time()
        NUM_STEPS_TO_PROFILE = 100  # profile 100 steps
        for i, (src, tgt) in enumerate(data_loader):
            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            with torchprofiler.Profiling(self.model, module_whitelist) as p:
                # do a train/evaluate iteration
                stats = self.iterate(src, tgt, training=training)
                loss_per_token, loss_per_sentence, num_toks = stats
            print(str(p))
            layer_timestamps.append(p.processed_times())

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg

            if training and i in eval_iters:
                test_bleu, _ = self.translator.run(calc_bleu=True,
                                                   epoch=self.epoch,
                                                   iteration=i)

                log = []
                log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'BLEU: {test_bleu:.2f}']
                log = '\t'.join(log)
                logging.info(log)

                self.model.train()
                self.preallocate(data_loader, training=True)

            if i % self.print_freq == 0:
                phase = 'TRAIN' if training else 'VALIDATION'
                log = []
                log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
                log += [f'Data {data_time.val:.5f} ({data_time.avg:.5f})']
                log += [
                    f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})'
                ]
                if self.verbose:
                    log += [
                        f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})'
                    ]
                log += [
                    f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})'
                ]
                lr = [
                    param_group['lr']
                    for param_group in self.optimizer.param_groups
                ]
                log += [f'Learning Rate {lr}']
                log = '\t'.join(log)
                logging.info(log)

            if i >= NUM_STEPS_TO_PROFILE:
                break

            save_chkpt = (self.save_counter %
                          self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)

            end = time.time()

        if verbose:
            print(
                "\n==========================================================")
            print("Layer Type    Forward Time (ms)    Backward Time (ms)")
            print("==========================================================")

        tot_accounted_time = 0.0
        per_layer_times = []
        for i in range(len(layer_timestamps[0])):
            layer_type = str(layer_timestamps[0][i][0])
            layer_forward_time_sum = 0.0
            layer_backward_time_sum = 0.0
            for j in range(len(layer_timestamps)):
                layer_forward_time_sum += (layer_timestamps[j][i][2] / 1000)
                layer_backward_time_sum += (layer_timestamps[j][i][5] / 1000)
            per_layer_times.append(
                (layer_type, layer_forward_time_sum / len(layer_timestamps),
                 layer_backward_time_sum / len(layer_timestamps)))
            if verbose:
                print(per_layer_times[-1][0], per_layer_times[-1][1],
                      per_layer_times[-1][2])
            tot_accounted_time += (per_layer_times[-1][1] +
                                   per_layer_times[-1][2])

        print("Total accounted time: %.3f ms" % tot_accounted_time)

        while len(per_layer_times) > 0:
            per_layer_time = per_layer_times.pop(0)
            for summary_i in range(len(summary)):
                summary_elem = summary[summary_i]
                if str(summary_elem['layer_name']) != str(per_layer_time[0]):
                    continue
                if 'forward_time' in summary_elem and 'backward_time' in summary_elem:
                    continue
                summary_elem['forward_time'] = per_layer_time[1]
                summary_elem['backward_time'] = per_layer_time[2]
                break

        if training:
            create_graph(self.model, module_whitelist, (src, tgt), summary,
                         os.path.join("profiles", self.arch))

        tot_tok_time.reduce('sum')
        losses_per_token.reduce('mean')

        return losses_per_token.avg, tot_tok_time.avg
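The index-juggling match loop above can be written more directly. A behavior-equivalent sketch (assuming, as in the code above, that each summary entry carries a 'layer_name' key and that 'forward_time' and 'backward_time' are always set together):

def attach_layer_times(summary, per_layer_times):
    # Assign each averaged (layer_type, fwd_ms, bwd_ms) measurement to the
    # first summary entry of the same layer type not yet filled in.
    for layer_type, fwd_ms, bwd_ms in per_layer_times:
        for elem in summary:
            if (str(elem['layer_name']) == layer_type
                    and 'forward_time' not in elem):
                elem['forward_time'] = fwd_ms
                elem['backward_time'] = bwd_ms
                break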