Example no. 1
    def feed_data(self, data_loader, training=True):
        if training:
            assert self.optimizer is not None
        batch_size = data_loader.batch_size

        losses_per_token = AverageMeter()
        losses_per_sentence = AverageMeter()

        start_time = 0
        end_time = 0
        t0 = time.time()
        for i, (src, tgt, _) in enumerate(data_loader):
            if i == self.profile_start:
                if self.cupti:
                    start_cupti_tracing()
                elif self.nsight:
                    torch.cuda.profiler.start()
                start_time = time.time()

            if i == self.profile_stop:
                end_time = time.time()
                if self.cupti:
                    end_cupti_tracing()
                elif self.nsight:
                    torch.cuda.profiler.stop()
                break
            self.save_counter += 1

            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats

            # record losses, weighted by token / sentence counts
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            save_chkpt = (self.save_counter %
                          self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)
            t1 = time.time()
            print("iteration {}: {} ms".format(i, (t1 - t0) * 1000))
            t0 = t1
        print("average time {:.2f} ms".format(
            (end_time - start_time) * 1000 /
            (self.profile_stop - self.profile_start)))
        return losses_per_token.avg
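These snippets all rely on an AverageMeter helper that none of them defines. Below is a minimal sketch of the interface they assume: a most-recent value `val`, a weighted running average `avg`, and `update(val, n)`. The `skip_first` flag and the distributed `reduce` calls seen in later examples belong to the original helper and are only approximated here.

class AverageMeter:
    """Tracks the most recent value and a weighted running average."""

    def __init__(self, skip_first=True):
        self.skip_first = skip_first  # optionally ignore the first (warmup) update
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        if self.skip_first:
            self.skip_first = False
            return
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count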
Example no. 2
    def feed_data(self, data_loader, training=True):
        if training:
            assert self.optimizer is not None
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses_per_token = AverageMeter()
        losses_per_sentence = AverageMeter()

        tot_tok_time = AverageMeter()
        src_tok_time = AverageMeter()
        tgt_tok_time = AverageMeter()

        batch_size = data_loader.batch_size

        end = time.time()
        for i, (src, tgt, _) in enumerate(data_loader):
            print("iteration {}".format(i))
            if i >= self.num_steps and self.num_steps > 0:
                break
            if i == 5 and self.cupti:
                start_cupti_tracing()
            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg
            end = time.time()

            if i % self.print_freq == 0:
                phase = 'TRAIN' if training else 'EVAL'
                log = []
                log += ['{} [{}][{}/{}]'.format(phase, self.epoch, i, len(data_loader))]
                log += ['Time {:.3f} ({:.3f})'.format(batch_time.val, batch_time.avg)]
                log += ['Data {:.3f} ({:.3f})'.format(data_time.val, data_time.avg)]
                log += ['Tok/s {:.0f} ({:.0f})'.format(tot_tok_time.val, tot_tok_time.avg)]
                if self.verbose:
                    log += ['Src tok/s {:.0f} ({:.0f})'.format(src_tok_time.val, src_tok_time.avg)]
                    log += ['Tgt tok/s {:.0f} ({:.0f})'.format(tgt_tok_time.val, tgt_tok_time.avg)]
                    log += ['Loss/sentence {:.1f} ({:.1f})'.format(losses_per_sentence.val, losses_per_sentence.avg)]
                log += ['Loss/tok {:.8f} ({:.8f})'.format(losses_per_token.val, losses_per_token.avg)]
                log = '\t'.join(log)
                print(log)
                #logging.info(log)

            save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)
        if self.cupti:
            end_cupti_tracing()
        return losses_per_token.avg
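The checkpointing block above uses a sync_workers context manager that isn't shown. In this style of distributed training code it typically yields the worker's rank and synchronizes all workers on exit, so rank 0 finishes writing the checkpoint before anyone proceeds; the following is a sketch under that assumption, not the original implementation.

from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def sync_workers():
    """Yield this worker's rank; barrier on exit so all workers stay in step."""
    rank = dist.get_rank() if dist.is_initialized() else 0
    yield rank
    if dist.is_initialized():
        dist.barrier()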
Example no. 3
    def feed_data(self, data_loader, training=True):
        """
        Runs training or validation on batches from data_loader.

        :param data_loader: data loader
        :param training: if True runs training else runs validation
        """
        if training:
            assert self.optimizer is not None
            eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
            iters_with_update = len(data_loader) // self.iter_size
            eval_iters = (eval_fractions * iters_with_update).astype(int)
            eval_iters = eval_iters * self.iter_size
            eval_iters = set(eval_iters)

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses_per_token = AverageMeter(skip_first=False)
        losses_per_sentence = AverageMeter(skip_first=False)

        tot_tok_time = AverageMeter()
        src_tok_time = AverageMeter()
        tgt_tok_time = AverageMeter()

        batch_size = data_loader.batch_size

        end = time.time()
        for i, (src, tgt) in enumerate(data_loader):
            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            update = False
            if i % self.iter_size == self.iter_size - 1:
                update = True

            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, update, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg

            if training and i in eval_iters:
                test_bleu, _ = self.translator.run(calc_bleu=True,
                                                   epoch=self.epoch,
                                                   iteration=i)

                log = []
                log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'BLEU: {test_bleu:.2f}']
                log = '\t'.join(log)
                logging.info(log)

                self.model.train()
                self.preallocate(data_loader, training=True)

            if i % self.print_freq == 0:
                phase = 'TRAIN' if training else 'VALIDATION'
                log = []
                log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
                log += [f'Data {data_time.val:.2e} ({data_time.avg:.2e})']
                log += [
                    f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})'
                ]
                if self.verbose:
                    log += [
                        f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})'
                    ]
                log += [
                    f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})'
                ]
                if training:
                    lr = self.optimizer.param_groups[0]['lr']
                    log += [f'LR {lr:.3e}']
                log = '\t'.join(log)
                logging.info(log)

            save_chkpt = (self.save_counter %
                          self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)

            end = time.time()

        tot_tok_time.reduce('sum')
        losses_per_token.reduce('mean')

        return losses_per_token.avg, tot_tok_time.avg
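The intra-epoch evaluation schedule at the top of Example no. 3 is easy to check by hand. The snippet below reproduces the arithmetic in isolation with made-up values (1000 batches per epoch, gradient accumulation over iter_size=2 batches, 3 intra-epoch evaluations requested):

import numpy as np

intra_epoch_eval = 3
iter_size = 2
num_batches = 1000

# linspace(0, 1, 5)[1:-1] -> [0.25, 0.5, 0.75]: evenly spaced fractions
# of the epoch, excluding its start and end
eval_fractions = np.linspace(0, 1, intra_epoch_eval + 2)[1:-1]
iters_with_update = num_batches // iter_size  # 500 optimizer updates
eval_iters = (eval_fractions * iters_with_update).astype(int) * iter_size
print(set(eval_iters))  # {250, 500, 750}: batch indices where BLEU is run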
Example no. 4
def main():
    args = parse_args()
    print(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                       config.SRC_TEST_FNAME),
                                tgt_fname=os.path.join(args.dataset_dir,
                                                       config.TGT_TEST_FNAME),
                                tokenizer=tokenizer,
                                min_len=0,
                                max_len=150,
                                sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    # model.eval() was already called above; just release cached CUDA memory
    if args.cuda:
        torch.cuda.empty_cache()

    # only write the output to file in accuracy mode
    if args.mode == 'accuracy':
        test_file = open(args.output, 'w', encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    stats = {}

    for i, (src, tgt, indices) in enumerate(test_loader):
        translate_timer = time.time()
        src, src_length = src

        if translator.batch_first:
            batch_size = src.size(0)
        else:
            batch_size = src.size(1)
        beam_size = args.beam_size

        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        src_length = torch.LongTensor(src_length)
        stats['total_enc_len'] = int(src_length.sum())

        if args.cuda:
            src = src.cuda()
            src_length = src_length.cuda()
            bos = bos.cuda()

        with torch.no_grad():
            context = translator.model.encode(src, src_length)
            context = [context, src_length, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search
            preds, lengths, counter = generator(batch_size, bos, context)

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        output = []
        for idx, pred in enumerate(preds):
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        # only write the output to file in accuracy mode
        if args.mode == 'accuracy':
            output = [output[indices.index(i)] for i in range(len(output))]
            for line in output:
                test_file.write(line)
                test_file.write('\n')

        # Get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)

        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)

        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if i % 5 == 0:
            log = 'TEST '
            log += 'Time {:.3f} ({:.3f})\t'.format(batch_time.val,
                                                   batch_time.avg)
            log += 'Decoder iters {:.1f} ({:.1f})\t'.format(
                iterations.val, iterations.avg)
            log += 'Tok/s {:.0f} ({:.0f})'.format(tot_tok_per_sec.val,
                                                  tot_tok_per_sec.avg)
            print(log)

    # summary timing
    time_per_sentence = (batch_time.avg / batch_size)
    log = 'TEST SUMMARY:\n'
    log += 'Lines translated: {}\t'.format(len(test_loader.dataset))
    log += 'Avg total tokens/s: {:.0f}\n'.format(tot_tok_per_sec.avg)
    log += 'Avg time per batch: {:.3f} s\t'.format(batch_time.avg)
    log += 'Avg time per sentence: {:.3f} ms\n'.format(1000 *
                                                       time_per_sentence)
    log += 'Avg encoder seq len: {:.2f}\t'.format(enc_seq_len.avg)
    log += 'Avg decoder seq len: {:.2f}\t'.format(dec_seq_len.avg)
    log += 'Total decoder iterations: {}'.format(int(iterations.sum))
    print(log)

    # only write the output to file in accuracy mode
    if args.mode == 'accuracy':
        test_file.close()

        test_path = args.output
        # run moses detokenizer
        detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
        detok_test_path = test_path + '.detok'

        with open(detok_test_path, 'w') as detok_test_file, \
                open(test_path, 'r') as test_file:
            subprocess.run(['perl', detok_path],
                           stdin=test_file,
                           stdout=detok_test_file,
                           stderr=subprocess.DEVNULL)

        # run sacrebleu
        reference_path = os.path.join(args.dataset_dir,
                                      config.TGT_TEST_TARGET_FNAME)
        sacrebleu = subprocess.run([
            'sacrebleu --input {} {} --score-only -lc --tokenize intl'.format(
                detok_test_path, reference_path)
        ],
                                   stdout=subprocess.PIPE,
                                   shell=True)
        bleu = float(sacrebleu.stdout.strip())

        print('BLEU on test dataset: {}'.format(bleu))

        print('Finished evaluation on test set')
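checkpoint_from_distributed and unwrap_distributed are not defined in these examples. Below is a plausible sketch of what they do, assuming the common convention that DistributedDataParallel checkpoints prefix every parameter name with 'module.':

def checkpoint_from_distributed(state_dict):
    """Heuristic: DDP checkpoints carry a 'module.' prefix on parameter names."""
    return any('module.' in key for key in state_dict)


def unwrap_distributed(state_dict):
    """Strip the 'module.' prefix so the weights load into a bare model."""
    return {key.replace('module.', '', 1): value
            for key, value in state_dict.items()}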
Example no. 5
def main():
    args = parse_args()
    print(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size, math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = checkpoint['tokenizer']

    translation_model = Translator(model,
                                   tokenizer,
                                   beam_size=args.beam_size,
                                   max_seq_len=args.max_seq_len,
                                   len_norm_factor=args.len_norm_factor,
                                   len_norm_const=args.len_norm_const,
                                   cov_penalty_factor=args.cov_penalty_factor,
                                   cuda=args.cuda)

    output_file = codecs.open(args.output, 'w', encoding='UTF-8')

    # run model on generated data, for accurate timings starting from 1st batch
    dummy_data = ['abc ' * (args.max_seq_len // 4)] * args.batch_size
    translation_model.translate(dummy_data)

    if args.cuda:
        torch.cuda.synchronize()

    batch_time = AverageMeter(False)
    enc_tok_per_sec = AverageMeter(False)
    dec_tok_per_sec = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)

    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)

    total_lines = 0
    total_iters = 0
    with codecs.open(args.input, encoding='UTF-8') as input_file:
        for idx, lines in enumerate(grouper(input_file, args.batch_size)):
            lines = [l for l in lines if l]
            n_lines = len(lines)
            total_lines += n_lines

            translate_timer = time.time()
            translated_lines, stats = translation_model.translate(lines)
            elapsed = time.time() - translate_timer

            batch_time.update(elapsed, n_lines)
            etps = stats['total_enc_len'] / elapsed
            dtps = stats['total_dec_len'] / elapsed
            enc_seq_len.update(stats['total_enc_len'] / n_lines, n_lines)
            dec_seq_len.update(stats['total_dec_len'] / n_lines, n_lines)
            enc_tok_per_sec.update(etps, n_lines)
            dec_tok_per_sec.update(dtps, n_lines)

            tot_tok = stats['total_dec_len'] + stats['total_enc_len']
            ttps = tot_tok / elapsed
            tot_tok_per_sec.update(ttps, n_lines)

            n_iterations = stats['iters']
            total_iters += n_iterations

            write_output(output_file, translated_lines)

            if idx % args.print_freq == args.print_freq - 1:
                print(f'TRANSLATION: '
                      f'Batch {idx} '
                      f'Iters {n_iterations}\t'
                      f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      f'Tot tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})\t'
                      f'Enc tok/s {enc_tok_per_sec.val:.0f} ({enc_tok_per_sec.avg:.0f})\t'
                      f'Dec tok/s {dec_tok_per_sec.val:.0f} ({dec_tok_per_sec.avg:.0f})')

    output_file.close()

    print(f'TRANSLATION SUMMARY:\n'
          f'Lines translated: {total_lines}\t'
          f'Avg time per batch: {batch_time.avg:.3f} s\t'
          f'Avg time per sentence: {1000*(batch_time.avg / args.batch_size):.3f} ms\n'
          f'Avg enc seq len: {enc_seq_len.avg:.2f}\t'
          f'Avg dec seq len: {dec_seq_len.avg:.2f}\t'
          f'Total iterations: {total_iters}\t\n'
          f'Avg tot tok/s: {tot_tok_per_sec.avg:.0f}\t'
          f'Avg enc tok/s: {enc_tok_per_sec.avg:.0f}\t'
          f'Avg dec tok/s: {dec_tok_per_sec.avg:.0f}')
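Example no. 5 iterates over the input file with a grouper helper that the snippet doesn't define. A plausible definition is the standard itertools recipe below; the loop's `[l for l in lines if l]` then drops the fillvalue padding of the final, possibly short batch.

import itertools


def grouper(iterable, n, fillvalue=None):
    """Collect items into fixed-length batches, padding the last one,
    e.g. grouper('ABCDEFG', 3) -> ('A','B','C') ('D','E','F') ('G',None,None)."""
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)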
Example no. 6
    def evaluate(self, epoch, iteration, summary):
        """
        Runs evaluation on test dataset.

        :param epoch: index of the current epoch
        :param iteration: index of the current iteration
        :param summary: if True prints summary
        """
        batch_time = AverageMeter(False)
        tot_tok_per_sec = AverageMeter(False)
        iterations = AverageMeter(False)
        enc_seq_len = AverageMeter(False)
        dec_seq_len = AverageMeter(False)
        stats = {}

        output = []

        for i, (src, indices) in enumerate(self.loader):
            translate_timer = time.time()
            src, src_length = src

            batch_size = self.loader.batch_size
            global_batch_size = batch_size * get_world_size()
            beam_size = self.beam_size

            bos = [self.insert_target_start] * (batch_size * beam_size)
            bos = torch.LongTensor(bos)
            if self.batch_first:
                bos = bos.view(-1, 1)
            else:
                bos = bos.view(1, -1)

            src_length = torch.LongTensor(src_length)
            stats['total_enc_len'] = int(src_length.sum())

            if self.cuda:
                src = src.cuda()
                src_length = src_length.cuda()
                bos = bos.cuda()

            with torch.no_grad():
                context = self.model.encode(src, src_length)
                context = [context, src_length, None]

                if beam_size == 1:
                    generator = self.generator.greedy_search
                else:
                    generator = self.generator.beam_search
                preds, lengths, counter = generator(batch_size, bos, context)

            stats['total_dec_len'] = lengths.sum().item()
            stats['iters'] = counter

            indices = torch.tensor(indices).to(preds)
            preds = preds.scatter(0,
                                  indices.unsqueeze(1).expand_as(preds), preds)

            preds = gather_predictions(preds).cpu()

            for pred in preds:
                pred = pred.tolist()
                detok = self.tokenizer.detokenize(pred)
                output.append(detok + '\n')

            elapsed = time.time() - translate_timer
            batch_time.update(elapsed, batch_size)

            total_tokens = stats['total_dec_len'] + stats['total_enc_len']
            ttps = total_tokens / elapsed
            tot_tok_per_sec.update(ttps, batch_size)

            iterations.update(stats['iters'])
            enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
            dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

            if i % self.print_freq == 0:
                log = 'TEST '
                if epoch is not None:
                    log += f'[{epoch}]'
                if iteration is not None:
                    log += f'[{iteration}]'
                log += f'[{i}/{len(self.loader)}]\t'
                log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
                log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
                logging.info(log)

        tot_tok_per_sec.reduce('sum')
        enc_seq_len.reduce('mean')
        dec_seq_len.reduce('mean')
        batch_time.reduce('mean')
        iterations.reduce('sum')

        if summary and get_rank() == 0:
            time_per_sentence = (batch_time.avg / global_batch_size)
            log = 'TEST SUMMARY:\n'
            log += f'Lines translated: {len(self.loader.dataset)}\t'
            log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
            log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
            log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
            log += f'Avg encoder seq len: {enc_seq_len.avg:.2f}\t'
            log += f'Avg decoder seq len: {dec_seq_len.avg:.2f}\t'
            log += f'Total decoder iterations: {int(iterations.sum)}'
            logging.info(log)

        return output
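The scatter call in this evaluate method undoes the length-sorting of the batch: row j of preds is written back to its original position indices[j] before the results are gathered across workers. A tiny self-contained illustration with made-up values:

import torch

preds = torch.tensor([[10], [20], [30]])  # predictions for the sorted batch
indices = torch.tensor([2, 0, 1])         # original position of each row

# out[indices[j]][k] = preds[j][k], i.e. restore the original sentence order
out = preds.scatter(0, indices.unsqueeze(1).expand_as(preds), preds)
print(out)  # tensor([[20], [30], [10]])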
Example no. 7
    def feed_data(self, data_loader, training=True):
        if training:
            assert self.optimizer is not None
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses_per_token = AverageMeter()
        losses_per_sentence = AverageMeter()

        tot_tok_time = AverageMeter()
        src_tok_time = AverageMeter()
        tgt_tok_time = AverageMeter()

        batch_size = data_loader.batch_size

        end = time.time()
        for i, (src, tgt, _) in enumerate(data_loader):
            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg
            end = time.time()

            if i % self.print_freq == 0:
                phase = 'TRAIN' if training else 'EVAL'
                log = []
                log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
                log += [f'Data {data_time.val:.3f} ({data_time.avg:.3f})']
                log += [
                    f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})'
                ]
                if self.verbose:
                    log += [
                        f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})'
                    ]
                log += [
                    f'Loss/tok {losses_per_token.val:.8f} ({losses_per_token.avg:.8f})'
                ]
                log = '\t'.join(log)
                logging.info(log)

            save_chkpt = (self.save_counter %
                          self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)

        return losses_per_token.avg
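For context, a feed_data method like the one above is usually driven by two thin wrappers on the same trainer object. The method names and bodies below are an assumption in the spirit of GNMT-style trainers, not code taken from these examples:

    def optimize(self, data_loader):
        # training epoch: gradients on, model in train mode
        torch.set_grad_enabled(True)
        self.model.train()
        return self.feed_data(data_loader, training=True)

    def evaluate(self, data_loader):
        # validation pass: gradients off, model in eval mode
        torch.set_grad_enabled(False)
        self.model.eval()
        return self.feed_data(data_loader, training=False)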
Example no. 8
    def feed_data(self, data_loader, training=True):
        if training:
            assert self.optimizer is not None
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses_per_token = AverageMeter()
        losses_per_sentence = AverageMeter()

        tot_tok_time = AverageMeter()
        src_tok_time = AverageMeter()
        tgt_tok_time = AverageMeter()

        batch_size = data_loader.batch_size
        count = 0
        end = time.time()
        for i, (src, tgt, _) in enumerate(data_loader):
            if count > self.number:
                break
            count += 1

            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg
            end = time.time()

        return losses_per_token.avg
Example no. 9
    def feed_data(self, data_loader, training=True):
        """
        Runs training or validation on batches from data_loader.

        :param data_loader: data loader
        :param training: if True runs training else runs validation
        """
        if training:
            assert self.optimizer is not None
            eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
            eval_iters = (eval_fractions * len(data_loader)).astype(int)
            eval_iters = set(eval_iters)

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses_per_token = AverageMeter()
        losses_per_sentence = AverageMeter()

        tot_tok_time = AverageMeter()
        src_tok_time = AverageMeter()
        tgt_tok_time = AverageMeter()

        batch_size = data_loader.batch_size
        layer_timestamps = []
        verbose = True

        module_whitelist = ["EmuBidirLSTM", "RecurrentAttention", "Classifier"]

        # grab a single batch to build a representative input for the summary
        src, tgt = next(iter(data_loader))
        (src, src_length) = src
        (tgt, tgt_length) = tgt
        src_length = torch.LongTensor(src_length).cuda()
        src = src.cuda()
        tgt = tgt.cuda()
        model_input = (src, src_length, tgt[:-1])
        summary = torchsummary.summary(model=self.model,
                                       module_whitelist=module_whitelist,
                                       model_input=model_input,
                                       verbose=True)

        end = time.time()
        NUM_STEPS_TO_PROFILE = 100  # profile 100 steps
        for i, (src, tgt) in enumerate(data_loader):
            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            with torchprofiler.Profiling(self.model, module_whitelist) as p:
                # do a train/evaluate iteration
                stats = self.iterate(src, tgt, training=training)
                loss_per_token, loss_per_sentence, num_toks = stats
            print(str(p))
            layer_timestamps.append(p.processed_times())

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg

            if training and i in eval_iters:
                test_bleu, _ = self.translator.run(calc_bleu=True,
                                                   epoch=self.epoch,
                                                   iteration=i)

                log = []
                log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'BLEU: {test_bleu:.2f}']
                log = '\t'.join(log)
                logging.info(log)

                self.model.train()
                self.preallocate(data_loader, training=True)

            if i % self.print_freq == 0:
                phase = 'TRAIN' if training else 'VALIDATION'
                log = []
                log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
                log += [f'Data {data_time.val:.5f} ({data_time.avg:.5f})']
                log += [
                    f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})'
                ]
                if self.verbose:
                    log += [
                        f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})'
                    ]
                    log += [
                        f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})'
                    ]
                log += [
                    f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})'
                ]
                lr = [
                    param_group['lr']
                    for param_group in self.optimizer.param_groups
                ]
                log += [f'Learning Rate {lr}']
                log = '\t'.join(log)
                logging.info(log)

            if i >= NUM_STEPS_TO_PROFILE:
                break

            save_chkpt = (self.save_counter %
                          self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)

            end = time.time()

        if verbose:
            print(
                "\n==========================================================")
            print("Layer Type    Forward Time (ms)    Backward Time (ms)")
            print("==========================================================")

        tot_accounted_time = 0.0
        per_layer_times = []
        for i in range(len(layer_timestamps[0])):
            layer_type = str(layer_timestamps[0][i][0])
            layer_forward_time_sum = 0.0
            layer_backward_time_sum = 0.0
            for j in range(len(layer_timestamps)):
                layer_forward_time_sum += (layer_timestamps[j][i][2] / 1000)
                layer_backward_time_sum += (layer_timestamps[j][i][5] / 1000)
            per_layer_times.append(
                (layer_type, layer_forward_time_sum / len(layer_timestamps),
                 layer_backward_time_sum / len(layer_timestamps)))
            if verbose:
                print(per_layer_times[-1][0], per_layer_times[-1][1],
                      per_layer_times[-1][2])
            tot_accounted_time += (per_layer_times[-1][1] +
                                   per_layer_times[-1][2])

        print("Total accounted time: %.3f ms" % tot_accounted_time)

        while len(per_layer_times) > 0:
            per_layer_time = per_layer_times.pop(0)
            for summary_i in range(len(summary)):
                summary_elem = summary[summary_i]
                if str(summary_elem['layer_name']) != str(per_layer_time[0]):
                    continue
                if 'forward_time' in summary_elem and 'backward_time' in summary_elem:
                    continue
                summary_elem['forward_time'] = per_layer_time[1]
                summary_elem['backward_time'] = per_layer_time[2]
                break

        if training:
            create_graph(self.model, module_whitelist, (src, tgt), summary,
                         os.path.join("profiles", self.arch))

        tot_tok_time.reduce('sum')
        losses_per_token.reduce('mean')

        return losses_per_token.avg, tot_tok_time.avg
Example no. 10
    def evaluate(self, epoch, iteration, eval_path, summary):
        """
        Runs evaluation on test dataset.

        :param epoch: index of the current epoch
        :param iteration: index of the current iteration
        :param eval_path: path to the file for saving results
        :param summary: if True prints summary
        """
        eval_file = open(eval_path, 'w')

        batch_time = AverageMeter(False)
        tot_tok_per_sec = AverageMeter(False)
        iterations = AverageMeter(False)
        enc_seq_len = AverageMeter(False)
        dec_seq_len = AverageMeter(False)
        total_iters = 0
        total_lines = 0
        stats = {}

        for i, (src, indices) in enumerate(self.loader):
            translate_timer = time.time()
            src, src_length = src

            if self.batch_first:
                batch_size = src.size(0)
            else:
                batch_size = src.size(1)
            total_lines += batch_size
            beam_size = self.beam_size

            bos = [self.insert_target_start] * (batch_size * beam_size)
            bos = torch.LongTensor(bos)
            if self.batch_first:
                bos = bos.view(-1, 1)
            else:
                bos = bos.view(1, -1)

            src_length = torch.LongTensor(src_length)
            stats['total_enc_len'] = int(src_length.sum())

            if self.cuda:
                src = src.cuda()
                src_length = src_length.cuda()
                bos = bos.cuda()

            with torch.no_grad():
                context = self.model.encode(src, src_length)
                context = [context, src_length, None]

                if beam_size == 1:
                    generator = self.generator.greedy_search
                else:
                    generator = self.generator.beam_search
                preds, lengths, counter = generator(batch_size, bos, context)

            preds = preds.cpu()
            lengths = lengths.cpu()
            stats['total_dec_len'] = int(lengths.sum())
            stats['iters'] = counter
            total_iters += stats['iters']

            output = []
            for idx, pred in enumerate(preds):
                end = lengths[idx] - 1
                pred = pred[1:end].tolist()
                out = self.tokenizer.detokenize(pred)
                output.append(out)

            output = [output[indices.index(i)] for i in range(len(output))]

            elapsed = time.time() - translate_timer
            batch_time.update(elapsed, batch_size)

            total_tokens = stats['total_dec_len'] + stats['total_enc_len']
            ttps = total_tokens / elapsed
            tot_tok_per_sec.update(ttps, batch_size)

            iterations.update(stats['iters'])
            enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
            dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

            if i % self.print_freq == 0:
                log = 'TEST '
                if epoch is not None:
                    log += f'[{epoch}]'
                if iteration is not None:
                    log += f'[{iteration}]'
                log += f'[{i}/{len(self.loader)}]\t'
                log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
                log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
                logging.info(log)

            for line in output:
                eval_file.write(line)
                eval_file.write('\n')

        eval_file.close()
        if summary:
            time_per_sentence = (batch_time.avg / self.loader.batch_size)
            log = 'TEST SUMMARY:\n'
            log += f'Lines translated: {total_lines}\t'
            log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
            log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
            log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
            log += f'Avg encoder seq len: {enc_seq_len.avg:.2f}\t'
            log += f'Avg decoder seq len: {dec_seq_len.avg:.2f}\t'
            log += f'Total decoder iterations: {total_iters}'
            logging.info(log)
Example no. 11
def main():
    execution_timer = time.time()

    tfiargs = tfiParser.getParser()
    args = tfiargs.parse_args()


    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        print("Use GPU: {} for training".format(args.gpu))

    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size

    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))

    model_config['batch_first'] = args.batch_first

    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']

    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.gpu is not None:
        model = model.cuda()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(src_fname=os.path.join(args.data,
                                                       config.SRC_TEST_FNAME),
                                tgt_fname=os.path.join(args.data,
                                                       config.TGT_TEST_FNAME),
                                tokenizer=tokenizer,
                                min_len=0,
                                max_len=150,
                                sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.gpu is not None)

    model.eval()
    # torch.cuda.empty_cache()

    if args.record_prefix is not None:
        record = Record('GNMTv2',
                        batch_size=args.batch_size,
                        injection=args.injection,
                        fiLayer=args.layer,
                        fiFeatures=args.fiFeats,
                        fiWeights=args.fiWeights)
    # Faulty Run
    if args.faulty:
        fi = FI(model,
                record=record,
                fiMode=args.injection,
                fiLayer=args.layer,
                fiBit=args.bit,
                fiFeatures=args.fiFeats,
                fiWeights=args.fiWeights,
                log=args.log)

        traverse_time = AverageMeter()
        start = time.time()
        fi.traverseModel(model)
        traverse_time.update(time.time() - start)

        displayConfig(args)
        fi.injectionMode = True
        print("\n Number of new layers: #%d \n" % fi.numNewLayers)

    elif args.golden:
        import distiller.modules as dist
        model = dist.convert_model_to_distiller_lstm(model)

    if args.quantize:
        overrides_yaml = """
        .*att_rnn.attn.*:
            clip_acts: NONE # Quantize without clipping
        decoder.classifier.classifier:
            clip_acts: NONE # Quantize without clipping
        """
        from distiller.utils import yaml_ordered_load
        overrides = yaml_ordered_load(overrides_yaml)  # basic quantizer definition

        stats_file = '/home/bfgoldstein/torchfi/examples/wmt16/model_stats.yaml'

        quantizer = tfi.FIPostTraLinearQuantizer(
            model,
            mode=args.quant_mode,
            bits_activations=args.quant_bacts,
            bits_parameters=args.quant_bwts,
            bits_accum=args.quant_baccum,
            per_channel_wts=args.quant_channel,
            clip_acts=args.quant_cacts,
            model_activation_stats=args.quant_stats_file,
            overrides=overrides,
            clip_n_stds=args.quant_cnstds,
            scale_approx_mult_bits=args.quant_scalebits)
        quantizer.prepare_model()
        # model = quantizer.model
        if args.faulty:
            fi.setQuantParams(args)

    print(model._modules.items())

    # Setting model to evaluation mode and cuda (if enabled) after FI traverse
    model.eval()
    if args.gpu is not None:
        model = model.cuda()

    test_file = open(args.record_prefix +
                     getRecordPrefix(args, 'fp32', faulty=args.faulty) +
                     ".tok",
                     'w',
                     encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    bleu_score = AverageMeter(False)
    score_time = AverageMeter(False)
    stats = {}

    reference_content = readReferenceFile(args)

    for batch_idx, (input, target, indices) in enumerate(test_loader):
        translate_timer = time.time()
        input_data, input_length = input

        if translator.batch_first:
            batch_size = input_data.size(0)
        else:
            batch_size = input_data.size(1)
        beam_size = args.beam_size

        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)

        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        input_length = torch.LongTensor(input_length)
        stats['total_enc_len'] = int(input_length.sum())

        if args.gpu is not None:
            input_data = input_data.cuda(args.gpu, non_blocking=True)
            input_length = input_length.cuda(args.gpu, non_blocking=True)
            bos = bos.cuda(args.gpu, non_blocking=True)

        with torch.no_grad():
            context = translator.model.encode(input_data, input_length)
            context = [context, input_length, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search

            preds, lengths, counter = generator(batch_size, bos, context)

        if args.faulty:
            fi.injectionMode = True

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        output = []
        for idx, pred in enumerate(preds):
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        output = [output[indices.index(i)] for i in range(len(output))]

        for line_idx, line in enumerate(output):
            score_timer = time.time()
            detok_sentence = detokenizeSentence(args, line)
            chunk = (batch_idx * batch_size) + line_idx
            score = scoreBleuSentence(args, detok_sentence,
                                      reference_content[chunk])
            bleu_score.update(score)
            record.addBleuScores(score)
            # Get timing
            elapsed = time.time() - score_timer
            score_time.update(elapsed)
            test_file.write(line)
            test_file.write('\n')

        # Get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)

        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)

        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if batch_idx % args.print_freq == 0:
            print('[Test {}] Time: {:.3f} ({:.3f})\t'
                  'Decoder iters {:.1f} ({:.1f})\t'
                  'Tok/s {:.0f} ({:.0f})\n'
                  'Bleu score: {:.2f} ({:.2f})\t'
                  'Bleu time: {:.3f} ({:.3f})'.format(
                      batch_idx, batch_time.val, batch_time.avg,
                      iterations.val, iterations.avg,
                      tot_tok_per_sec.val, tot_tok_per_sec.avg,
                      bleu_score.val, bleu_score.avg,
                      score_time.val, score_time.avg))

    # summary timing
    time_per_sentence = (batch_time.avg / batch_size)

    print('[Test] Summary\n'
          'Lines translated: {}\t'
          'Avg total tokens/s: {:.0f}\n'
          'Avg time per batch: {:.3f} s\t'
          'Avg time per sentence: {:.3f} ms\n'
          'Avg encoder seq len: {:.2f}\t'
          'Avg decoder seq len: {:.2f}\t'
          'Total decoder iterations: {}\n'
          'Traverse time: {:.3f} s\t'
          'Total number of injections: {}'.format(
              len(test_loader.dataset), tot_tok_per_sec.avg, batch_time.avg,
              1000 * time_per_sentence, enc_seq_len.avg, dec_seq_len.avg,
              int(iterations.sum), traverse_time.val if args.faulty else 0.0,
              int(fi.numInjections) if args.faulty else 0))

    test_file.close()

    detok = detokenizeFile(args)
    bleu = scoreBleuFile(args, detok)

    record.setBleuScoreAvg(bleu)
    saveRecord(
        args.record_prefix + getRecordPrefix(args, 'fp32', faulty=args.faulty),
        record)

    print('BLEU on test dataset: {}'.format(bleu))
    # Get timing
    execution_elapsed = time.time() - execution_timer
    print('Finished evaluation on test set in {:.2f} seconds'.format(
        execution_elapsed))