コード例 #1
0
ファイル: train.py プロジェクト: CGCL-codes/naturalcc
def single_main(args, init_distributed=False):
    assert args['dataset']['max_tokens'] is not None or args['dataset']['max_sentences'] is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # 0. Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])
    set_seed.set_seed(args['common']['seed'])
    if init_distributed:
        args['distributed_training'][
            'distributed_rank'] = distributed_utils.distributed_init(args)

    # Verify checkpoint directory
    if distributed_utils.is_master(args):
        save_dir = args['checkpoint']['save_dir']
        checkpoint_utils.verify_checkpoint_directory(save_dir)
        PathManager.rm(os.path.join(
            save_dir, '*.pt'))  # this code will remove pre-trained models

    # 1. Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # 2. Load valid dataset (we load training data below, based on the latest checkpoint)
    task.load_dataset(args['dataset']['valid_subset'], combine=False, epoch=1)

    # 3. Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    LOGGER.info(model)
    LOGGER.info('model {}, criterion {}'.format(args['model']['arch'],
                                                criterion.__class__.__name__))
    LOGGER.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # 4. Build trainer
    trainer = Trainer(args, task, model, criterion)
    LOGGER.info('training on {} GPUs'.format(
        args['distributed_training']['distributed_world_size']))
    LOGGER.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args['dataset']['max_tokens'],
            args['dataset']['max_sentences'],
        ))

    # 5. Load the latest checkpoint if one is available and restore the corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args,
                                                              trainer,
                                                              combine=False)

    # 6. Train until the learning rate gets too small
    max_epoch = args['optimization']['max_epoch'] or math.inf
    max_update = args['optimization']['max_update'] or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args['dataset']['valid_subset'].split(',')
    while (lr > args['optimization']['min_lr']
           and epoch_itr.next_epoch_idx <= max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args['dataset']['disable_validation'] and epoch_itr.epoch % args[
                'dataset']['validate_interval'] == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args['checkpoint']['save_interval'] == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        # early stop
        if should_stop_early(args, valid_losses[0]):
            LOGGER.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args['checkpoint']['patience']))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            combine=False,  # TODO to be checked
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in args['task']['data']),
        )

    train_meter.stop()
    LOGGER.info('done training in {:.1f} seconds'.format(train_meter.sum))
コード例 #2
0
def cli_main():
    SEED = 204
    BATCH_SIZE = 64
    MAX_SOURCE_POSITIONS = 1024
    EPOCH = 50

    from ncc.utils.set_seed import set_seed
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        device = os.environ.get('CUDA_VISIBALE_DEVICES', [0])[0]  # get first device as default
        torch.cuda.set_device(f'cuda:{device}')
    criterion = DeepTuneLoss(task=None, sentence_avg=-1)
    if use_cuda:
        criterion = criterion.cuda()

    data = []
    for i, platform in enumerate(LANGUAGES):
        DATA_DIR = os.path.join(DATASET_DIR, f'mapping/{platform}/data-mmap')

        def get_attr(attr):
            oracle_file = os.path.join(DATA_DIR, f'train.{attr}')
            with open(oracle_file, 'rb') as reader:
                out = pickle.load(reader)
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        #################### load dataset ####################
        src_dataset = load_mmap_dataset(os.path.join(DATA_DIR, f'train.src_tokens'))
        src_dataset = TruncateDataset(src_dataset, truncation_length=MAX_SOURCE_POSITIONS, truncate_prefix=0)
        tgt_dataset = load_mmap_dataset(os.path.join(DATA_DIR, f'train.oracle'))

        src_dict = Dictionary.load(os.path.join(DATA_DIR, 'src_tokens.dict.jsonl'))
        src_aux = OrderedDict()
        src_aux['transfer'] = get_attr('transfer')
        src_aux['wgsize'] = get_attr('wgsize')

        tgt_dict = Dictionary.load(os.path.join(DATA_DIR, 'oracle.dict.jsonl'))

        dataset = LanguagePairDataset(
            src=src_dataset, src_sizes=src_dataset.sizes, src_dict=src_dict, src_aux=src_aux,
            tgt=tgt_dataset, tgt_sizes=tgt_dataset.sizes, tgt_dict=tgt_dict, tgt_aux=None,
            left_pad_source=True, max_source_positions=MAX_SOURCE_POSITIONS,
        )
        #################### load dataset ####################

        # build toy dataset for 10-fold cross validation
        tgt_data = [tgt_dataset[idx].item() for idx in range(len(tgt_dataset))]
        src_data = [None] * len(tgt_data)

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_data, tgt_data)):
            # deeptune model
            model = DeepTuneEncoder(dictionary=src_dict, embed_dim=64,
                                    rnn_cell='lstm', rnn_hidden_dim=64, rnn_dropout=0., rnn_num_layers=2,
                                    aux_dim=2, inner_dim=32, out_dim=2)
            if use_cuda:
                model = model.cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            for epoch_i in range(EPOCH):
                if dataset.shuffle:
                    random.shuffle(train_ids)
                train_batch_sampler = data_utils.batch_by_size(
                    train_ids,
                    num_tokens_fn=lambda *args: -1,
                    max_sentences=BATCH_SIZE,
                )
                train_dataloader = DataLoader(dataset=dataset,
                                              batch_sampler=train_batch_sampler,
                                              collate_fn=collate, )
                with tqdm(total=len(train_dataloader)) as t:
                    for sample_i, sample in enumerate(train_dataloader, start=1):
                        t.set_description(f'Epoch {epoch_i + 1}/{EPOCH} Batch {sample_i}/{len(train_dataloader)}')
                        if use_cuda:
                            sample = move_to_cuda(sample)
                        loss, sample_size, logging_output = criterion(model, sample)
                        loss.div_(sample_size)
                        t.set_postfix(loss=loss.item())
                        t.update()

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

            # test accuracy
            test_batch_sampler = data_utils.batch_by_size(
                test_ids,
                num_tokens_fn=lambda *args: -1,
                max_sentences=BATCH_SIZE,
            )
            test_dataloader = DataLoader(dataset=dataset,
                                         batch_sampler=test_batch_sampler,
                                         collate_fn=collate, )
            predictions, ground_truth = [], []
            for sample in test_dataloader:
                if use_cuda:
                    sample = move_to_cuda(sample)
                hybrid_out, _ = model(**sample['net_input'])
                predictions.append(hybrid_out.max(dim=-1)[1])
                ground_truth.append(sample['target'].view(-1))
            predictions = torch.cat(predictions)
            ground_truth = torch.cat(ground_truth)

            accuracy = (predictions == ground_truth).tolist()
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [
                (runtime_cpus if pred == 0 else runtime_gpus)[idx]
                for idx, pred in zip(test_ids, predictions)
            ]
            speedup = gt_runtimes / pred_runtimes

            # record results
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                zip(benchmarks[test_ids], ground_truth, predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })
            del model, optimizer
    performance = pd.DataFrame(
        data, index=range(1, len(data) + 1), columns=[
            "Model",
            "Platform",
            "Benchmark",
            "Benchmark Suite",
            "Oracle Mapping",
            "Predicted Mapping",
            "Accuracy",
            "Speedup"
        ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)