def __init__(self, config: Config, output_encoded_layers: bool, *args,
                 **kwargs) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # Load config
        config_file = os.path.join(config.bert_cpt_dir, "bert_config.json")
        bert_config = BertConfig.from_json_file(config_file)
        print("Bert model config {}".format(bert_config))
        # Instantiate model.
        model = BertModel(bert_config)
        weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")
        # load pre-trained weights if weights_path exists
        if config.load_weights and PathManager.isfile(weights_path):
            state_dict = torch.load(weights_path)

            missing_keys: List[str] = []
            unexpected_keys: List[str] = []
            error_msgs: List[str] = []
            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, "_metadata", None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=""):
                local_metadata = ({} if metadata is None else metadata.get(
                    prefix[:-1], {}))
                module._load_from_state_dict(
                    state_dict,
                    prefix,
                    local_metadata,
                    True,  # strict
                    missing_keys,
                    unexpected_keys,
                    error_msgs,
                )
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            load(model, prefix="" if hasattr(model, "bert") else "bert.")
            if len(missing_keys) > 0:
                print(
                    "Weights of {} not initialized from pretrained model: {}".
                    format(model.__class__.__name__, missing_keys))
            if len(unexpected_keys) > 0:
                print(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys))

        self.bert = model
        self.projection = (torch.nn.Linear(model.config.hidden_size,
                                           config.projection_dim)
                           if config.projection_dim > 0 else None)
        log_class_usage(__class__)
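The recursive `load` helper above is the standard prefix-aware walk over `_load_from_state_dict`. A minimal, self-contained sketch of the same traversal on a toy module instead of BERT (the toy module and key names are illustrative assumptions, not part of the original):

import torch

# toy model standing in for BertModel
toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
state_dict = {k: v.clone() for k, v in toy.state_dict().items()}

missing, unexpected, errors = [], [], []

def load(module, prefix=""):
    # each submodule consumes only the keys under its own prefix
    module._load_from_state_dict(state_dict, prefix, {}, True,
                                 missing, unexpected, errors)
    for name, child in module._modules.items():
        if child is not None:
            load(child, prefix + name + ".")

load(toy)
print(missing, unexpected, errors)  # all empty: every key matched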
Example #2
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise the PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save the PyTorch model
    print("Saving PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
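A hedged usage sketch for the converter above, wiring it to a small CLI; the argument names mirror the function's parameters, and the entry-point block is an assumption, not part of the original script:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True)
    parser.add_argument("--bert_config_file", type=str, required=True)
    parser.add_argument("--pytorch_dump_path", type=str, required=True)
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.bert_config_file,
                                     args.pytorch_dump_path)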
Example #3
    def __init__(self,
                 bert_config: str,
                 requires_grad: bool = False,
                 dropout: float = 0.1,
                 layer_dropout: float = 0.1,
                 combine_layers: str = "mix") -> None:
        model = BertModel(BertConfig.from_json_file(bert_config))

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model,
                         layer_dropout=layer_dropout,
                         combine_layers=combine_layers)

        self.model = model
        self.dropout = dropout
        self.set_dropout(dropout)
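The constructor above calls `self.set_dropout(dropout)`, which is not shown. A minimal sketch of the assumed behavior (resetting every dropout probability in the wrapped model; this is a guess at the helper, not the library's code):

    def set_dropout(self, dropout: float) -> None:
        # assumption: walk all submodules and reset each Dropout's probability
        for module in self.model.modules():
            if isinstance(module, torch.nn.Dropout):
                module.p = dropout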
Example #4
    def convert_all_tensorflow_bert_weights_to_pytorch(self, input_folder: str) -> None:
        """
        TensorFlow-to-PyTorch weight conversion based on Hugging Face's library

        Parameters
        ----------
        input_folder: `str`, required
            The folder containing the TensorFlow checkpoint files
        """
        files = [e for e in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, e))]
        folders = [os.path.join(input_folder, e) for e in os.listdir(input_folder) if
                   os.path.isdir(os.path.join(input_folder, e))]

        # a folder counts as a TF checkpoint once at least five expected files are found
        flag = -4
        config_file = None
        for file in files:
            if file == 'vocab.txt' or \
                    file.endswith('.data-00000-of-00001') or \
                    file.endswith('.index') or \
                    file.endswith('.meta') or \
                    file.endswith('.json'):
                flag += 1
                if file.endswith('.json'):
                    config_file = file

        if flag > 0:
            assert isinstance(config_file, str), "no valid config file, but attempting to convert"
            pytorch_path = os.path.join(input_folder, 'pytorch')
            tensorflow_path = os.path.join(input_folder, 'tensorflow')

            force_folder_to_exist(pytorch_path)
            force_folder_to_exist(tensorflow_path)

            # shell-based move/copy; assumes a POSIX environment
            os.system('mv ' + os.path.join(input_folder, '*.*') + ' ' + tensorflow_path)
            os.system('cp ' + os.path.join(tensorflow_path, '*.txt') + ' ' + pytorch_path)
            os.system('cp ' + os.path.join(tensorflow_path, '*.json') + ' ' + pytorch_path)

            config = BertConfig.from_json_file(os.path.join(tensorflow_path, config_file))
            model = BertForPreTraining(config)
            load_tf_weights_in_bert(model=model, tf_checkpoint_path=os.path.join(tensorflow_path, 'bert_model.ckpt'))
            torch.save(model.state_dict(), os.path.join(pytorch_path, 'pytorch_model.bin'))

        else:
            for folder in folders:
                self.convert_all_tensorflow_bert_weights_to_pytorch(input_folder=folder)
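`force_folder_to_exist` is used above but not defined in this snippet. A one-line sketch of the assumed semantics (create the directory if it is missing):

import os

def force_folder_to_exist(path: str) -> None:
    # assumed behavior: ensure the directory exists, creating it if needed
    os.makedirs(path, exist_ok=True)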
Example #5
    def __init__(self,
                 bert_model_name,
                 pretrained_file_name,
                 label_map,
                 do_lower_case=True,
                 max_seq_length=128,
                 batch_size=32,
                 cuda_device=-1,
                 reorder=False):

        # initialize tokenizer and model here
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_model_name, do_lower_case=do_lower_case)

        pretrained_file_name = cached_path(pretrained_file_name)
        tempdir = tempfile.mkdtemp()
        with tarfile.open(pretrained_file_name, 'r:gz') as archive:
            archive.extractall(tempdir)
        serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)

        if cuda_device == -1:
            cuda_device = "cpu"
        self.cuda_device = cuda_device

        with open(os.path.join(serialization_dir,
                               CLASSIFIER_CONFIG_NAME)) as f:
            classifier_config = json.load(f)
        # Instantiate model.
        self.model = BertForMultipleSequenceClassification(
            config, **classifier_config)
        self.model.load_weights(serialization_dir, cuda_device)
        #self.model = BertForMultipleSequenceClassification.from_pretrained(pretrained_file_name)

        self.label_map = label_map
        self.max_seq_length = max_seq_length

        self.batch_size = batch_size
        self.verbose = False
        self.reorder = reorder
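One caveat in the snippet above: the extracted `tempdir` is never removed. A hedged sketch of the same extraction with explicit cleanup, assuming all loading from the directory happens before the removal:

import shutil
import tarfile
import tempfile

def extract_archive(archive_path: str) -> str:
    # caller is responsible for removing the returned directory
    tempdir = tempfile.mkdtemp()
    with tarfile.open(archive_path, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir

# usage sketch:
# serialization_dir = extract_archive(pretrained_file_name)
# ... load config and weights from serialization_dir ...
# shutil.rmtree(serialization_dir, ignore_errors=True)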
Example #6
def load_model(args, device):
    # initialize required arguments
    args.cache_dir = ''
    args.param_init = 0.0
    args.param_init_glorot = False
    args.bert_model = 'bert-base-chinese'

    # Prepare model
    model_file = os.path.join(args.model_path, WEIGHTS_NAME)
    config_file = os.path.join(args.model_path, CONFIG_NAME)
    print('[Segment] Load model...')
    config = BertConfig.from_json_file(config_file)
    model = Summarizer(args, device, load_pretrained_bert=False, bert_config=config)

    model.to(device)
    # load checkpoint
    model.load_cp(torch.load(model_file))

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model
Example #7
def model_transfer():
    model = BertForMaskedLM(
        config=BertConfig.from_json_file(args.bert_config_json))
    # print('language_model',model.state_dict()['bert.embeddings.word_embeddings.weight'])
    # print('language_model',model.state_dict()['bert.embeddings.LayerNorm.weight'])
    # print('language_model',model.state_dict()['bert.encoder.layer.0.attention.self.key.weight'])
    model = model.bert
    # print('bert_model',model.state_dict()['embeddings.word_embeddings.weight'])
    # print('bert_model',model.state_dict()['embeddings.LayerNorm.weight'])
    # print('bert_model',model.state_dict()['encoder.layer.0.attention.self.key.weight'])
    model_dict = model.state_dict()
    lm_dict = torch.load('./lm_smallBert/outputs/1.41_150000_step')
    for k, v in lm_dict.items():
        print(k, v)
    # print('lm_dict',lm_dict['bert.embeddings.word_embeddings.weight'])
    # print('lm_dict',lm_dict['bert.embeddings.LayerNorm.weight'])
    # print('lm_dict',lm_dict['bert.encoder.layer.0.attention.self.key.weight'])
    # strip the leading "bert." prefix (5 characters) so keys match BertModel's state dict
    pretrained_dict = {
        k[5:]: v
        for k, v in lm_dict.items() if k[5:] in model_dict.keys()
    }
    # print('pretrained_dict',pretrained_dict)
    model.load_state_dict(pretrained_dict)
    torch.save(model.state_dict(), '1.41_bert_weight.bin')
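The `k[5:]` slice above silently assumes every key starts with the five-character prefix "bert.". A slightly more explicit sketch of the same remapping (function and variable names are illustrative):

PREFIX = 'bert.'

def strip_prefix(state_dict, model_keys):
    # keep only keys that carry the prefix and, once stripped, exist in the model
    return {
        k[len(PREFIX):]: v
        for k, v in state_dict.items()
        if k.startswith(PREFIX) and k[len(PREFIX):] in model_keys
    }

# e.g. {'bert.embeddings.word_embeddings.weight': t} -> {'embeddings.word_embeddings.weight': t}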
Example #8
    def __init__(self, num_choices, bert_config_file, init_embeddings):
        self.num_choices = num_choices
        self.bert_config = BertConfig.from_json_file(bert_config_file)
        BertPreTrainedModel.__init__(self, self.bert_config)

        self.bert = BertModel(self.bert_config)
        self.apply(self.init_bert_weights)
        self.dropout = nn.Dropout(self.bert_config.hidden_dropout_prob)

        self.vocab_size, self.embed_size = np.shape(init_embeddings)
        self.embed = nn.Embedding.from_pretrained(
            torch.FloatTensor(init_embeddings), freeze=False)

        #self.classifier = nn.Linear(self.bert_config.hidden_size + self.embed_size, 1)
        self.classifier = nn.Linear(self.bert_config.hidden_size, 1)
        self.reshape = nn.Linear(self.bert_config.hidden_size,
                                 self.embed_size,
                                 bias=False)
        self.reshape_know = nn.Linear(self.embed_size,
                                      self.bert_config.hidden_size,
                                      bias=True)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
        self.activation = nn.Sigmoid()
Example #9
def convert(args):
    tf_checkpoint_path = args.tf_checkpoint_root
    bert_config_file = os.path.join(tf_checkpoint_path, 'bert_config.json')
    pytorch_dump_path = args.pytorch_checkpoint_path
    config = BertConfig.from_json_file(bert_config_file)
    opt = vars(args)
    opt.update(config.to_dict())
    model = SANBertNetwork(opt)
    path = os.path.join(tf_checkpoint_path, 'bert_model.ckpt')
    logger.info('Converting TensorFlow checkpoint from {}'.format(path))
    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []

    for name, shape in init_vars:
        logger.info('Loading {} with shape {}'.format(name, shape))
        array = tf.train.load_variable(path, name)
        logger.info('Numpy array shape {}'.format(array.shape))

        # new LayerNorm variable names
        # make sure you use the latest huggingface LayerNorm implementation;
        # if you still use beta/gamma, remove the renaming below
        if name.endswith('LayerNorm/beta'):
            name = name[:-14] + 'LayerNorm/bias'
        if name.endswith('LayerNorm/gamma'):
            name = name[:-15] + 'LayerNorm/weight'

        # skip optimizer state and bookkeeping variables
        skip_suffixes = ('bad_steps', 'steps', 'step', 'adam_m', 'adam_v', 'loss_scale')
        if name.endswith(skip_suffixes):
            print('skipping {}'.format(name))
            continue
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        flag = False
        if name == 'cls/squad/output_bias':
            name = 'out_proj/bias'
            flag = True
        if name == 'cls/squad/output_weights':
            name = 'out_proj/weight'
            flag = True

        logger.info('Loading {}'.format(name))
        name = name.split('/')
        if name[0] in ['predictions', 'seq_relationship', 'cls', 'output']:
            logger.info('Skipping')
            continue
        pointer = model
        for m_name in name:
            if flag: continue
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name.endswith('_embeddings'):
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        elif flag:
            # resolve the renamed out_proj parameter (the original `continue`
            # here made this branch unreachable)
            pointer = getattr(getattr(pointer, name[0]), name[1])
        try:
            assert tuple(pointer.shape) == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        pointer.data = torch.from_numpy(array)

    nstate_dict = model.state_dict()
    params = {'state': nstate_dict, 'config': config.to_dict()}
    torch.save(params, pytorch_dump_path)
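The name-to-attribute walk in the loop above is the usual trick from TF-to-PyTorch converters: split the TF variable name on '/', and treat segments like `layer_1` as an index into a module list. A minimal sketch on a toy module (the toy structure is an assumption):

import re
import torch

model = torch.nn.Module()
model.layer = torch.nn.ModuleList([torch.nn.Linear(2, 2) for _ in range(2)])

pointer = model
for m_name in 'layer_1/weight'.split('/'):
    if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
        attr, idx = re.split(r'_(\d+)', m_name)[:2]
        pointer = getattr(pointer, attr)[int(idx)]
    else:
        pointer = getattr(pointer, m_name)
print(pointer.shape)  # torch.Size([2, 2]), i.e. model.layer[1].weight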
Example #10
def getErrors():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    # note: type=bool parses any non-empty string (including "False") as True
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=10,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--batch_size",
                        default=1000,
                        type=int,
                        help="what is the batch size?")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--split",
                        default="",
                        type=str,
                        help="which split to use.")

    args = parser.parse_args()
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    if '/' in args.from_pretrained:
        timeStamp = args.from_pretrained.split('/')[1] + '-' + args.save_name
    else:
        timeStamp = args.from_pretrained + '-' + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    with open("config/" + args.bert_model + "_weight_name.json", "r") as f:
        bert_weight_name = json.load(f)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    # ["img_id"] for indexing
    val_results = json.load(
        open(
            os.path.join(
                './results/best_finetune_checkpoint-/val_result.json'), 'r'))
    true_val_items = task_datasets_val['TASK1']._entries

    assert len(val_results) == len(true_val_items)

    errors = []
    for i in tqdm(range(len(val_results))):
        if np.asarray(val_results[i]
                      ['answer']).argmax() != true_val_items[i]['target']:
            errors.append(true_val_items[i])
    print(len(errors) / len(val_results))
    with open('./results/best_finetune_checkpoint-/val_errors.pkl', 'wb') as f:
        pickle.dump(errors, f)
    return errors
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Required parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--train_ans_file",
        default=None,
        type=str,
        help="SQuAD answer for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    # note: with default=True and action='store_true' this flag is always True
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if not os.path.exists(args.output_dir):
        # raise ValueError("Output directory () already exists and is not empty.")
        os.makedirs(args.output_dir, exist_ok=True)
    # open lazily so a missing train_file does not break predict-only runs (and vice versa)
    raw_test_data = open(args.predict_file, mode='r') if args.predict_file else None
    raw_train_data = open(args.train_file, mode='r') if args.train_file else None

    import pickle as cPickle
    train_examples = None
    num_train_steps = None
    if args.do_train:

        if os.path.exists("train_file_baseline.pkl") and False:
            train_examples = cPickle.load(
                open("train_file_baseline.pkl", mode='rb'))
        else:
            ans_dict = {}
            with open(args.train_ans_file) as f:
                for line in f:
                    line = line.split(',')
                    ans_dict[line[0]] = int(line[1])
            train_examples = read_chid_examples(raw_train_data,
                                                is_training=True,
                                                ans_dict=ans_dict)
            cPickle.dump(train_examples,
                         open("train_file_baseline.pkl", mode='wb'))

        #tt = len(train_examples) // 2
        #train_examples = train_examples[:tt]

        logger.info("train examples {}".format(len(train_examples)))
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    bert_config = BertConfig.from_json_file(args.bert_config_file)
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    model = BertForCloze(bert_config, num_choices=10)
    if args.init_checkpoint is not None:
        logger.info('load bert weight')
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        # new_state_dict=state_dict.copy()
        # for kye ,value in state_dict.items():
        #     new_state_dict[kye.replace("bert","c_bert")]=value
        # state_dict=new_state_dict
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})

            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                # logger.info("name {} chile {}".format(name,child))
                if child is not None:
                    load(child, prefix + name + '.')

        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
        logger.info("missing keys:{}".format(missing_keys))
        logger.info('unexpected keys:{}'.format(unexpected_keys))
        logger.info('error msgs:{}'.format(error_msgs))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used
    # and would otherwise produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_v{1}'.format(
            str(args.max_seq_length), str(4))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except (OSError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length)

            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_labels = torch.tensor([f.label for f in train_features],
                                  dtype=torch.long)
        all_option_ids = torch.tensor([f.option_ids for f in train_features],
                                      dtype=torch.long)
        all_positions = torch.tensor([f.position for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_labels, all_option_ids,
                                   all_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.zero_grad()
            epoch_iterator = tqdm(train_dataloader, disable=None)
            for step, batch in enumerate(epoch_iterator):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does the scattering itself
                input_ids, input_mask, segment_ids, labels, option_ids, positions = batch
                loss = model(input_ids, option_ids, segment_ids, input_mask,
                             positions, labels)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % 1000 == 0:
                    logger.info("loss@{}:{}".format(step, loss.cpu().item()))

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForCloze(bert_config, num_choices=10)
    model.load_state_dict(model_state_dict)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_chid_examples(raw_test_data, is_training=False)
        # eval_examples=eval_examples[:100]
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_option_ids = torch.tensor([f.option_ids for f in eval_features],
                                      dtype=torch.long)
        all_positions = torch.tensor([f.position for f in eval_features],
                                     dtype=torch.long)
        all_tags = torch.tensor([f.tag for f in eval_features],
                                dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_option_ids,
                                  all_positions, all_tags)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = {}
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, option_ids, positions, tags in \
                tqdm(eval_dataloader, desc="Evaluating",disable=None):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            option_ids = option_ids.to(device)
            positions = positions.to(device)
            with torch.no_grad():
                batch_logits = model(input_ids, option_ids, segment_ids,
                                     input_mask, positions)
            for i, tag in enumerate(tags):
                logits = batch_logits[i].detach().cpu().numpy()
                ans = np.argmax(logits)
                all_results["#idiom%06d#" % tag] = ans

        output_prediction_file = os.path.join(args.output_dir,
                                              "prediction.csv")
        with open(output_prediction_file, "w") as f:
            for each in all_results:
                f.write(each + ',' + str(all_results[each]) + "\n")
Example #12
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    # note: type=bool parses any non-empty string (including "False") as True
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--zero_shot",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--split",
                        default="",
                        type=str,
                        help="which split to use.")
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        help="which split to use.")
    args = parser.parse_args()
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
    else:
        from vilbert.vilbert import BertConfig

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    if '/' in args.from_pretrained:
        timeStamp = args.from_pretrained.split('/')[1]
    else:
        timeStamp = args.from_pretrained

    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    with open("config/" + args.bert_model + "_weight_name.json", "r") as f:
        bert_weight_name = json.load(f)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        # changed so it runs on a single GPU
        n_gpu = 1
        # n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
                        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_val.values()])

    config.fast_mode = True
    if args.zero_shot:
        model = BertForMultiModalPreTraining.from_pretrained(
            args.from_pretrained, config)
    else:
        model = VILBertForVLTasks.from_pretrained(args.from_pretrained,
                                                  config,
                                                  num_labels=num_labels,
                                                  default_gpu=default_gpu)

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print("  Num Iters: ", task_num_iters)
    print("  Batch size: ", task_batch_size)

    model.eval()
    # when running evaluation, we run each task sequentially.
    for task_id in task_ids:
        results = []
        others = []

        score_matrix = np.zeros((1000, 1000))
        target_matrix = np.zeros((1000, 1000))
        rank_matrix = np.ones((1000)) * 1000
        count = 0

        for i, batch in enumerate(task_dataloader_val[task_id]):
            if torch.cuda.is_available():
                batch = tuple(
                    t.cuda(device=device, non_blocking=True) for t in batch)
            features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = batch

            if task_id in ['TASK3']:
                batch_size = features.size(0)
                features = features.squeeze(0)
                spatials = spatials.squeeze(0)
                image_mask = image_mask.squeeze(0)

            with torch.no_grad():
                if args.zero_shot:
                    _, _, vil_logit, _ = model(question, features, spatials,
                                               segment_ids, input_mask,
                                               image_mask)

                    score_matrix[caption_idx, image_idx * 500:(image_idx + 1) *
                                 500] = torch.softmax(
                                     vil_logit,
                                     dim=1)[:, 0].view(-1).cpu().numpy()
                    target_matrix[caption_idx,
                                  image_idx * 500:(image_idx + 1) *
                                  500] = target.view(-1).float().cpu().numpy()

                else:
                    _, vil_logit, _, _, _, _, _ = model(
                        question, features, spatials, segment_ids, input_mask,
                        image_mask)
                    score_matrix[caption_idx, image_idx * 500:(image_idx + 1) *
                                 500] = vil_logit.view(-1).cpu().numpy()
                    target_matrix[caption_idx,
                                  image_idx * 500:(image_idx + 1) *
                                  500] = target.view(-1).float().cpu().numpy()

                if image_idx.item() == 1:
                    rank = np.where(
                        (np.argsort(-score_matrix[caption_idx]) == np.where(
                            target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0]
                    rank_matrix[caption_idx] = rank

                    rank_matrix_tmp = rank_matrix[:caption_idx + 1]
                    r1 = 100.0 * np.sum(
                        rank_matrix_tmp < 1) / len(rank_matrix_tmp)
                    r5 = 100.0 * np.sum(
                        rank_matrix_tmp < 5) / len(rank_matrix_tmp)
                    r10 = 100.0 * np.sum(
                        rank_matrix_tmp < 10) / len(rank_matrix_tmp)

                    medr = np.floor(np.median(rank_matrix_tmp) + 1)
                    meanr = np.mean(rank_matrix_tmp) + 1
                    print(
                        "%d Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f"
                        % (count, r1, r5, r10, medr, meanr))

                    results.append(
                        np.argsort(-score_matrix[caption_idx]).tolist()[:20])
            count += 1

        r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix)
        r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix)
        r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix)

        medr = np.floor(np.median(rank_matrix) + 1)
        meanr = np.mean(rank_matrix) + 1

        print("************************************************")
        print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" %
              (r1, r5, r10, medr, meanr))
        print("************************************************")

        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]['val_split'])
        with open(json_path + '_result.json', 'w') as f:
            json.dump(results, f)
        with open(json_path + '_others.json', 'w') as f:
            json.dump(others, f)
Example #13
def train(args):
    # initialize device and GPU count
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    print(
        "[Train] device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # get train example
    train_examples = None
    num_train_optimization_steps = None

    print('[Train] Load data...')
    train_examples = torch.load(os.path.join(args.data_path, 'train.pt'))
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )
    print('[Train] Finish loading data...')

    # Prepare model
    print('[Train] Initial model...')
    if args.bert_config:
        model_file = os.path.join(args.bert_config, WEIGHTS_NAME)
        config_file = os.path.join(args.bert_config, CONFIG_NAME)
        print('[Train] Load model...')
        config = BertConfig.from_json_file(config_file)
        model = Summarizer(args,
                           device,
                           load_pretrained_bert=False,
                           bert_config=config)
        # load checkpoint
        model.load_cp(torch.load(model_file))
    else:
        model = Summarizer(args, device, load_pretrained_bert=True)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    # prepare training data
    train_features = convert_examples_to_features(train_examples)
    print("[Train] ***** Running training *****")
    print(f"[Train]   Num examples = {len(train_examples)}")
    print(f"[Train]   Batch size = {args.train_batch_size}")
    print(f"[Train]   Num steps = {num_train_optimization_steps}")
    train_data = get_dataset(train_features)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # prepare evaluation data
    eval_dataloader = None
    eval_path = os.path.join(args.data_path, 'dev.pt')
    if os.path.isfile(eval_path):
        print(f"[Train] ***** Prepare evaluation data from {eval_path} *****")

        eval_examples = torch.load(eval_path)
        eval_features = convert_examples_to_features(eval_examples)
        eval_data = get_dataset(eval_features)

        print(f"[Train]   Num examples = {len(eval_examples)}")
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

    global_step = 0
    tr_loss = 0

    loss_f = torch.nn.BCELoss(reduction='none')
    best_performance = 1000000
    best_epoch = 0
    output_model_file = os.path.join(args.model_path, WEIGHTS_NAME)
    output_config_file = os.path.join(args.model_path, CONFIG_NAME)
    # begin training
    for epoch_i in range(int(args.num_train_epochs)):
        print('[ Epoch', epoch_i, ']')

        # training
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            sent_idx, input_ids, input_mask, segment_ids, clss_ids, clss_mask, label_ids = batch

            sent_scores, mask = model(input_ids, segment_ids, clss_ids,
                                      input_mask, clss_mask)

            loss = loss_f(sent_scores, label_ids.float())
            loss = (loss * mask.float()).sum()
            loss = loss / loss.numel()
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # manually apply the warmup schedule BERT uses; when
                    # args.fp16 is False, BertAdam handles this internally
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        train_loss = tr_loss / nb_tr_steps
        cur_loss = train_loss

        # evaluation
        if eval_dataloader and (args.local_rank == -1
                                or torch.distributed.get_rank() == 0):
            model.eval()

            eval_loss, eval_pk, eval_windiff, eval_bound_sim = 0, 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            for sent_idx, input_ids, input_mask, segment_ids, clss_ids, clss_mask, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                clss_ids = clss_ids.to(device)
                clss_mask = clss_mask.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    sent_scores, mask = model(input_ids, segment_ids, clss_ids,
                                              input_mask, clss_mask)

                    tmp_eval_loss = loss_f(sent_scores, label_ids.float())
                    # masked mean over real sentence positions
                    tmp_eval_loss = (tmp_eval_loss *
                                     mask.float()).sum() / mask.float().sum()

                sent_scores = sent_scores.detach().cpu().numpy()
                mask = mask.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                # find selected sentences
                _pred, selected_ids = select_seg(sent_idx, eval_examples,
                                                 sent_scores, mask)
                tmp_pk, tmp_windiff, tmp_bound_sim = accuracy(
                    selected_ids, label_ids, mask)

                eval_loss += tmp_eval_loss.mean().item()
                eval_pk += tmp_pk
                eval_windiff += tmp_windiff
                eval_bound_sim += tmp_bound_sim

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_pk = eval_pk / nb_eval_examples
            eval_windiff = eval_windiff / nb_eval_examples
            eval_bound_sim = eval_bound_sim / nb_eval_examples
            cur_loss = eval_loss

        # output result of an epoch (validation metrics only exist if the
        # evaluation branch above actually ran)
        print(f'  - (Training)   loss: {train_loss: 8.5f}')
        if eval_dataloader and (args.local_rank == -1
                                or torch.distributed.get_rank() == 0):
            print(
                f'  - (Validation) loss: {eval_loss: 8.5f}, Pk: {100 * eval_pk: 3.3f}, WinDiff: {100 * eval_windiff: 3.3f}, Boundary Similarity: {100 * eval_bound_sim: 3.3f}'
            )

        # record best model
        if cur_loss < best_performance:
            best_performance = cur_loss
            best_epoch = epoch_i
            # Save a trained model and the associated configuration
            model_to_save = model.module if hasattr(
                model, 'module') else model  # only save the model itself
            torch.save(model_to_save.state_dict(), output_model_file)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())
            print("[Train] Successfully save the best model.")
Beispiel #14
0
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """
        Instantiate a BertWordEmbeddings from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.
        Params:
            pretrained_model_name_or_path: either:
                - a str with the name of a pre-trained model to load selected in the list of:
                    . `bert-base-uncased`
                    . `bert-large-uncased`
                    . `bert-base-cased`
                    . `bert-large-cased`
                    . `bert-base-multilingual-uncased`
                    . `bert-base-multilingual-cased`
                    . `bert-base-chinese`
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `model.chkpt` a TensorFlow checkpoint
            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
            *inputs, **kwargs: additional input for the specific Bert class
                (ex: num_labels for BertForSequenceClassification)
        """
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
        from_tf = kwargs.pop('from_tf', False)

        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            archive_file = pretrained_model_name_or_path
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(
                archive_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                    archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file) or from_tf:
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(config_file):
            # Backward compatibility with old naming format
            config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        emb = cls(config, *inputs, **kwargs)
        if state_dict is None and not from_tf:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(weights_path, map_location='cpu')
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        if from_tf:
            # loading directly from a TensorFlow checkpoint is not supported
            # by this embeddings-only loader
            raise RuntimeError(
                "Loading from tensorflow weights is not supported")
        # Load from a PyTorch state_dict
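        # older checkpoints use TF-style LayerNorm names ('gamma'/'beta');
        # map them to the PyTorch 'weight'/'bias' convention first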
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})
            module._load_from_state_dict(
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')
        # the prefix must end with '.' because child keys are matched as
        # prefix + parameter name (e.g. 'bert.embeddings.word_embeddings.weight')
        start_prefix = 'bert.embeddings.'
        load(emb, prefix=start_prefix)
        # only the embedding sub-module is populated here, so keys outside the
        # prefix are expected to go unused and no missing/unexpected-key
        # reporting is done
        return emb
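
# A hypothetical usage sketch for the loader above, assuming the enclosing
# class is BertWordEmbeddings as described in the docstring; the model name
# must be one of the PRETRAINED_MODEL_ARCHIVE_MAP keys or a local archive
# path, and cache_dir is optional:
emb = BertWordEmbeddings.from_pretrained('bert-base-uncased',
                                         cache_dir='/tmp/bert_cache')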
Beispiel #15
0
def main(args):
    if not os.path.exists(args.test_eval_dir):
        os.makedirs(args.test_eval_dir)
    if not os.path.exists(args.eval_dir):
        os.makedirs(args.eval_dir)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    #### print config ####
    print(args)

    #### add label ####
    label_alphabet = Alphabet('label', True)
    label_alphabet.add("O")
    label_alphabet.add("B-T")
    label_alphabet.add("I-T")
    label_alphabet.add("B-P")
    label_alphabet.add("I-P")

    # read data
    print("Loading data....")
    datasets = torch.load(args.data)
    train_set = datasets["train"]
    test_set = datasets["test"]
    train_dataloader = read_data(train_set, "train", args.batchSize)
    eval_dataloader = read_data(test_set, "test", args.batchSize)

    #### load BERT config ####
    print("Loading BERT config....")
    bert_config = BertConfig.from_json_file(args.bert_json_dir)

    #### defined model ####
    model = opinionMining(args, bert_config, label_alphabet)
    if args.mode == "test":
        assert args.test_model != ""
        model = torch.load(args.test_model)
        test_start = time.time()
        # evaluate
        RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate(
            eval_dataloader, test_set, model,
            args.test_eval_dir + "/test_output", args)
        test_finish = time.time()
        test_cost = test_finish - test_start
        print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0))
        print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
              (RP, RR, RF))
        print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
              (TP, TR, TF))
        print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
              (OP, OR, OF))
    else:
        print("Loading model from pretrained checkpoint: " +
              args.bert_checkpoint_dir)
        model = bert_load_state_dict(
            model, torch.load(args.bert_checkpoint_dir, map_location='cpu'))

        #### define optimizer ####
        num_train_steps = int(len(train_set) / args.batchSize * args.iteration)
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if "bert" in n],
            'weight_decay':
            0.01
        }, {
            'params': [p for n, p in param_optimizer if "bert" not in n],
            'lr':
            args.lr_rate,
            'weight_decay':
            0.01
        }]
        optimizer_grouped_parameters_r = [{
            'params': [p for n, p in param_optimizer if "bert" in n],
            'weight_decay':
            0.01
        }, {
            'params': [p for n, p in param_optimizer if "relation" in n],
            'lr':
            args.R_lr_rate,
            'weight_decay':
            0.01
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=2e-05,
                             warmup=0.1,
                             t_total=num_train_steps)
        optimizer_r = BertAdam(optimizer_grouped_parameters_r,
                               lr=2e-05,
                               warmup=0.1,
                               t_total=num_train_steps)

        #### train ####
        print("start training......")
        best_Score = float('-inf')
        lr = args.lr_rate
        for idx in range(args.iteration):
            epoch_start = time.time()
            temp_start = epoch_start
            print("Epoch: %s/%s" % (idx, args.iteration))

            if idx > 10:
                lr = lr * args.lr_decay
                print("decayed learning rate:", lr)
                optimizer.param_groups[1]["lr"] = lr
                optimizer_r.param_groups[1]["lr"] = lr

            sample_loss = 0
            total_loss = 0
            right_target_token = 0
            whole_target_token = 0
            right_relation_token = 0
            whole_relation_token = 0

            model.train()
            model.zero_grad()
            for step, batch in enumerate(train_dataloader):
                if args.ifgpu:
                    batch = tuple(t.cuda() for t in batch)
                all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels = batch
                max_seq_len = torch.max(torch.sum(all_input_mask, dim=1))
                all_input_ids = all_input_ids[:, :max_seq_len].contiguous()
                all_input_mask = all_input_mask[:, :max_seq_len].contiguous()
                all_segment_ids = all_segment_ids[:, :max_seq_len].contiguous()
                all_relations = all_relations[:, :max_seq_len, :
                                              max_seq_len].contiguous()
                all_labels = all_labels[:, :max_seq_len].contiguous()
                tloss, rloss, targetPredict, relationPredict = model.neg_log_likelihood_loss(
                    all_input_ids, all_segment_ids, all_labels, all_relations,
                    all_input_mask)
                # check right number
                targetRight, targetWhole = targetPredictCheck(
                    targetPredict, all_labels, all_input_mask)
                relationRight, relationWhole = relationPredictCheck(
                    relationPredict, all_relations)

                # cal right and whole label number
                right_target_token += targetRight
                whole_target_token += targetWhole
                right_relation_token += relationRight
                whole_relation_token += relationWhole
                # cal loss
                sample_loss += rloss.item() + tloss.item()
                total_loss += rloss.item() + tloss.item()
                # print train info
                if step % 20 == 0:
                    temp_time = time.time()
                    temp_cost = temp_time - temp_start
                    temp_start = temp_time
                    print(
                        "     Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f"
                        % (step * args.batchSize, temp_cost, sample_loss,
                           right_target_token, whole_target_token,
                           (right_target_token + 0.) / whole_target_token,
                           right_relation_token, whole_relation_token,
                           (right_relation_token + 0.) / whole_relation_token))
                    if sample_loss > 1e8 or sample_loss != sample_loss:  # NaN guard
                        print(
                            "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                        )
                        exit(1)
                    sys.stdout.flush()
                    sample_loss = 0

                # alternate updates: even steps take a joint step on the
                # weighted relation + target loss; odd steps update only the
                # relation optimizer with the relation loss
                if step % 2 == 0:
                    loss = 9 * rloss + tloss
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                else:
                    rloss.backward()
                    optimizer_r.step()
                    optimizer_r.zero_grad()

            temp_time = time.time()
            temp_cost = temp_time - temp_start
            print(
                "     Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f"
                % (step * args.batchSize, temp_cost, sample_loss,
                   right_target_token, whole_target_token,
                   (right_target_token + 0.) / whole_target_token,
                   right_relation_token, whole_relation_token,
                   (right_relation_token + 0.) / whole_relation_token))

            epoch_finish = time.time()
            epoch_cost = epoch_finish - epoch_start
            print(
                "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
                % (idx, epoch_cost, len(train_set) / epoch_cost, total_loss))
            print("totalloss:", total_loss)
            if total_loss > 1e8 or total_loss != total_loss:  # NaN guard
                print(
                    "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                )
                exit(1)

            # evaluate
            RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate(
                eval_dataloader, test_set, model,
                args.eval_dir + "/test_output_" + str(idx), args)
            test_finish = time.time()
            test_cost = test_finish - epoch_finish
            current_Score = RF

            print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0))
            print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
                  (RP, RR, RF))
            print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
                  (TP, TR, TF))
            print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
                  (OP, OR, OF))

            if current_Score > best_Score:
                print(
                    "Exceed previous best f score with target f: %.4f and opinion f: %.4f and relation f: %.4f"
                    % (TF, OF, RF))
                model_name = args.model_dir + "/modelFinal.model"
                print("Save current best model in file:", model_name)
                torch.save(model, model_name)
                best_Score = current_Score

            gc.collect()
Beispiel #16
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=1024,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=666,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    processors = {"aus": OOCLAUSProcessor, "ukd": OOCLUKDProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    # if args.max_seq_length > bert_config.max_position_embeddings:
    #     raise ValueError(
    #         "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
    #         args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    model = BertForDocMultiClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        # load the pre-trained weights directly into the BERT encoder
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    # match by substring: parameter names are dotted paths such as
    # 'bert.encoder.layer.0.output.dense.bias', never the bare string 'bias'
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]

    optimizer = BertAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        global long_doc_num
        logger.info("  Num examples of Long document= %d", long_doc_num)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # we have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1
                    logger.info("Current loss: %f", float(loss))

            if args.do_eval:
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                all_input_ids = torch.tensor(
                    [f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                if args.local_rank == -1:
                    eval_sampler = SequentialSampler(eval_data)
                else:
                    eval_sampler = DistributedSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                output_file_path = os.path.join(
                    args.output_dir, "results_epoch_{0}.txt".format(epoch_num))
                output_file_writer = open(output_file_path,
                                          'w',
                                          encoding='utf-8')
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids,
                                                      input_mask, label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    tmp_eval_accuracy = accuracy(logits, label_ids)

                    for logit, label_id in zip(logits, label_ids):
                        output_file_writer.write(str(logit).replace('\n', ' '))
                        output_file_writer.write('\t')
                        output_file_writer.write(
                            str(label_id).replace('\n', ' '))
                        output_file_writer.write('\n')

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = eval_accuracy / nb_eval_examples

                result = {
                    'eval_loss': eval_loss,
                    'eval_accuracy': eval_accuracy,
                    'global_step': global_step,
                    'loss': tr_loss / nb_tr_steps
                }

                output_eval_file = os.path.join(
                    args.output_dir,
                    "eval_results_epoch_{0}.txt".format(epoch_num))
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                # Save pytorch-model
                output_model_file = os.path.join(
                    args.output_dir,
                    "fine_tune_model_epoch_{0}.bin".format(epoch_num))
                torch.save(model.state_dict(), output_model_file)
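                # note: if the model is wrapped in DataParallel, state_dict()
                # keys carry a 'module.' prefix; unwrap with model.module
                # before saving if the checkpoint should load into a bare model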

                aus_label_path = os.path.join(args.data_dir,
                                              "aus_dev.label.csv")
                ukd_label_path = os.path.join(args.data_dir,
                                              "ukd_dev.label.csv")
                if task_name == 'aus':
                    label_path = aus_label_path
                else:
                    label_path = ukd_label_path
                output_file_writer.close()

                convert(output_file_path, task_name)
                evaluation_report(label_path, output_file_path + '.label',
                                  task_name.upper(),
                                  output_file_path + '.eval')
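
# The no_decay grouping above has to match parameter names by substring, not
# equality, because named_parameters() yields dotted paths. A minimal helper
# capturing the same pattern (the function name is an assumption for
# illustration):
def build_optimizer_groups(model, no_decay=('bias', 'gamma', 'beta'), wd=0.01):
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [{'params': decay_params, 'weight_decay_rate': wd},
            {'params': no_decay_params, 'weight_decay_rate': 0.0}]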
Beispiel #17
0
def train(train_features_tensor_dataset,
          dev_features_tensor_dataset,
          test_features_tensor_dataset=None,
          data_mode='mongo',
          demo=False,
          use_w2v=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.batch_size = int(args.batch_size)

    # "if '<non-empty string>':" is always true; the author uses these blocks
    # purely as section markers
    if 'set random seed':
        seed_everything(args.seed)
        if args.n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    logger.info("train examples {}".format(len(train_features_tensor_dataset)))
    num_train_steps = int(
        len(train_features_tensor_dataset) / args.batch_size *
        args.num_train_epochs)
    steps_a_epoch = int(num_train_steps // args.num_train_epochs)

    if 'create train TensorDataset':
        train_sampler = RandomSampler(train_features_tensor_dataset)
        train_dataloader = DataLoader(train_features_tensor_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.batch_size,
                                      drop_last=True)

    if 'create dev TensorDataset':
        dev_sampler = SequentialSampler(dev_features_tensor_dataset)
        dev_dataloader = DataLoader(dev_features_tensor_dataset,
                                    sampler=dev_sampler,
                                    batch_size=args.batch_size,
                                    drop_last=False)

    test_dataloader = None
    if test_features_tensor_dataset is not None:
        test_sampler = SequentialSampler(test_features_tensor_dataset)
        test_dataloader = DataLoader(test_features_tensor_dataset,
                                     sampler=test_sampler,
                                     batch_size=args.batch_size,
                                     drop_last=False)

    log = Log(args, running_time, baseline, benchmark)
    if 'create model':
        bert_config = BertConfig.from_json_file(args.bert_config_file)
        weight = ''
        if use_w2v:
            weight = load_pkl_data(args.w2v_weight)
        if baseline == 'baseline1':
            model = Baseline1(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'baseline2':
            model = Baseline2(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'baseline3':
            model = Baseline3(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'baseline4':
            model = Baseline4(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'baseline5':
            model = Baseline5(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'baseline6':
            model = Baseline6(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'MSCNN':
            model = MSCNN(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'MSAIN':
            model = MSAIN(args, use_w2v=use_w2v, weight=weight)
        elif baseline == 'MLCCNN':
            model = MLCCNN(args, use_w2v=use_w2v, weight=weight)
        else:
            raise ValueError('unknown baseline model: %s' % baseline)

        model.to(device)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer],
            'weight_decay':
            0.0
        }]
        optimizer = torch.optim.Adagrad(optimizer_grouped_parameters,
                                        lr=args.learning_rate)

        # DataParallel training
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

    global_step = 0
    best_top1 = 0
    for curEpoch in range(int(args.num_train_epochs)):
        model.train()
        with tqdm(total=steps_a_epoch) as pbar:
            for step, batch in enumerate(train_dataloader):
                if data_mode == 'mongo':
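                    # the TensorDataset holds only integer example indices;
                    # the actual features live in MongoDB and are fetched per
                    # batch with an $in query, then tensorized on the fly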
                    indexes = batch[0].sort().values.tolist()
                    batch_data = [
                        item for item in collection.find(
                            {'index': {
                                '$in': indexes
                            }})
                    ]
                    if use_w2v:
                        q_input_ids_w2v = torch.tensor(
                            [item['q_w2v_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        p_a_input_ids_w2v = torch.tensor(
                            [item['p_a_w2v_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        n_a_input_ids_w2v = torch.tensor(
                            [item['n_a_w2v_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        q_input_mask_tensor = torch.tensor(
                            [item['q_input_mask'] for item in batch_data],
                            dtype=torch.uint8).to(device)
                        p_a_input_mask_tensor = torch.tensor(
                            [item['p_a_input_mask'] for item in batch_data],
                            dtype=torch.uint8).to(device)
                        n_a_input_mask_tensor = torch.tensor(
                            [item['n_a_input_mask'] for item in batch_data],
                            dtype=torch.uint8).to(device)
                    else:
                        q_input_ids_tensor = torch.tensor(
                            [item['q_input_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        q_input_mask_tensor = torch.tensor(
                            [item['q_input_mask'] for item in batch_data],
                            dtype=torch.long).to(device)
                        q_segment_ids_tensor = torch.tensor(
                            [item['q_segment_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        p_a_input_ids_tensor = torch.tensor(
                            [item['p_a_input_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        p_a_input_mask_tensor = torch.tensor(
                            [item['p_a_input_mask'] for item in batch_data],
                            dtype=torch.long).to(device)
                        p_a_segment_ids_tensor = torch.tensor(
                            [item['p_a_segment_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        n_a_input_ids_tensor = torch.tensor(
                            [item['n_a_input_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                        n_a_input_mask_tensor = torch.tensor(
                            [item['n_a_input_mask'] for item in batch_data],
                            dtype=torch.long).to(device)
                        n_a_segment_ids_tensor = torch.tensor(
                            [item['n_a_segment_ids'] for item in batch_data],
                            dtype=torch.long).to(device)
                elif data_mode == 'local':
                    batch = tuple(t.to(device) for t in batch)
                    question_id, pos_ans_id, neg_ans_id, \
                    q_input_ids, q_input_mask, q_segment_ids, \
                    p_a_input_ids, p_a_input_mask, p_a_segment_ids, \
                    n_a_input_ids, n_a_input_mask, n_a_segment_ids = batch

                if use_w2v:
                    loss, pre_pos_sim = model(
                        q_input_ids_w2v=q_input_ids_w2v,
                        p_a_input_ids_w2v=p_a_input_ids_w2v,
                        n_a_input_ids_w2v=n_a_input_ids_w2v,
                        q_input_mask=q_input_mask_tensor,
                        p_a_input_mask=p_a_input_mask_tensor,
                        n_a_input_mask=n_a_input_mask_tensor,
                    )
                else:
                    loss, pre_pos_sim = model(
                        q_input_ids=q_input_ids_tensor,
                        q_input_mask=q_input_mask_tensor,
                        q_segment_ids=q_segment_ids_tensor,
                        p_a_input_ids=p_a_input_ids_tensor,
                        p_a_input_mask=p_a_input_mask_tensor,
                        p_a_segment_ids=p_a_segment_ids_tensor,
                        n_a_input_ids=n_a_input_ids_tensor,
                        n_a_input_mask=n_a_input_mask_tensor,
                        n_a_segment_ids=n_a_segment_ids_tensor)
                if args.n_gpu > 1:
                    loss = loss.mean()
                loss.backward()
                if 'clip grad norm':
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                #lr_scheduler.step()   # initial_lr = 0
                optimizer.step()
                optimizer.zero_grad()

                pbar.set_description(
                    f'{baseline}, benchmark-{benchmark}, train loss: {loss:.6f}'
                )
                log.print(
                    f'{baseline}, benchmark-{benchmark}, step: {global_step}, train loss: {loss:.6f}'
                )
                pbar.update(1)
                global_step += 1

                if step % args.dev_step == 0 and step != 0:
                    top1 = dev(args,
                               model,
                               dev_dataloader,
                               device,
                               demo=False,
                               use_w2v=use_w2v)
                    if top1 > best_top1:
                        best_top1 = top1
                        save_model(model, global_step, args, logging, log,
                                   running_time, benchmark, baseline)
                    logging.info(
                        f'{baseline}, benchmark-{benchmark}, epoch:{curEpoch}, global_step:{global_step}, dev top1:{top1:.2%}, best top1: {best_top1:.2%}'
                    )
                    log.print(
                        f'{baseline}, benchmark-{benchmark}, epoch:{curEpoch}, global_step:{global_step}, dev top1:{top1:.2%}, best top1: {best_top1:.2%}'
                    )

                    model.train()

        top1 = dev(args,
                   model,
                   dev_dataloader,
                   device,
                   demo=False,
                   use_w2v=use_w2v)
        if top1 > best_top1:
            best_top1 = top1
            save_model(model, global_step, args, logging, log, running_time,
                       benchmark, baseline)
        logging.info(
            f'{baseline}, benchmark-{benchmark}, epoch:{curEpoch}, global_step:{global_step}, dev top1:{top1:.2%}, best top1: {best_top1:.2%}'
        )
        log.print(
            f'{baseline}, benchmark-{benchmark}, epoch:{curEpoch}, global_step:{global_step}, dev top1:{top1:.2%}, best top1: {best_top1:.2%}'
        )

    if test_dataloader is not None:
        test_top1 = test(args,
                         model,
                         test_dataloader,
                         device,
                         running_time,
                         baseline,
                         benchmark,
                         demo=False,
                         use_w2v=use_w2v)
        logging.info(
            f'{baseline}, benchmark-{benchmark}, best test top1:{test_top1:.2%}'
        )
        log.print(
            f'{baseline}, benchmark-{benchmark}, best test top1:{test_top1:.2%}'
        )
Beispiel #18
0
def sum_file(args):
    # initial device and gpu number
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    model_file = os.path.join(args.model_path, WEIGHTS_NAME)
    config_file = os.path.join(args.model_path, CONFIG_NAME)
    # Prepare model
    print('[Test] Load model...')
    config = BertConfig.from_json_file(config_file)
    model = Summarizer(args,
                       device,
                       load_pretrained_bert=False,
                       bert_config=config)
    model.to(device)
    # load check points
    model.load_cp(torch.load(model_file))
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare testing data
    eval_dataloader = None
    logger.info(f"***** Prepare data from {args.input_data} *****")

    eval_examples = get_data(args, args.input_data)
    eval_features = convert_examples_to_features(eval_examples)
    logger.info("  Num documents = %d", len(eval_examples))
    logger.info("  Num segments = %d", len(eval_features))

    # Run prediction for full data
    eval_data = get_dataset(eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    # testing
    preds = [[] for _ in range(len(eval_examples))]
    if eval_dataloader:
        model.eval()
        for docu_idx, seg_idx, input_ids, input_mask, segment_ids, clss_ids, clss_mask in tqdm(
                eval_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            clss_ids = clss_ids.to(device)
            clss_mask = clss_mask.to(device)

            with torch.no_grad():
                sent_scores, mask = model(input_ids, segment_ids, clss_ids,
                                          input_mask, clss_mask)

            sent_scores = sent_scores.detach().cpu().numpy()
            mask = mask.detach().cpu().numpy()
            # find selected sentences
            result = select_sent(docu_idx, seg_idx, eval_examples, sent_scores,
                                 mask)
            for i, pred in result:
                preds[i].append(pred)
    preds = ['<sep>'.join(pred) for pred in preds]

    # output results
    path_dir = args.result_path
    os.makedirs(path_dir, exist_ok=True)
    with open(os.path.join(path_dir, 'preds.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(preds))
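
# Each document can span several BERT segments, so predictions are gathered
# per document index and then joined with '<sep>'. A toy illustration of that
# merge step, assuming (document_index, predicted_sentence) pairs as produced
# by select_sent:
doc_preds = [[] for _ in range(3)]
for doc_idx, sentence in [(0, 'a'), (2, 'b'), (0, 'c')]:
    doc_preds[doc_idx].append(sentence)
merged = ['<sep>'.join(p) for p in doc_preds]  # ['a<sep>c', '', 'b']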
Beispiel #19
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--features_h5path",
        default="/coc/pskynet2/jlu347/multi-modal-bert/data/flick30k/flickr30k.h5",
    )

    # Required parameters
    parser.add_argument(
        "--val_file",
        default="data/flick30k/all_data_final_test_set0_2014.jsonline",
        type=str,
        help="The input train corpus.",
    )

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--pretrained_weight",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--output_dir",
        default="result",
        type=str,
        # required=True,
        help="The output directory where the model checkpoints will be written.",
    )

    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        # required=True,
        help="The config file which specified the model details.",
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=30,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )

    parser.add_argument(
        "--train_batch_size",
        default=128,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=50,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.01,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )

    parser.add_argument(
        "--seed", type=int, default=42, help="random seed for initialization"
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=1,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--from_pretrained",
        action="store_true",
        help="Wheter the tensor is from pretrained.",
    )
    parser.add_argument(
        "--save_name", default="", type=str, help="save name for training."
    )
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="Wheter to use the baseline model (single bert).",
    )

    parser.add_argument(
        "--zero_shot", action="store_true", help="Wheter directly evaluate."
    )

    args = parser.parse_args()

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from multimodal_bert.bert import MultiModalBertForImageCaptionRetrieval
        from multimodal_bert.bert import BertForMultiModalPreTraining
    else:
        from multimodal_bert.multi_modal_bert import (
            MultiModalBertForImageCaptionRetrieval,
            BertConfig,
        )
        from multimodal_bert.multi_modal_bert import BertForMultiModalPreTraining

    print(args)
    if args.save_name != "":
        timeStamp = args.save_name
    else:
        timeStamp = strftime("%d-%b-%y-%X-%a", gmtime())
        timeStamp += "_{:0>6d}".format(random.randint(0, int(10e6)))

    savePath = os.path.join(args.output_dir, timeStamp)

    if not os.path.exists(savePath):
        os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    # save all the hidden parameters.
    with open(os.path.join(savePath, "command.txt"), "w") as f:
        print(args, file=f)  # Python 3.x
        print("\n", file=f)
        print(config, file=f)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        )
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16
        )
    )

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps
            )
        )

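    # Shrink the per-step batch so that train_batch_size examples in total
    # contribute to each optimizer update when accumulating gradients.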
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # train_examples = None
    num_train_optimization_steps = None

    print("Loading Train Dataset", args.val_file)

    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case
    )
    image_features_reader = ImageFeaturesH5Reader(args.features_h5path, True)
    eval_dset = COCORetreivalDatasetVal(args.val_file, image_features_reader, tokenizer)

    config.fast_mode = True
    if args.from_pretrained:
        if args.zero_shot:
            model = BertForMultiModalPreTraining.from_pretrained(
                args.pretrained_weight, config
            )
        else:
            model = MultiModalBertForImageCaptionRetrieval.from_pretrained(
                args.pretrained_weight, config, dropout_prob=0.1
            )
    else:
        if args.zero_shot:
            model = BertForMultiModalPreTraining.from_pretrained(
                args.bert_model, config
            )
        else:
            model = MultiModalBertForImageCaptionRetrieval.from_pretrained(
                args.bert_model, config, dropout_prob=0.1
            )

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.cuda()
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(eval_dset))
    logger.info("  Batch size = %d", args.train_batch_size)

    eval_dataloader = DataLoader(
        eval_dset,
        shuffle=False,
        batch_size=1,
        num_workers=args.num_workers,
        pin_memory=False,
    )

    startIterID = 0
    global_step = 0
    masked_loss_v_tmp = 0
    masked_loss_t_tmp = 0
    next_sentence_loss_tmp = 0
    loss_tmp = 0

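    # Recall@1/5/10 plus the median and mean rank of the ground-truth match.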
    r1, r5, r10, medr, meanr = evaluate(args, model, eval_dataloader)
    print("finish evaluation, save result to %s")

    val_name = args.val_file.split("/")[-1]
    with open(os.path.join(savePath, val_name + "_result.txt"), "w") as f:
        print(
            "r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f"
            % (r1, r5, r10, medr, meanr),
            file=f,
        )
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    # Data files for VQA task.
    parser.add_argument("--features_h5path", default="data/coco/test2015.h5")
    parser.add_argument(
        "--train_file",
        default="data/VQA/training",
        type=str,
        # required=True,
        help="The input train corpus.",
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--pretrained_weight",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        # required=True,
        help=
        "The output directory where the model checkpoints will be written.",
    )

    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        # required=True,
        help="The config file which specified the model details.",
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=30,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )

    parser.add_argument("--use_location",
                        action="store_true",
                        help="whether use location.")
    parser.add_argument(
        "--train_batch_size",
        default=128,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=30,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.01,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )

    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=20,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--from_pretrained",
        action="store_true",
        help="Wheter the tensor is from pretrained.",
    )
    parser.add_argument("--save_name",
                        default="",
                        type=str,
                        help="save name for training.")
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="Wheter to use the baseline model (single bert).",
    )
    parser.add_argument("--split",
                        default="test",
                        type=str,
                        help="train or trainval.")

    parser.add_argument(
        "--use_chunk",
        default=0,
        type=float,
        help="whether use chunck for parallel training.",
    )
    args = parser.parse_args()

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from multimodal_bert.bert import MultiModalBertForVQA
    else:
        from multimodal_bert.multi_modal_bert import MultiModalBertForVQA, BertConfig

    print(args)
    if args.save_name != "":
        timeStamp = args.save_name
    else:
        timeStamp = strftime("%d-%b-%y-%X-%a", gmtime())
        timeStamp += "_{:0>6d}".format(random.randint(0, 10e6))

    savePath = os.path.join(args.output_dir, timeStamp)

    if not os.path.exists(savePath):
        os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    # save all the hidden parameters.
    with open(os.path.join(savePath, "command.txt"), "w") as f:
        print(args, file=f)  # Python 3.x
        print("\n", file=f)
        print(config, file=f)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # train_examples = None
    num_train_optimization_steps = None

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    image_features_reader = ImageFeaturesH5Reader(args.features_h5path, True)

    if args.split == "minval":
        eval_dset = VQAClassificationDataset("minval",
                                             image_features_reader,
                                             tokenizer,
                                             dataroot="data/VQA")
    elif args.split == "test":
        eval_dset = VQAClassificationDataset("test",
                                             image_features_reader,
                                             tokenizer,
                                             dataroot="data/VQA")
    elif args.split == "val":
        eval_dset = VQAClassificationDataset("val",
                                             image_features_reader,
                                             tokenizer,
                                             dataroot="data/VQA")
    elif args.split == "test-dev":
        eval_dset = VQAClassificationDataset("test-dev",
                                             image_features_reader,
                                             tokenizer,
                                             dataroot="data/VQA")

    num_labels = eval_dset.num_ans_candidates
    if args.from_pretrained:
        model = MultiModalBertForVQA.from_pretrained(args.pretrained_weight,
                                                     config,
                                                     num_labels=num_labels)
    else:
        model = MultiModalBertForVQA.from_pretrained(args.bert_model,
                                                     config,
                                                     num_labels=num_labels)

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = DataParallel(model, use_chuncks=args.use_chunk)

    model.cuda()

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dset))
    logger.info("  Batch size = %d", args.train_batch_size)

    eval_dataloader = DataLoader(
        eval_dset,
        shuffle=False,
        batch_size=args.train_batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    startIterID = 0
    global_step = 0
    masked_loss_v_tmp = 0
    masked_loss_t_tmp = 0
    next_sentence_loss_tmp = 0
    loss_tmp = 0
    start_t = timer()

    model.train(False)
    eval_score, bound = evaluate(args, model, eval_dataloader)
    logger.info("\teval score: %.2f (%.2f)" % (100 * eval_score, 100 * bound))
Beispiel #21
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--use_chunk",
                        default=0,
                        type=float,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--optimizer",
                        default='BertAdam',
                        type=str,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.")
    parser.add_argument("--vision_scratch",
                        action="store_true",
                        help="whether pre-trained the image or not.")
    parser.add_argument("--evaluation_interval",
                        default=1,
                        type=int,
                        help="evaluate very n epoch.")
    parser.add_argument("--lr_scheduler",
                        default='manual',
                        type=str,
                        help="learning rate scheduler to use: 'automatic' or 'manual'.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--compact",
                        action="store_true",
                        help="whether use compact vilbert model.")
    parser.add_argument("--debug",
                        action="store_true",
                        help="whether in debug mode.")
    parser.add_argument(
        "--tensorboard_dir",
        default="tensorboard_log",
        type=str,
        help="The output directory where tensorboard log will be written.",
    )
    parser.add_argument(
        "--batch_size",
        default=-1,
        type=int,
        help="Custom Batch size for task.",
    )
    parser.add_argument(
        "--data_root",
        default="",
        type=str,
        help="The data root of the task.",
    )
    args = parser.parse_args()
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    # random.seed(args.seed)
    # np.random.seed(args.seed)
    # torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    elif args.compact:
        from vilbert.vilbert_compact import BertConfig
        from vilbert.vilbert_compact import VILBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    task_lr = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)
        task_lr.append(task_cfg[task]['lr'])

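    # The optimizer runs at the smallest task LR (base_lr); scaling each
    # task's loss by task_lr / base_lr effectively gives per-task LRs.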
    base_lr = min(task_lr)
    loss_scale = {}
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        loss_scale[task] = task_lr[i] / base_lr

    if args.save_name:
        prefix = '-' + args.save_name
    else:
        prefix = ''
    timeStamp = '-'.join(task_names) + '_' + args.config_file.split(
        '/')[1].split('.')[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)
    logPath = os.path.join(args.tensorboard_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    if args.batch_size != -1:
        for i, task_id in enumerate(args.tasks.split('-')):
            task = 'TASK' + task_id
            task_cfg[task]['batch_size'] = args.batch_size

    if args.data_root != "":
        for i, task_id in enumerate(args.tasks.split('-')):
            data_root = args.data_root
            task = 'TASK' + task_id
            task_cfg[task]['dataroot'] = data_root
            task_cfg[task]['features_h5path1'] = os.path.join(
                data_root, task_cfg[task]['features_h5path1'].split('/')[-1])
            task_cfg[task]['features_h5path2'] = os.path.join(
                data_root, task_cfg[task]['features_h5path2'].split('/')[-1])
            task_cfg[task]['train_annotations_jsonpath'] = os.path.join(
                data_root,
                task_cfg[task]['train_annotations_jsonpath'].split('/')[-1])
            task_cfg[task]['val_annotations_jsonpath'] = os.path.join(
                data_root,
                task_cfg[task]['val_annotations_jsonpath'].split('/')[-1])

    # Debug subsets currently exist for the VCR dataset only; a train_100.jsonl
    # needs to be created for the other datasets as well.
    if args.debug:
        for i, task_id in enumerate(args.tasks.split('-')):
            task = 'TASK' + task_id
            task_cfg[task]['train_annotations_jsonpath'] = '/'.join(
                task_cfg[task]['train_annotations_jsonpath'].split('/')[:-1] +
                ['train_100.jsonl'])
            task_cfg[task]['val_annotations_jsonpath'] = '/'.join(
                task_cfg[task]['val_annotations_jsonpath'].split('/')[:-1] +
                ['val_100.jsonl'])
            task_cfg[task]['batch_size'] = 2

    # args.debug is only handled by the VCR dataset (vcr_dataset.py); it still
    # needs to be added to the other datasets.
    task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, \
            task_dataloader_train, task_dataloader_val = LoadDatasets(args, task_cfg, args.tasks.split('-'), args.debug)

    tbLogger = utils.tbLogger(logPath, savePath, task_names, task_ids,
                              task_num_iters, args.gradient_accumulation_steps)

    # if n_gpu > 0:
    # torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    num_train_optimization_steps = max(task_num_iters.values(
    )) * args.num_train_epochs // args.gradient_accumulation_steps
    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_train.values()])

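    # Stagger the tasks: each task presumably starts late enough that its
    # configured num_epoch of iterations ends exactly at the final step.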
    task_start_iter = {}
    task_interval = {}
    for task_id, num_iter in task_num_iters.items():
        task_start_iter[task_id] = num_train_optimization_steps - (
            task_cfg[task_id]['num_epoch'] * num_iter //
            args.gradient_accumulation_steps)
        task_interval[task_id] = num_train_optimization_steps // (
            task_cfg[task_id]['num_epoch'] * num_iter //
            args.gradient_accumulation_steps)

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(args.from_pretrained,
                                                   config,
                                                   num_labels=num_labels,
                                                   default_gpu=default_gpu)
    else:
        model = VILBertForVLTasks.from_pretrained(args.from_pretrained,
                                                  config,
                                                  num_labels=num_labels,
                                                  default_gpu=default_gpu)

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                layer_num = name.split('.')[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

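    # Build per-parameter optimizer groups: the task head ('vil_prediction')
    # gets a fixed 1e-4 LR; with --vision_scratch, weights outside the
    # pretrained BERT list also get 1e-4; no_decay names skip weight decay.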
    optimizer_grouped_parameters = []
    lr = args.learning_rate
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if 'vil_prediction' in key:
                # if args.learning_rate <= 2e-5:
                lr = 1e-4
            else:
                if args.vision_scratch:
                    if key[12:] in bert_weight_name:
                        lr = args.learning_rate
                    else:
                        lr = 1e-4
                else:
                    lr = args.learning_rate
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.0
                }]
            else:
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.01
                }]

    if default_gpu:
        print(len(list(model.named_parameters())),
              len(optimizer_grouped_parameters))

    max_num_iter = max(task_num_iters.values())
    max_batch_size = max(task_batch_size.values())

    if args.optimizer == 'BertAdam':
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps,
            schedule='warmup_constant',
        )
    elif args.optimizer == 'Adam':
        optimizer = Adam(
            optimizer_grouped_parameters,
            lr=base_lr,
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps,
            schedule='warmup_constant',
        )
    elif args.optimizer == 'Adamax':
        optimizer = Adamax(
            optimizer_grouped_parameters,
            lr=base_lr,
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps,
            schedule='warmup_constant',
        )

    if args.lr_scheduler == 'automatic':
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode='max',
                                         factor=0.2,
                                         patience=1,
                                         cooldown=1,
                                         threshold=0.001)
    elif args.lr_scheduler == 'manual':
        lr_reduce_list = np.array([12, 16])

        # lr_reduce_list = np.array([6, 8, 10])
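        # Multiply the LR by 0.1 at each epoch milestone in lr_reduce_list.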
        def lr_lambda_fun(epoch):
            return pow(0.1, np.sum(lr_reduce_list <= epoch))

        lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun)

    if default_gpu:
        print("***** Running training *****")
        print("  Num Iters: ", task_num_iters)
        print("  Batch size: ", task_batch_size)
        print("  Num steps: %d" % num_train_optimization_steps)

    startIterID = 0
    # initialize the data iteration.
    task_iter_train = {name: None for name in task_ids}
    task_count = {name: 0 for name in task_ids}
    for epochId in tqdm(range(args.num_train_epochs), desc="Epoch"):
        model.train()
        for step in range(max_num_iter):
            iterId = startIterID + step + (epochId * max_num_iter)
            for task_id in task_ids:
                if iterId >= task_start_iter[task_id]:
                    # if iterId % task_interval[task_id] == 0:
                    loss, score = ForwardModelsTrain(args, task_cfg, device,
                                                     task_id, task_count,
                                                     task_iter_train,
                                                     task_dataloader_train,
                                                     model, task_losses,
                                                     task_start_iter)
                    loss = loss * loss_scale[task_id]
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()
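                    # Step the optimizer only once every
                    # gradient_accumulation_steps micro-batches.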
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        model.zero_grad()

                        if default_gpu:
                            tbLogger.step_train(epochId, iterId, float(loss),
                                                float(score),
                                                optimizer.show_lr(), task_id,
                                                'train')

            if step % (20 * args.gradient_accumulation_steps
                       ) == 0 and step != 0 and default_gpu:
                tbLogger.showLossTrain()

        model.eval()
        # During evaluation, run each task sequentially.
        for task_id in task_ids:
            for i, batch in enumerate(task_dataloader_val[task_id]):
                loss, score, batch_size = ForwardModelsVal(
                    args, task_cfg, device, task_id, batch, model, task_losses)
                tbLogger.step_val(epochId, float(loss), float(score), task_id,
                                  batch_size, 'val')
                if default_gpu:
                    sys.stdout.write('%d/%d\r' %
                                     (i, len(task_dataloader_val[task_id])))
                    sys.stdout.flush()

        ave_score = tbLogger.showLossVal()
        if args.lr_scheduler == 'automatic':
            lr_scheduler.step(ave_score)
            logger.info("best average score is %3f" % lr_scheduler.best)
        else:
            lr_scheduler.step()

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model on " + logPath +
                        "** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model itself

            if not os.path.exists(savePath):
                os.makedirs(savePath)
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)

    tbLogger.txt_close()
Beispiel #22
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='',
                        help="Checkpoint directory used for prediction.")
    parser.add_argument('--checkpoint_filename',
                        type=str,
                        default='',
                        help="Checkpoint binary filename used for prediction.")
    parser.add_argument('--output_result_file',
                        type=str,
                        default='',
                        help="Result files will be save in this directory.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
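    # Rank 0 has finished downloading; release the processes waiting above.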
    if args.local_rank == 0:
        torch.distributed.barrier()

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()
        # Prepare data loader
        logger.info("***** Preparing Data *****")
        train_examples = read_thai_qa_examples(input_file=args.train_file,
                                               is_training=True)
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_cache'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length))
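        # Reuse cached features when present; otherwise build and cache them.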
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except (OSError, EOFError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
        # if args.local_rank != -1:
        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

        logger.info("***** Finished Preparing Data *****")

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove the pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
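            # loss_scale == 0 selects dynamic loss scaling; a positive value
            # is used as a fixed static scale.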
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        lowest_training_loss = float('inf')
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()

                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr',
                                             optimizer.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

                # Save the best model aka lowest training loss in this case
                loss_scalar = loss.item()
                if loss_scalar < lowest_training_loss:
                    lowest_training_loss = loss_scalar
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     f"best_weight.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

            # Save the model every epoch
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             f"epoch_{epoch}.bin")
            torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.bert_model)

    model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):

        # Load checkpoint
        logger.info(
            f'Loading checkpoint from {args.checkpoint_dir}/{args.checkpoint_filename}'
        )
        config = BertConfig.from_json_file(
            f'{args.checkpoint_dir}/config.json')
        model = BertForQuestionAnswering(config)
        state_dict = torch.load(
            f'{args.checkpoint_dir}/{args.checkpoint_filename}')
        model.load_state_dict(state_dict)
        tokenizer = BertTokenizer(f'{args.checkpoint_dir}/vocab.txt',
                                  do_lower_case=args.do_lower_case)
        model.to(device)

        eval_examples = read_thai_qa_examples(input_file=args.predict_file,
                                              is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
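        # Index each feature so per-batch logits can be mapped back to it.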
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader,
                desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            # if len(all_results) % 1000 == 0:
            #     logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        # output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        # output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, args.output_result_file,
                          args.verbose_logging, args.null_score_diff_threshold,
                          tokenizer)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
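    # note: argparse's type=bool treats any non-empty string (including
    # "False") as True, so this flag cannot actually be disabled from the
    # command line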
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--use_chunk",
                        default=0,
                        type=float,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--optimizer",
                        default='BertAdam',
                        type=str,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.")
    parser.add_argument("--vision_scratch",
                        action="store_true",
                        help="whether pre-trained the image or not.")
    parser.add_argument("--evaluation_interval",
                        default=1,
                        type=int,
                        help="evaluate very n epoch.")
    parser.add_argument("--lr_scheduler",
                        default='mannul',
                        type=str,
                        help="whether use learning rate scheduler.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--compact",
                        action="store_true",
                        help="whether use compact vilbert model.")
    parser.add_argument("--debug",
                        action="store_true",
                        help="whether in debug mode.")
    parser.add_argument(
        "--tensorboard_dir",
        default="tensorboard_log",
        type=str,
        help="The output directory where tensorboard log will be written.",
    )
    parser.add_argument(
        "--batch_size",
        default=-1,
        type=int,
        help="Custom Batch size for task.",
    )
    parser.add_argument(
        "--data_root",
        default="",
        type=str,
        help="The data root of the task.",
    )
    args = parser.parse_args()
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    # random.seed(args.seed)
    # np.random.seed(args.seed)
    # torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    elif args.compact:
        from vilbert.vilbert_compact import BertConfig
        from vilbert.vilbert_compact import VILBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    task_lr = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)
        task_lr.append(task_cfg[task]['lr'])

    base_lr = min(task_lr)
    loss_scale = {}
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        loss_scale[task] = task_lr[i] / base_lr
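        # scaling each task's loss by task_lr / base_lr effectively gives the
        # task its own learning rate on top of the shared base_lr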

    if args.save_name:
        prefix = '-' + args.save_name
    else:
        prefix = ''

    timeStamp = '-'.join(task_names) + '_' + args.config_file.split(
        '/')[1].split('.')[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)
    logPath = os.path.join(args.tensorboard_dir, timeStamp)

    # warn if the tensorboard log path already exists; its contents will be
    # overwritten
    if os.path.isdir(logPath):
        logger.error('Tensorboard Log path exists. Overwriting.')

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    if args.batch_size != -1:
        for i, task_id in enumerate(args.tasks.split('-')):
            task = 'TASK' + task_id
            task_cfg[task]['batch_size'] = args.batch_size

    if args.data_root != "":
        for i, task_id in enumerate(args.tasks.split('-')):
            data_root = args.data_root
            task = 'TASK' + task_id
            task_cfg[task]['dataroot'] = data_root
            task_cfg[task]['features_h5path1'] = os.path.join(
                data_root, task_cfg[task]['features_h5path1'].split('/')[-1])
            task_cfg[task]['features_h5path2'] = os.path.join(
                data_root, task_cfg[task]['features_h5path2'].split('/')[-1])
            task_cfg[task]['train_annotations_jsonpath'] = os.path.join(
                data_root,
                task_cfg[task]['train_annotations_jsonpath'].split('/')[-1])
            task_cfg[task]['val_annotations_jsonpath'] = os.path.join(
                data_root,
                task_cfg[task]['val_annotations_jsonpath'].split('/')[-1])

    # Debug subsets are set up for the VCR dataset only; a train_100.jsonl still needs to be created for the other datasets
    if args.debug:
        for i, task_id in enumerate(args.tasks.split('-')):
            task = 'TASK' + task_id
            task_cfg[task]['train_annotations_jsonpath'] = '/'.join(
                task_cfg[task]['train_annotations_jsonpath'].split('/')[:-1] +
                ['train_100.jsonl'])
            task_cfg[task]['val_annotations_jsonpath'] = '/'.join(
                task_cfg[task]['val_annotations_jsonpath'].split('/')[:-1] +
                ['val_100.jsonl'])
            task_cfg[task]['batch_size'] = 90

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training downloads the model & vocab

    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    # args.debug is currently handled only by the VCR dataset (vcr_dataset.py); it still needs to be added to the other datasets
    task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, \
            task_dataloader_train, task_dataloader_val = LoadDatasets(args, task_cfg, gpt2_tokenizer, args.tasks.split('-'), args.debug)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training downloads the model & vocab

    tbLogger = utils.tbLogger(logPath, savePath, task_names, task_ids,
                              task_num_iters, args.gradient_accumulation_steps)

    # if n_gpu > 0:
    # torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    num_train_optimization_steps = max(task_num_iters.values(
    )) * args.num_train_epochs // args.gradient_accumulation_steps
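    # one optimizer update per gradient_accumulation_steps batches, scheduled
    # over the largest task's per-epoch iteration count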
    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_train.values()])

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training downloads the model & vocab

    if args.baseline:
        vil_model = BaseBertForVLTasks.from_pretrained(args.from_pretrained,
                                                       config,
                                                       num_labels=num_labels,
                                                       default_gpu=default_gpu)
    else:
        vil_model = VILBertForVLTasks.from_pretrained(args.from_pretrained,
                                                      config,
                                                      num_labels=num_labels,
                                                      default_gpu=default_gpu)

    model = ViLBertGPT2(vil_model,
                        gpt2_tokenizer,
                        gpt2_embed_dim=768,
                        config=config)
    PATH = 'save/trained_models/vilbert_gpt2_loss_3_1.bin'
    model.load_state_dict(torch.load(PATH))
    model.to(device)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training downloads the model & vocab

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    # when running evaluation, we run each task sequentially.
    epochId = 0
    for task_id in task_ids:
        num_batch_10 = int(0.1 * len(task_dataloader_val[task_id]))
        if args.debug:
            num_batch_10 = 1
        for i, batch in enumerate(task_dataloader_val[task_id]):
            # generate
            if i % num_batch_10 == 0:
                generate = True
                loss_vl, gpt2_loss, score, batch_size, bleu_score = ForwardModelsVal(
                    args,
                    task_cfg,
                    device,
                    task_id,
                    batch,
                    model,
                    task_losses,
                    generate=generate)
            else:
                generate = False
                loss_vl, gpt2_loss, score, batch_size = ForwardModelsVal(
                    args,
                    task_cfg,
                    device,
                    task_id,
                    batch,
                    model,
                    task_losses,
                    generate=generate)

            loss = loss_vl + gpt2_loss
            tbLogger.step_val(epochId, float(loss), float(loss_vl),
                              float(gpt2_loss), float(score), bleu_score,
                              task_id, batch_size, 'val')

            if default_gpu:
                sys.stdout.write('%d/%d\r' %
                                 (i, len(task_dataloader_val[task_id])))
                sys.stdout.flush()

            epochId += 1

    ave_score = tbLogger.showLossVal()
    tbLogger.txt_close()

def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
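    # shrink the per-step batch so that batch_size * accumulation_steps equals
    # the requested train batch size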

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer(vocab_file=args.vocab_file)

    train_examples = None
    num_train_optimization_steps = None
    vocab_list = []
    with open(args.vocab_file, 'r') as fr:
        for line in fr:
            vocab_list.append(line.strip("\n"))

    if args.do_train:
        train_examples = create_examples(
            data_path=args.pretrain_train_path,
            max_seq_length=args.max_seq_length,
            masked_lm_prob=args.masked_lm_prob,
            max_predictions_per_seq=args.max_predictions_per_seq,
            vocab_list=vocab_list)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    model = BertForMaskedLM(
        config=BertConfig.from_json_file(args.bert_config_json))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
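    # standard BERT fine-tuning practice: no weight decay on biases or
    # LayerNorm parameters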
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    best_loss = 100000

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      args.max_seq_length,
                                                      tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
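        # shuffle with RandomSampler in single-process training; distributed
        # training shards the data across processes with DistributedSampler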
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for e in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # masked_lm_loss
                loss = model(input_ids, segment_ids, input_mask, label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                if nb_tr_steps > 0 and nb_tr_steps % 100 == 0:
                    logger.info(
                        "===================== -epoch %d -train_step %d -train_loss %.4f\n"
                        % (e, nb_tr_steps, tr_loss / nb_tr_steps))

            if nb_tr_steps > 0 and nb_tr_steps % 2000 == 0:
                eval_examples = create_examples(
                    data_path=args.pretrain_dev_path,
                    max_seq_length=args.max_seq_length,
                    masked_lm_prob=args.masked_lm_prob,
                    max_predictions_per_seq=args.max_predictions_per_seq,
                    vocab_list=vocab_list)
                eval_features = convert_examples_to_features(
                    eval_examples, args.max_seq_length, tokenizer)
                all_input_ids = torch.tensor(
                    [f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        loss = model(input_ids, segment_ids, input_mask,
                                     label_ids)

                    eval_loss += loss.item()
                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
                if eval_loss < best_loss:
                    # Save a trained model, configuration and tokenizer
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self

                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    best_loss = eval_loss
                logger.info(
                    "============================ -epoch %d -train_loss %.4f -eval_loss %.4f\n"
                    % (e, tr_loss / nb_tr_steps, eval_loss))
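
# warmup_linear is called in the fp16 branch above but not defined in this
# excerpt; a minimal sketch matching the schedule helper shipped with
# pytorch_pretrained_bert (assumed here):
def warmup_linear(x, warmup=0.002):
    # linear ramp-up over the warmup fraction, then linear decay to zero
    if x < warmup:
        return x / warmup
    return 1.0 - x
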
Example #25
def test(args):
    # initial device and gpu number
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    print("[Test] device: {} n_gpu: {}".format(device, n_gpu))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    model_file = os.path.join(args.model_path, WEIGHTS_NAME)
    config_file = os.path.join(args.model_path, CONFIG_NAME)
    # Prepare model
    print('[Test] Load model...')
    config = BertConfig.from_json_file(config_file)
    model = Summarizer(args,
                       device,
                       load_pretrained_bert=False,
                       bert_config=config)
    if args.fp16:
        model.half()
    model.to(device)
    # load check points
    model.load_cp(torch.load(model_file))
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare testing data
    eval_dataloader = None
    eval_path = os.path.join(args.data_path, 'test.pt')
    print(f"[Test] ***** Prepare testing data from {eval_path} *****")

    eval_examples = torch.load(eval_path)
    eval_features = convert_examples_to_features(eval_examples)
    eval_data = get_dataset(eval_features)

    print(f"[Test]   Num examples = {len(eval_examples)}")
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # testing
    loss_f = torch.nn.BCELoss(reduction='none')
    refs, preds = [], []
    if eval_dataloader and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        model.eval()

        eval_loss, eval_pk, eval_windiff, eval_bound_sim = 0, 0, 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for sent_idx, input_ids, input_mask, segment_ids, clss_ids, clss_mask, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            clss_ids = clss_ids.to(device)
            clss_mask = clss_mask.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                sent_scores, mask = model(input_ids, segment_ids, clss_ids,
                                          input_mask, clss_mask)

                tmp_eval_loss = loss_f(sent_scores, label_ids.float())
                # average the loss over the unmasked positions only
                tmp_eval_loss = (tmp_eval_loss * mask.float()).sum()
                tmp_eval_loss = tmp_eval_loss / mask.float().sum()

            sent_scores = sent_scores.detach().cpu().numpy()
            mask = mask.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            # find selected sentences
            _pred, selected_ids = select_seg(sent_idx, eval_examples,
                                             sent_scores, mask)
            preds.extend(_pred)
            tmp_pk, tmp_windiff, tmp_bound_sim = accuracy(
                selected_ids, label_ids, mask)

            eval_loss += tmp_eval_loss.mean().item()
            eval_pk += tmp_pk
            eval_windiff += tmp_windiff
            eval_bound_sim += tmp_bound_sim

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_pk = eval_pk / nb_eval_examples
        eval_windiff = eval_windiff / nb_eval_examples
        eval_bound_sim = eval_bound_sim / nb_eval_examples
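        # Pk and WinDiff are windowed segmentation error rates (lower is
        # better); boundary similarity measures boundary agreement (higher is
        # better)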

        # print result
        print(
            f'[Test] loss: {eval_loss: 8.5f}, Pk: {100 * eval_pk: 3.3f} , WinDiff: {100 * eval_windiff: 3.3f} , Boundary Similarity: {100 * eval_bound_sim: 3.3f} '
        )

    # output results
    path_dir = args.result_path
    os.makedirs(path_dir, exist_ok=True)
    with open(os.path.join(path_dir, 'preds.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(preds))
Example #26
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=10,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--batch_size",
                        default=1000,
                        type=int,
                        help="what is the batch size?")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--split",
                        default="",
                        type=str,
                        help="which split to use.")

    args = parser.parse_args()
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    timeStamp = args.from_pretrained.split('/')[1] + '-' + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
                        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    tbLogger = utils.tbLogger(timeStamp,
                              savePath,
                              task_names,
                              task_ids,
                              task_num_iters,
                              1,
                              save_logger=False,
                              txt_name='eval.txt')

    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_val.values()])

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(args.from_pretrained,
                                                   config,
                                                   num_labels=num_labels,
                                                   default_gpu=default_gpu)
    else:
        model = VILBertForVLTasks.from_pretrained(args.from_pretrained,
                                                  config,
                                                  num_labels=num_labels,
                                                  default_gpu=default_gpu)

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print("  Num Iters: ", task_num_iters)
    print("  Batch size: ", task_batch_size)

    model.eval()
    for task_id in task_ids:
        results = []
        others = []
        for i, batch in enumerate(
                tqdm(task_dataloader_val[task_id])
        ):  #, total=len(task_dataloader_val[task_id]), position=0, leave=True):
            loss, score, batch_size, results, others = EvaluatingModel(args, task_cfg, device, \
                    task_id, batch, model, task_dataloader_val, task_losses, results, others)

            tbLogger.step_val(0, float(loss), float(score), task_id,
                              batch_size, 'val')

            # sys.stdout.write('%d/%d\r' % (i, len(task_dataloader_val[task_id])))
            # sys.stdout.flush()
        # save the result or evaluate the result.
        ave_score = tbLogger.showLossVal()

        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]['val_split'])

        json.dump(results, open(json_path + '_result.json', 'w'))
        json.dump(others, open(json_path + '_others.json', 'w'))
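
# A hypothetical invocation of this evaluation entry point (script name, task
# ids, and paths are placeholders):
#
#   python eval_tasks.py --bert_model bert-base-uncased \
#       --from_pretrained save/model/pytorch_model.bin \
#       --config_file config/bert_config.json --tasks 1 --split val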
Example #27
def main():
    parser = argparse.ArgumentParser()

    # Data files for FOIL task.
    parser.add_argument(
        "--features_h5path",
        default="/coc/pskynet2/jlu347/multi-modal-bert/data/referExpression",
    )
    parser.add_argument("--instances-jsonpath", default="data/referExpression")
    parser.add_argument("--task", default="refcoco+")

    # Required parameters
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--pretrained_weight",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )

    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=30,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=128,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )

    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )

    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=20,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--from_pretrained",
        action="store_true",
        help="Wheter the tensor is from pretrained.",
    )
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="Wheter to use the baseline model (single bert).",
    )

    parser.add_argument(
        "--use_chunk",
        default=0,
        type=float,
        help="whether use chunck for parallel training.",
    )

    parser.add_argument(
        "--split",
        default="test",
        type=str,
        help="whether use chunck for parallel training.",
    )

    args = parser.parse_args()

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from multimodal_bert.bert import MultiModalBertForReferExpression
    else:
        from multimodal_bert.multi_modal_bert import (
            MultiModalBertForReferExpression,
            BertConfig,
        )

    # Declare path to save checkpoints.
    print(args)
    config = BertConfig.from_json_file(args.config_file)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # random.seed(args.seed)
    # np.random.seed(args.seed)
    # torch.manual_seed(args.seed)
    # if n_gpu > 0:
    #     torch.cuda.manual_seed_all(args.seed)

    num_train_optimization_steps = None

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    features_h5path = os.path.join(args.features_h5path, args.task + ".h5")
    gt_features_h5path = os.path.join(args.features_h5path,
                                      args.task + "_gt.h5")

    image_features_reader = ImageFeaturesH5Reader(features_h5path, True)

    eval_dset = ReferExpressionDataset(
        args.task,
        args.split,
        args.instances_jsonpath,
        image_features_reader,
        None,
        tokenizer,
    )

    # config = BertConfig.from_json_file(args.config_file)

    num_labels = 2
    if args.from_pretrained:
        model = MultiModalBertForReferExpression.from_pretrained(
            args.pretrained_weight, config, dropout_prob=0.2)
    else:
        model = MultiModalBertForReferExpression(config, dropout_prob=0.2)

    print("loading model from %s" % (args.pretrained_weight))
    checkpoint = torch.load(args.pretrained_weight)
    model.load_state_dict(checkpoint)

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = DataParallel(model, use_chuncks=args.use_chunk)

    model.cuda()

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(eval_dset))
    logger.info("  Batch size = %d", args.train_batch_size)

    eval_dataloader = DataLoader(
        eval_dset,
        shuffle=False,
        batch_size=args.train_batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    model.eval()
    eval_loss, eval_score = evaluate(args, model, eval_dataloader)
    logger.info("\teval_loss: %.2f, score: %.2f" %
                (eval_loss, 100 * eval_score))
Example #28
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--use_chunk",
                        default=0,
                        type=float,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--optimizer",
                        default='BertAdam',
                        type=str,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.")
    parser.add_argument("--vision_scratch",
                        action="store_true",
                        help="whether pre-trained the image or not.")
    parser.add_argument("--evaluation_interval",
                        default=1,
                        type=int,
                        help="evaluate very n epoch.")
    parser.add_argument("--lr_scheduler",
                        default='mannul',
                        type=str,
                        help="whether use learning rate scheduler.")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--compact",
                        action="store_true",
                        help="whether use compact vilbert model.")
    parser.add_argument("--captions_path",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--cider_path",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--tsv_path",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--out_path",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")

    args = parser.parse_args()
    assert len(args.output_dir) > 0

    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    elif args.compact:
        from vilbert.vilbert_compact import BertConfig
        from vilbert.vilbert_compact import VILBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    if args.save_name:
        prefix = '-' + args.save_name
    else:
        prefix = ''
    timeStamp = '_' + args.config_file.split('/')[1].split('.')[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=True)
    dataset = CiderDataset(args.captions_path,
                           args.tsv_path,
                           args.cider_path,
                           tokenizer,
                           is_eval=True)
    '''
    length_of_data = len(dataset)
    length_of_val = length_of_data // 10

    train, val, test = random_split(dataset, [length_of_data - 2 * length_of_val, length_of_val, length_of_val])

    train_dataloader = DataLoader(train, batch_size=10, shuffle=True)
    val_dataloader = DataLoader(val, batch_size=10, shuffle=True)
    test_dataloader = DataLoader(test, batch_size=10, shuffle=False)
    '''

    val_dataloader = DataLoader(dataset, batch_size=10, shuffle=False)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    model_list = [1]
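    # placeholder list so the evaluation loop below runs exactly once per entry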

    for model_name in model_list:
        model = VILBertForVLTasks.from_pretrained(args.from_pretrained,
                                                  config,
                                                  num_labels=1,
                                                  default_gpu=default_gpu)
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            # wrap the model for distributed training
            model = DDP(model, delay_allreduce=True)

        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        i = 0

        # initialize the data iteration.
        actual_values = []
        predicted_values = []
        image_ids_list = []
        captions_list = []

        model.eval()
        for batch in val_dataloader:
            i += 1
            #if not args.no_cuda:
            #    batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
            features, spatials, image_mask, captions, _, input_mask, segment_ids, co_attention_mask, image_id, y, raw_captions = batch
            #print(image_id)
            #print(raw_captions)
            #print(type(raw_captions))
            captions_list.extend(raw_captions)
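            # with num_labels=1, vil_logit is a scalar regression output used
            # here as the predicted CIDEr score for each caption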
            _, vil_logit, _, _, _, _, _ = \
                model(captions.cuda(), features.cuda(), spatials.cuda(), segment_ids.cuda(), input_mask.cuda(), image_mask.cuda(), co_attention_mask.cuda())
            actual_values += y.tolist()
            predicted_values += vil_logit.squeeze(-1).tolist()
            image_ids_list += image_id.tolist()

        print("Model Name ", model_name)
        print("Values ",
              np.corrcoef(np.array(actual_values), np.array(predicted_values)))
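        # np.corrcoef returns the full 2x2 correlation matrix; the Pearson r
        # between actual and predicted scores is the off-diagonal entry [0, 1]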
        print("Actual mean", np.array(actual_values).mean())
        print("Predicted mean", np.array(predicted_values).mean())

        final_dict = {}
        final_dict['actual_values'] = actual_values
        final_dict['predicted_values'] = predicted_values
        final_dict['image_ids'] = image_ids_list
        final_dict['captions'] = captions_list
        if len(args.out_path) > 0:
            json.dump(final_dict, open(args.out_path, 'w'))

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is the only implemented option in this script; "
            "please set `do_train` (this parser defines no `do_eval` flag).")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    #model = BertForPreTraining.from_pretrained(args.bert_model)
    bert_config = BertConfig.from_json_file('bert_config.json')
    model = BertForPreTraining(bert_config)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
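    # Exclude biases and LayerNorm parameters from L2 weight decay, following
    # the standard BERT fine-tuning recipe; decay applies to all other weights.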
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on file.__next__
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
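                    # scaling keeps the accumulated gradient equal to the
                    # gradient of one full-size batch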
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses;
                        # when fp16 is off, BertAdam handles warmup internally
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
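
        # For reference, warmup_linear in pytorch-pretrained-bert ramps the LR
        # linearly up to the warmup fraction of training, then decays it
        # linearly to zero; a sketch of the scheduler this loop assumes:
        #
        #   def warmup_linear(x, warmup=0.002):
        #       if x < warmup:
        #           return x / warmup
        #       return 1.0 - x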

        # Save the trained model
        logger.info("***** Saving fine-tuned model *****")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)
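
# A hypothetical invocation of the pre-training script above (the script file
# name and data paths are assumptions, not taken from the source):
#
#   python lm_finetuning.py \
#       --train_file data/corpus.txt \
#       --bert_model bert-base-uncased \
#       --output_dir out/lm_finetune \
#       --do_train --on_memory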
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run test and create submission.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--seed",
                        default=None,
                        type=int,
                        help="Seed for randomized elements in the training")
    parser.add_argument("--eval_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    ## Our arguments
    parser.add_argument("--mode",
                        choices=[
                            "none", "distill", "smoothed_distill",
                            "reweight_baseline", "bias_product_baseline",
                            "learned_mixin_baseline"
                        ])
    parser.add_argument("--penalty",
                        type=float,
                        default=0.03,
                        help="Penalty weight for the learn_mixin model")
    parser.add_argument("--n_processes",
                        type=int,
                        default=4,
                        help="Processes to use for pre-processing")
    parser.add_argument("--debug", action="store_true")
    parser.add_argument(
        "--sorted",
        action="store_true",
        help='Sort the data so most batches have the same input length,'
        ' which makes things about 2x faster. Our experiments did not'
        ' actually use this in the end (not sure if it makes a difference),'
        " so it's off by default.")
    parser.add_argument("--which_bias",
                        choices=["hans", "hypo", "hans_json"],
                        required=True)
    parser.add_argument("--custom_teacher", default=None)
    parser.add_argument("--theta",
                        type=float,
                        default=0.1,
                        help="for theta smoothed distillation loss")

    args = parser.parse_args()

    utils.add_stdout_logger()

    if args.mode == "none":
        loss_fn = clf_distill_loss_functions.Plain()
    elif args.mode == "distill":
        loss_fn = clf_distill_loss_functions.DistillLoss()
    elif args.mode == "smoothed_distill":
        loss_fn = clf_distill_loss_functions.SmoothedDistillLoss()
    elif args.mode == "reweight_baseline":
        loss_fn = clf_distill_loss_functions.ReweightBaseline()
    elif args.mode == "bias_product_baseline":
        loss_fn = clf_distill_loss_functions.BiasProductBaseline()
    elif args.mode == "learned_mixin_baseline":
        loss_fn = clf_distill_loss_functions.LearnedMixinBaseline(args.penalty)
    else:
        raise RuntimeError("Unknown mode: %s" % args.mode)

    output_dir = args.output_dir

    if args.do_train:
        if exists(output_dir):
            if len(os.listdir(output_dir)) > 0:
                logging.warning("Output dir exists and is non-empty")
        else:
            os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_test` must be True.")

    if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
        logging.warning(
            "Output directory ({}) already exists and is not empty.".format(
                output_dir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # It's way too easy to forget whether this is being set by a command-line flag
    if "-uncased" in args.bert_model:
        do_lower_case = True
    elif "-cased" in args.bert_model:
        do_lower_case = False
    else:
        raise NotImplementedError(args.bert_model)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=do_lower_case)

    num_train_optimization_steps = None
    train_examples = None
    if args.do_train:
        train_examples = load_mnli(True, 2000 if args.debug else None)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (num_train_optimization_steps //
                                            torch.distributed.get_world_size())

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))

    model = BertDistill.from_pretrained(args.bert_model,
                                        cache_dir=cache_dir,
                                        num_labels=3,
                                        loss_fn=loss_fn)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        train_features: List[InputFeatures] = convert_examples_to_features(
            train_examples, args.max_seq_length, tokenizer, args.n_processes)

        if args.which_bias == "mix":
            hypo_bias_map = load_bias("hypo")
            hans_bias_map = load_bias("hans")
            bias_map = {}
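
            # Bias entries are log-probabilities over the 3 NLI labels, so
            # base-3 entropy lies in [0, 1]; for each example, keep the
            # lower-entropy (more confident) of the two bias estimates.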

            def compute_entropy(probs, base=3):
                return -(probs * (np.log(probs) / np.log(base))).sum()

            for key in hypo_bias_map.keys():
                hypo_ent = compute_entropy(np.exp(hypo_bias_map[key]))
                hans_ent = compute_entropy(np.exp(hans_bias_map[key]))
                if hypo_ent < hans_ent:
                    bias_map[key] = hypo_bias_map[key]
                else:
                    bias_map[key] = hans_bias_map[key]
        else:
            bias_map = load_bias(args.which_bias)

        for fe in train_features:
            fe.bias = bias_map[fe.example_id].astype(np.float32)
        teacher_probs_map = load_teacher_probs(args.custom_teacher)
        for fe in train_features:
            fe.teacher_probs = np.array(
                teacher_probs_map[fe.example_id]).astype(np.float32)

        example_map = {}
        for ex in train_examples:
            example_map[ex.id] = ex

        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len(train_examples))
        logging.info("  Batch size = %d", args.train_batch_size)
        logging.info("  Num steps = %d", num_train_optimization_steps)

        train_dataloader = build_train_dataloader(train_features,
                                                  args.train_batch_size,
                                                  args.seed, args.sorted)

        model.train()
        loss_ema = 0
        total_steps = 0
        decay = 0.99

        for _ in trange(int(args.num_train_epochs), desc="Epoch", ncols=100):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            pbar = tqdm(train_dataloader, desc="loss", ncols=100)
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(device) for t in batch)
                if bias_map is not None:
                    example_ids, input_ids, input_mask, segment_ids, label_ids, bias, teacher_probs = batch
                else:
                    bias = None
                    teacher_probs = None
                    example_ids, input_ids, input_mask, segment_ids, label_ids = batch

                logits, loss = model(input_ids, segment_ids, input_mask,
                                     label_ids, bias, teacher_probs)

                total_steps += 1
                loss_ema = loss_ema * decay + loss.cpu().detach().numpy() * (
                    1 - decay)
                descript = "loss=%.4f" % (loss_ema / (1 - decay**total_steps))
                pbar.set_description(descript, refresh=False)
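                # loss_ema is a zero-initialized exponential moving average of
                # the loss; the (1 - decay**total_steps) divisor above is the
                # standard bias correction for early steps.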

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Record the args as well
        arg_dict = {}
        for arg in vars(args):
            arg_dict[arg] = getattr(args, arg)
        with open(join(output_dir, "args.json"), 'w') as out_fh:
            json.dump(arg_dict, out_fh)

        # Load a trained model and config that you have fine-tuned
        config = BertConfig.from_json_file(output_config_file)
        model = BertDistill(config, num_labels=3, loss_fn=loss_fn)
        model.load_state_dict(torch.load(output_model_file))
    else:
        output_config_file = os.path.join(output_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(output_config_file)
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        model = BertDistill(config, num_labels=3, loss_fn=loss_fn)
        model.load_state_dict(torch.load(output_model_file))

    model.to(device)

    if not args.do_eval and not args.do_test:
        return
    if not (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        return

    model.eval()

    if args.do_eval:
        eval_datasets = [("mnli_dev_m", load_mnli(False)),
                         ("mnli_dev_mm",
                          load_mnli(False, custom_path="dev_mismatched.tsv"))]
        eval_datasets += load_easy_hard()
        eval_datasets += [("hans", load_hans())]
        eval_datasets += load_hans_subsets()
    else:
        eval_datasets = []

    if args.do_test:
        test_datasets = load_all_test_jsonl()
        eval_datasets += test_datasets
        subm_paths = [
            "../submission/{}.csv".format(x[0]) for x in test_datasets
        ]

    for ix, (name, eval_examples) in enumerate(eval_datasets):
        logging.info("***** Running evaluation on %s *****" % name)
        logging.info("  Num examples = %d", len(eval_examples))
        logging.info("  Batch size = %d", args.eval_batch_size)
        eval_features = convert_examples_to_features(eval_examples,
                                                     args.max_seq_length,
                                                     tokenizer)
        eval_features.sort(key=lambda x: len(x.input_ids))
        all_label_ids = np.array([x.label_id for x in eval_features])
        eval_dataloader = build_eval_dataloader(eval_features,
                                                args.eval_batch_size)

        eval_loss = 0
        nb_eval_steps = 0
        probs = []
        test_subm_ids = []

        for example_ids, input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating", ncols=100):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)

            # create eval loss and other metric required by the task
            loss_fct = CrossEntropyLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, 3), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            probs.append(
                torch.nn.functional.softmax(logits, 1).detach().cpu().numpy())
            test_subm_ids.append(example_ids.cpu().numpy())

        probs = np.concatenate(probs, 0)
        test_subm_ids = np.concatenate(test_subm_ids, 0)
        eval_loss = eval_loss / nb_eval_steps

        if "hans" in name:
            # take max of non-entailment rather than taking their sum
            probs[:, 0] = probs[:, [0, 2]].max(axis=1)
            probs = probs[:, :2]

        preds = np.argmax(probs, axis=1)

        result = {"acc": simple_accuracy(preds, all_label_ids)}
        result["loss"] = eval_loss

        conf_plot_file = os.path.join(output_dir,
                                      "eval_%s_confidence.png" % name)
        ECE, bins_acc, bins_conf, bins_num = visualize_predictions(
            probs, all_label_ids, conf_plot_file=conf_plot_file)
        result["ECE"] = ECE
        result["bins_acc"] = bins_acc
        result["bins_conf"] = bins_conf
        result["bins_num"] = bins_num

        output_eval_file = os.path.join(output_dir,
                                        "eval_%s_results.txt" % name)
        output_all_eval_file = os.path.join(output_dir, "eval_all_results.txt")
        with open(output_eval_file,
                  "w") as writer, open(output_all_eval_file,
                                       "a") as all_writer:
            logging.info("***** Eval results *****")
            all_writer.write("eval results on %s:\n" % name)
            for key in sorted(result.keys()):
                logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
                all_writer.write("%s = %s\n" % (key, str(result[key])))

        output_answer_file = os.path.join(output_dir,
                                          "eval_%s_answers.json" % name)
        answers = {
            ex.example_id: [float(x) for x in p]
            for ex, p in zip(eval_features, probs)
        }
        with open(output_answer_file, "w") as f:
            json.dump(answers, f)

        # prepare submission file
        if args.do_test and ix >= len(eval_datasets) - len(test_datasets):
            with open(subm_paths.pop(0), "w") as subm_f:
                subm_f.write("pairID,gold_label\n")
                for sub_id, pred_label_id in zip(test_subm_ids, preds):
                    subm_f.write("{},{}\n".format(
                        str(sub_id), REV_NLI_LABEL_MAP[pred_label_id]))
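
# A hypothetical invocation of the debiased-training script above (the script
# file name is an assumption, not taken from the source):
#
#   python train_bert_distill.py \
#       --output_dir out/smoothed_distill \
#       --do_train --do_eval \
#       --mode smoothed_distill \
#       --which_bias hans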