Example #1
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a pretrained model by supplying

        * the name of a remote model on s3 ("bert-base-cased" ...)
        * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
        * OR a local path of a model trained via FARM ("some_dir/farm_model")

        :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
        :type pretrained_model_name_or_path: str

        """

        bert = cls()
        if "farm_lm_name" in kwargs:
            bert.name = kwargs["farm_lm_name"]
        else:
            bert.name = pretrained_model_name_or_path
        # We need to differentiate between loading a model in FARM format and in Pytorch-Transformers format
        farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            bert_config = BertConfig.from_pretrained(farm_lm_config)
            farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
            bert.model = BertModel.from_pretrained(farm_lm_model, config=bert_config, **kwargs)
            bert.language = bert.model.config.language
        else:
            # Pytorch-transformer Style
            bert.model = BertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
            bert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
        return bert
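A minimal usage sketch of the two loading paths handled above; some_dir/farm_model is a placeholder, and the explicit file-path loading relies on the same older transformers behaviour that the FARM-style branch itself uses.

from pathlib import Path
from transformers import BertConfig, BertModel

# Remote model: only the model name is needed.
model = BertModel.from_pretrained("bert-base-cased")

# FARM-style local directory (placeholder path): config and weights are stored
# under custom file names, so both are loaded explicitly, as in the branch above.
farm_dir = Path("some_dir/farm_model")
config = BertConfig.from_pretrained(str(farm_dir / "language_model_config.json"))
model = BertModel.from_pretrained(str(farm_dir / "language_model.bin"), config=config)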
Example #2
 def init_encoder(
         cls, cfg_name: str, projection_dim: int = 0, dropout: float = 0.1, **kwargs
 ) -> BertModel:
     cfg = BertConfig.from_pretrained(cfg_name if cfg_name else 'bert-base-uncased')
     if dropout != 0:
         cfg.attention_probs_dropout_prob = dropout
         cfg.hidden_dropout_prob = dropout
     return cls.from_pretrained(cfg_name, config=cfg, project_dim=projection_dim, **kwargs)
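For comparison, a minimal sketch of the same dropout override done purely through keyword overrides to BertConfig.from_pretrained (keyword arguments replace the stored config values); the 0.2 value is illustrative.

from transformers import BertConfig, BertModel

cfg = BertConfig.from_pretrained(
    "bert-base-uncased",
    attention_probs_dropout_prob=0.2,  # illustrative override
    hidden_dropout_prob=0.2,
)
model = BertModel.from_pretrained("bert-base-uncased", config=cfg)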
Example #3
 def __init__(self, model_path, vocab: Vocabulary):
     super().__init__(vocab)
     config = BertConfig.from_pretrained(model_path)
     bert_model = BertForPreTraining(config)
     self.bert = bert_model.bert
     tags = vocab.get_index_to_token_vocabulary("tags")
     num_tags = len(tags)
     self.projection = torch.nn.Linear(768, num_tags)
     self.metric = SpanBasedF1Measure(vocab, label_encoding='BMES')
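Note that BertForPreTraining(config) above builds the architecture with freshly initialized weights; loading the checkpoint would require from_pretrained. A short sketch of the contrast:

from transformers import BertConfig, BertForPreTraining

config = BertConfig.from_pretrained("bert-base-uncased")
scratch_model = BertForPreTraining(config)  # architecture only, random weights
pretrained_model = BertForPreTraining.from_pretrained("bert-base-uncased")  # checkpoint weights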
Example #4
    def __init__(self, args: argparse.Namespace):
        super().__init__()
        self.args = args
        self.bert_config = BertConfig.from_pretrained(self.args.bert_path)
        self.model = BertForMaskedLM(self.bert_config)
        self.loss_fn = CrossEntropyLoss(reduction="none")

        self.train_acc = MaskedAccuracy()
        self.valid_acc = MaskedAccuracy()
Example #5
 def __init__(self, args):
     super().__init__()
     self.args = args
     self.bert_config = BertConfig.from_pretrained(
         self.args.bert_config_dir, output_hidden_states=False)
     self.bert = BertModel(self.bert_config)
     self.linear = nn.Linear(self.bert_config.hidden_size * 1001, 919)
     self.threshold = nn.Threshold(0, 1e-6)
     self.linear2 = nn.Linear(919, 919)
     self.sigmoid = nn.Sigmoid()
Example #6
    def _build_word_embedding(self):
        self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
        if self.config.pretrained_bert:
            bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
            self.word_embedding = bert_model.bert.embeddings
            self.pooler = bert_model.bert.pooler
            self.pooler.apply(self.init_weights)

        else:
            self.pooler = BertPooler(self.bert_config)
            self.word_embedding = BertEmbeddings(self.bert_config)
Example #7
    def __init__(self, config, args):
        super().__init__(config)
        self.args = args

        if args.bert_model == "albert-base-v2":
            bert = AlbertModel.from_pretrained(args.bert_model)
        elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
            bert = AutoModel.from_pretrained(args.bert_model)
        elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
            bert = AutoModel.from_pretrained(args.bert_model)
        elif args.bert_model == "bert-small-scratch":
            config = BertConfig.from_pretrained(
                "google/bert_uncased_L-4_H-512_A-8")
            bert = BertModel(config)
        elif args.bert_model == "bert-base-scratch":
            config = BertConfig.from_pretrained("bert-base-uncased")
            bert = BertModel(config)
        else:
            bert = BertModel.from_pretrained(
                args.bert_model)  # bert-base-uncased, small, tiny

        self.txt_embeddings = bert.embeddings
        self.img_embeddings = ImageBertEmbeddings(args, self.txt_embeddings)

        if args.img_encoder == 'ViT':
            img_size = args.img_size
            patch_sz = 32 if img_size == 512 else 16
            self.img_encoder = Img_patch_embedding(image_size=img_size,
                                                   patch_size=patch_sz,
                                                   dim=2048)
        else:
            self.img_encoder = ImageEncoder_cnn(args)
            for p in self.img_encoder.parameters():
                p.requires_grad = False
            for c in list(self.img_encoder.children())[5:]:
                for p in c.parameters():
                    p.requires_grad = True

        self.encoder = bert.encoder
        self.pooler = bert.pooler
Example #8
 def load_model(self):
     self.tokenizer = BertTokenizer.from_pretrained(self.args.pretrained_path, do_lower_case=self.args.do_lower_case)
     self.config = BertConfig.from_pretrained(self.args.pretrained_path, num_labels=self.args.num_labels)
     if self.args.resume_model:
         self.model = BertForMultiLable.from_pretrained(self.args.resume_model_path, config=self.config)
         with open(self.threshold_path, 'r') as f:
             self.threshold = float(f.read())   # read the best model's threshold
     else:
         self.model = BertForMultiLable.from_pretrained(self.args.pretrained_path, config=self.config)
     if self.args.cuda:
         self.model.cuda()
         if self.args.n_gpus > 1:
             self.model = DataParallel(self.model)
Example #9
    def __init__(self, config, args):
        super().__init__(config)
        self.args = args

        if args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
            bert = AutoModel.from_pretrained(args.bert_model)
        elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
            bert = AutoModel.from_pretrained(args.bert_model)
        elif args.bert_model == "bert-small-scratch":
            config = BertConfig.from_pretrained(
                "google/bert_uncased_L-4_H-512_A-8")
            bert = BertModel(config)
        elif args.bert_model == "bert-base-scratch":
            config = BertConfig.from_pretrained("bert-base-uncased")
            bert = BertModel(config)
        else:
            bert = BertModel.from_pretrained(
                args.bert_model)  # bert-base-uncased, small, tiny

        self.txt_embeddings = bert.embeddings

        self.encoder = bert.encoder
        self.pooler = bert.pooler
Example #10
 def __init__(self, model_path, vocab: Vocabulary):
     super().__init__(vocab)
     self.pretrained_tokenizer = BertForPreTraining.from_pretrained(
         model_path)
     config = BertConfig.from_pretrained(model_path)
     bert_model = BertForPreTraining(config)
     self.bert = bert_model.bert
     tags = vocab.get_index_to_token_vocabulary("tags")
     num_tags = len(tags)
     constraints = allowed_transitions(constraint_type="BMES", labels=tags)
     self.projection = torch.nn.Linear(768, num_tags)
     self.crf = ConditionalRandomField(num_tags=num_tags,
                                       constraints=constraints,
                                       include_start_end_transitions=False)
Example #11
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)

            model = BertModel.from_pretrained(model_name)
            model, loading_info = BertModel.from_pretrained(
                model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

            config = BertConfig.from_pretrained(model_name,
                                                output_attentions=True,
                                                output_hidden_states=True)
            model = BertModel.from_pretrained(model_name,
                                              output_attentions=True,
                                              output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
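A short sketch of how the flags enabled in this test are consumed, assuming a tuple-returning transformers version like the one the test targets:

import torch
from transformers import BertConfig, BertModel, BertTokenizer

config = BertConfig.from_pretrained("bert-base-uncased",
                                    output_attentions=True,
                                    output_hidden_states=True)
model = BertModel.from_pretrained("bert-base-uncased", config=config)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

input_ids = torch.tensor([tokenizer.encode("hello world")])
with torch.no_grad():
    outputs = model(input_ids)
# On tuple-returning versions: (last_hidden_state, pooler_output, hidden_states, attentions)
hidden_states, attentions = outputs[-2], outputs[-1]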
Example #12
 def load_model(self):
     self.tokenizer = MyBertTokenizer.from_pretrained(
         self.args.pretrained_path, do_lower_case=self.args.do_lower_case)
     self.config = BertConfig.from_pretrained(
         self.args.pretrained_path, num_labels=self.args.num_labels)
     if self.args.resume_model:
         self.model = BertCrfForNer.from_pretrained(
             self.args.resume_model_path, config=self.config)
     else:
         self.model = BertCrfForNer.from_pretrained(
             self.args.pretrained_path, config=self.config)
     if self.args.cuda:
         self.model.cuda()
         if self.args.n_gpus > 1:
             self.model = DataParallel(self.model)
Example #13
def model_builder(model_name_or_path: str,
                  num_labels: int,
                  feat_config_path: str = None,
                  one_hot_embed: bool = True,
                  use_lstm=False,
                  device: torch.device = torch.device("cpu")):
    feature = None
    if feat_config_path is not None:
        feature = Feature(feat_config_path, one_hot_embed)
    config = BertConfig.from_pretrained(model_name_or_path,
                                        num_labels=num_labels)
    model = NerModel.from_pretrained(model_name_or_path,
                                     config=config,
                                     feature=feature,
                                     use_lstm=use_lstm,
                                     device=device)
    return config, model, feature
Example #14
    def __init__(self, config, args):
        super().__init__(config)

        if args.weight_load:
            config = AutoConfig.from_pretrained(args.load_pretrained_model)
            model_state_dict = torch.load(
                os.path.join(args.load_pretrained_model, 'pytorch_model.bin'))
            cxrbert = CXRBERT.from_pretrained(args.load_pretrained_model,
                                              state_dict=model_state_dict,
                                              config=config,
                                              args=args)
        else:
            config = BertConfig.from_pretrained('bert-base-uncased')
            cxrbert = CXRBERT(config, args)

        self.enc = cxrbert.enc
        self.itm = cxrbert.itm
Example #15
  def __init__(self, bert_model: str, max_layer=None, pool=True, freeze_embeddings=False):
    super().__init__()
    self.freeze_embeddings = freeze_embeddings
    config = BertConfig.from_pretrained(bert_model, cache_dir=TRANSFORMER_CACHE_DIR)
    if max_layer is not None and not pool:
      config.num_hidden_layers = max_layer
    self.pool = pool
    self.max_layer = max_layer
    self.embeddings = BertEmbeddings(config)
    if config.num_hidden_layers > 0:
      self.encoder = BertEncoder(config)
      self.encoder.output_hidden_states = True
    else:
      self.encoder = None

    if pool:
      self.pooler = BertPooler(config)
    else:
      self.pooler = None
    self.config = config
    self.bert_model = bert_model
Example #16
    def init_encoder(
        cls,
        cfg_name: str,
        num_hidden_layers: int,
        projection_dim: int = 0,
        dropout: float = 0.1,
        pretrained: bool = True,
        **kwargs
    ) -> BertModel:
        cfg = BertConfig.from_pretrained(cfg_name if cfg_name else "bert-base-uncased", 
                                         num_hidden_layers=num_hidden_layers, **kwargs)
        if dropout != 0:
            cfg.attention_probs_dropout_prob = dropout
            cfg.hidden_dropout_prob = dropout

        if pretrained:
            return cls.from_pretrained(
                cfg_name, config=cfg, project_dim=projection_dim,
            )
        else:
            return cls(cfg, project_dim=projection_dim)
Example #17
def model_builder_from_pretrained(model_name_or_path,
                                  num_labels,
                                  pre_train_path,
                                  feat_dir: str = None,
                                  one_hot_embed: bool = True,
                                  use_lstm=False,
                                  device: torch.device = torch.device("cpu")):
    feature = None
    if feat_dir is not None:
        feature = Feature(feat_dir + "/feature_config.json", one_hot_embed)
    config = BertConfig.from_pretrained(model_name_or_path,
                                        num_labels=num_labels)
    model = NerModel.from_pretrained(model_name_or_path,
                                     config=config,
                                     feature=feature,
                                     use_lstm=use_lstm,
                                     device=device)
    model.load_state_dict(
        torch.load(pre_train_path + "/vner_model.bin", map_location='cpu'))
    model.eval()
    return config, model, feature
Example #18
 def init_encoder(cls,
                  cfg_name: str,
                  projection_dim: int = 0,
                  dropout: float = 0.1,
                  num_hidden_layers: int = 12,
                  num_attention_heads: int = 12,
                  pretrained: bool = True,
                  **kwargs) -> BertModel:
     cfg = BertConfig.from_pretrained(
         cfg_name if cfg_name else "bert-base-uncased")
     if dropout != 0:
         cfg.attention_probs_dropout_prob = dropout
         cfg.hidden_dropout_prob = dropout
     cfg.num_hidden_layers = num_hidden_layers
     cfg.num_attention_heads = num_attention_heads
     cfg.pooler_num_attention_heads = num_attention_heads  # careful here
     logger.info(f'new bert cfg:\n{cfg}')
     if pretrained:
         return cls.from_pretrained(cfg_name,
                                    config=cfg,
                                    project_dim=projection_dim,
                                    **kwargs)
     else:
         return HFBertEncoder(cfg, project_dim=projection_dim)
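When the config is shrunk before from_pretrained, as in the two init_encoder variants above, only the first num_hidden_layers encoder layers receive checkpoint weights and the remaining checkpoint tensors are dropped. A minimal sketch:

from transformers import BertConfig, BertModel

cfg = BertConfig.from_pretrained("bert-base-uncased", num_hidden_layers=6)
model = BertModel.from_pretrained("bert-base-uncased", config=cfg)  # layers 6-11 of the checkpoint are ignored
assert len(model.encoder.layer) == 6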
Example #19
    def test_forward(self):
        img_feature_dim = 2054
        bert_model_name = "bert-base-uncased"
        use_img_layernorm = True
        img_layer_norm_eps = 1e-12
        bert_config = BertConfig.from_pretrained(bert_model_name)
        # augment hf BertConfig for vinvl BertImgModel config
        bert_config.img_feature_dim = img_feature_dim
        bert_config.use_img_layernorm = use_img_layernorm
        bert_config.img_layer_norm_eps = img_layer_norm_eps
        model = VinVLBase(bert_config)

        model.eval()
        model = model.to(get_current_device())

        bs = 8
        num_feats = 70
        max_sentence_len = 25
        input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
        img_feat = torch.rand((bs, num_feats, img_feature_dim))

        with torch.no_grad():
            model_output = model(input_ids, img_feat).last_hidden_state
        self.assertEqual(model_output.shape, torch.Size([8, 95, 768]))
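The augmentation above works because BertConfig is a plain attribute container: extra fields such as img_feature_dim are simply stored on the instance and serialized with the rest of the config. A minimal sketch:

from transformers import BertConfig

bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.img_feature_dim = 2054  # custom VinVL field piggybacks on the config
bert_config.use_img_layernorm = True
assert bert_config.to_dict()["img_feature_dim"] == 2054  # survives serialization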
Example #20
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_neg_data', type=Path, required=True)
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--max_seq_len", default=512, type=int)

    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--kr_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--kr_freq", default=0.7, type=float)
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        print(torch.cuda.is_available())
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print(n_gpu)
        print("no gpu?")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        print("GPU Device: ", device)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

    pt_output = Path(getenv('PT_OUTPUT_DIR', ''))
    args.output_dir = Path(os.path.join(pt_output, args.output_dir))

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model)
    # config.num_hidden_layers = args.num_layers
    model = FuckWrapper(config)
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()

    before_train_path = Path(os.path.join(args.output_dir, "before_training"))
    print("Before training path: ", before_train_path)
    before_train_path.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(os.path.join(args.output_dir, "before_training"))
    tokenizer.save_pretrained(os.path.join(args.output_dir, "before_training"))

    neg_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    if args.local_rank == -1:
        neg_train_sampler = RandomSampler(neg_epoch_dataset)
    else:
        neg_train_sampler = DistributedSampler(neg_epoch_dataset)

    neg_train_dataloader = DataLoader(neg_epoch_dataset,
                                      sampler=neg_train_sampler,
                                      batch_size=args.train_batch_size)

    def inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(neg_train_dataloader):
                yield kr_step, kr_batch

    kr_gen = inf_train_gen()

    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1):
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            model.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                model.train()

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch

                outputs = model(input_ids=input_ids,
                                attention_mask=input_mask,
                                token_type_ids=segment_ids,
                                masked_lm_labels=lm_label_ids,
                                negated=False)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)

                if args.local_rank == 0 or args.local_rank == -1:
                    nb_tr_steps += 1
                    pbar.update(1)
                    mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                    pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    scheduler.step()  # Update learning rate schedule
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if random.random() > args.kr_freq:
                    kr_step, kr_batch = next(kr_gen)
                    kr_batch = tuple(t.to(device) for t in kr_batch)
                    input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                    outputs = model(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    masked_lm_labels=lm_label_ids,
                                    negated=True)
                    loss = outputs[0]
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    if args.local_rank == -1:
                        nb_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        scheduler.step()  # Update learning rate schedule
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

    # Save a trained model
    if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1):
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
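On newer transformers releases WarmupLinearSchedule no longer exists; a self-contained sketch of the equivalent optimizer/scheduler setup (with a randomly initialized stand-in model and illustrative hyperparameters) uses get_linear_schedule_with_warmup:

import torch
from transformers import BertConfig, BertForMaskedLM, get_linear_schedule_with_warmup

model = BertForMaskedLM(BertConfig())  # stand-in model, random weights
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-4, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=1000)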
Example #21
if __name__ == '__main__':
    source_path = '../data/train/source.txt'
    target_path = '../data/train/target.txt'
    keyword_path = '../data/train/TextRank.txt'
    eval_source_path = '../data/eval/source.txt'
    eval_target_path = '../data/eval/target.txt'
    eval_keyword_path = '../data/eval/TextRank.txt'
    log_path = '../log/log.txt'
    bert_path = '../../chinese_wwm_ext_pytorch/'
    pre_trainModel = '../model/model.pth'  # model checkpoint used to resume training
    log = open(log_path, 'w', encoding='utf-8')
    rouge = Rouge()  # evaluation metric

    device = torch.device('cuda:0')
    # load the BERT model
    bert_config = BertConfig.from_pretrained(bert_path +
                                             'bert_config.json')  # config file
    bert_model = BertModel.from_pretrained(bert_path + 'pytorch_model.bin',
                                           config=bert_config)  # model weights
    bert_model.to(device)
    tokenizer = BertTokenizer.from_pretrained(bert_path + 'vocab.txt')  # vocabulary
    config = Config()

    # training set
    loader = DataLoader(dataset=MyDataSet(source_path, target_path,
                                          keyword_path, tokenizer),
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=0,
                        collate_fn=pad,
                        drop_last=False)  # keep the last, possibly smaller batch
    # evaluation set
Example #22
from oscar.modeling.modeling_distilbert import DistilBertForImageCaptioning
from transformers.modeling_bert import BertConfig

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dump_source", default="", type=str)
    parser.add_argument("--target_config", default="", type=str)
    parser.add_argument("--dump_target", default="", type=str)
    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

    f = open("/home/ubuntu/mmml/layers.log", 'w')

    model = BertForImageCaptioning.from_pretrained(args.dump_source)
    new_model = DistilBertForImageCaptioning(
        BertConfig.from_pretrained(args.target_config))
    state_dict = model.state_dict()
    compressed_sd = {}

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data.shape, file=f)

    print("\n\n", file=f)

    for name, param in new_model.named_parameters():
        if param.requires_grad:
            print(name, param.data.shape, file=f)

    prefix = "bert"
Example #23
    # Vocab and Tokenizer
    ptr_dir = Path("pretrained")
    vocab_filepath = ptr_dir / "{}-vocab.pkl".format(args.type)
    with open(vocab_filepath, mode='rb') as io:
        vocab = pickle.load(io)
    ptr_tokenizer = BertTokenizer.from_pretrained(args.type,
                                                  do_lower_case="uncased"
                                                  in args.type)
    ptr_tokenizer = Tokenizer(vocab, ptr_tokenizer.tokenize)

    preprocessor = PreProcessor(ptr_tokenizer, model_config.max_len)

    # Load Model
    config_filepath = ptr_dir / "{}-config.json".format(args.type)
    config = BertConfig.from_pretrained(config_filepath,
                                        output_hidden_states=False)
    model = BIIN(config,
                 vocab,
                 model_config.hidden_size,
                 enc_num_layers=len(model_config.hidden_size))

    # Data Loader
    tr_ds = Corpus(data_config.tr_path,
                   preprocessor.preprocess,
                   sep='\t',
                   doc_col='question1',
                   label_col='is_duplicate',
                   is_pair=True,
                   doc_col_second='question2')
    val_ds = Corpus(data_config.dev_path,
                    preprocessor.preprocess,
Example #24
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_neg_data', type=Path, required=True)
    parser.add_argument('--pregenerated_pos_data', type=Path, required=True)
    parser.add_argument('--validation_neg_data', type=Path, required=True)
    parser.add_argument('--validation_pos_data', type=Path, required=True)
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument('--exp_group', type=str, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--method",
                        type=str,
                        choices=[
                            'neg_samebatch', 'distill_samebatch',
                            'distill_samebatch_lstm', 'distill', 'kl',
                            'unlikelihood'
                        ])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--save_before", action='store_true')
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--max_seq_len", default=512, type=int)

    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--port_idx", type=int)

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--valid_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--kr_freq", default=0.0, type=float)
    parser.add_argument("--mlm_freq", default=0, type=float)
    parser.add_argument("--kl_w", default=1000, type=float)
    parser.add_argument("--ul_w", default=1, type=float)
    parser.add_argument("--gamma",
                        default=0.5,
                        type=float,
                        help="coeff of UL and 1-coeff of LL")
    parser.add_argument('--no_mlm',
                        action='store_true',
                        help="don't do any MLM training")
    parser.add_argument("--no_tie",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--no_ul',
                        action='store_true',
                        help="don't do any UL training")
    parser.add_argument('--no_ll',
                        action='store_true',
                        help="don't do any LL training")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()





    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        print(torch.cuda.is_available())
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print("Num of gpus: ", n_gpu)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        print("GPU Device: ", device)
        n_gpu = 1
        dist_comms.init_distributed_training(args.local_rank, args.port_idx)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

    pt_output = Path(getenv('PT_OUTPUT_DIR', ''))
    args.output_dir = Path(os.path.join(pt_output, args.output_dir))

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if args.bert_model != "roberta-base":
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
    else:
        tokenizer = RobertaTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        tokenizer.vocab = tokenizer.encoder

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    if args.bert_model != "roberta-base":
        if args.method == "neg_samebatch":
            config = BertConfig.from_pretrained(args.bert_model)
            config.bert_model = args.bert_model
            core_model = BertForNegSameBatch.from_pretrained(args.bert_model,
                                                             args.gamma,
                                                             config=config)
            core_model.init_orig_bert()
        elif args.method == "unlikelihood":
            config = BertConfig.from_pretrained(args.bert_model)
            core_model = BertForNegPreTraining.from_pretrained(args.bert_model,
                                                               config=config)
        else:
            raise NotImplementedError(
                f"method {args.method} is not implemented")
    else:
        config = RobertaConfig.from_pretrained(args.bert_model)
        core_model = RobertaForNegPreTraining.from_pretrained(args.bert_model)

    core_model = core_model.to(device)

    # Prepare optimizer
    param_optimizer = list(core_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        core_model, optimizer = amp.initialize(core_model,
                                               optimizer,
                                               opt_level=args.fp16_opt_level)

    model = torch.nn.parallel.DistributedDataParallel(
        core_model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        find_unused_parameters=True)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()

    if args.local_rank == 0 or args.local_rank == -1:
        if args.save_before:
            before_train_path = Path(
                os.path.join(args.output_dir, "before_training"))
            print("Before training path: ", before_train_path)
            before_train_path.mkdir(parents=True, exist_ok=True)
            model.module.save_pretrained(
                os.path.join(args.output_dir, "before_training"))
            tokenizer.save_pretrained(
                os.path.join(args.output_dir, "before_training"))

        # writer = SummaryWriter(log_dir=args.output_dir)
        wandb.init(project="neg_v2",
                   name=str(args.output_dir).split("/")[-1],
                   group=args.exp_group,
                   entity='negation')
        mlm_averagemeter = AverageMeter()
        ul_averagemeter = AverageMeter()
        ll_averagemeter = AverageMeter()
        kl_averagemeter = AverageMeter()

    neg_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    pos_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_pos_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    neg_validation_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.validation_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    pos_validation_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.validation_pos_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    if args.local_rank == -1:
        neg_train_sampler = RandomSampler(neg_epoch_dataset)
        pos_train_sampler = RandomSampler(pos_epoch_dataset)

        neg_valid_sampler = RandomSampler(neg_validation_dataset)
        pos_valid_sampler = RandomSampler(pos_validation_dataset)
    else:
        neg_train_sampler = DistributedSampler(neg_epoch_dataset)
        pos_train_sampler = DistributedSampler(pos_epoch_dataset)

        neg_valid_sampler = DistributedSampler(neg_validation_dataset)
        pos_valid_sampler = DistributedSampler(pos_validation_dataset)

    neg_train_dataloader = DataLoader(neg_epoch_dataset,
                                      sampler=neg_train_sampler,
                                      batch_size=args.train_batch_size)
    pos_train_dataloader = DataLoader(pos_epoch_dataset,
                                      sampler=pos_train_sampler,
                                      batch_size=args.train_batch_size)

    neg_valid_dataloader = DataLoader(neg_validation_dataset,
                                      sampler=neg_valid_sampler,
                                      batch_size=args.valid_batch_size)
    pos_valid_dataloader = DataLoader(pos_validation_dataset,
                                      sampler=pos_valid_sampler,
                                      batch_size=args.valid_batch_size)

    def inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(neg_train_dataloader):
                yield kr_step, kr_batch

    kr_gen = inf_train_gen()

    def pos_inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(pos_train_dataloader):
                yield kr_step, kr_batch

    pos_kr_gen = pos_inf_train_gen()

    mlm_loss, neg_loss = 0, 0
    mlm_nb_it, neg_nb_it = 1, 1
    mlm_nb_ex, neg_nb_ex = 0, 0

    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        ul_tr_loss = 0
        nb_ul_tr_examples, nb_ul_tr_steps = 0, 1
        ll_tr_loss = 0
        nb_ll_tr_examples, nb_ll_tr_steps = 0, 1
        kl_tr_loss = 0
        nb_kl_tr_examples, nb_kl_tr_steps = 0, 1

        if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1
                                                   and args.local_rank == 0):
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            model.module.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                if not args.no_mlm and (random.random() > args.mlm_freq):
                    model.train()
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, lm_label_ids = batch

                    outputs = model(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    masked_lm_labels=lm_label_ids,
                                    negated=False)

                    loss = outputs[1]
                    loss_dict = outputs[0]
                    mlm_loss += loss_dict['mlm'].item()

                    mlm_nb_it += 1
                    mlm_nb_ex += input_ids.size(0)

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()

                    if args.local_rank == 0 or args.local_rank == -1:
                        mlm_averagemeter.update(loss_dict['mlm'].item())
                        # writer.add_scalar('MLM/train', loss_dict['mlm'].item(), mlm_nb_it)
                        wandb.log({'MLM/train': loss_dict['mlm'].item()})

                        nb_tr_steps += 1
                        nb_ll_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps

                        pbar.set_postfix_str(
                            f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}"
                        )

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        scheduler.step()  # Update learning rate schedule
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                pbar.update(1)
                random_num = random.random()
                if random_num > args.kr_freq:
                    if args.method in ["neg_samebatch"]:
                        ul_step, ul_batch = next(kr_gen)
                        ul_batch = tuple(t.to(device) for t in ul_batch)
                        ul_input_ids, ul_input_mask, ul_segment_ids, ul_lm_label_ids = ul_batch

                        ll_step, ll_batch = next(pos_kr_gen)
                        ll_batch = tuple(t.to(device) for t in ll_batch)
                        ll_input_ids, ll_input_mask, ll_segment_ids, ll_lm_label_ids = ll_batch

                        batch_mask = torch.zeros(
                            (ul_input_ids.size(0) + ll_input_ids.size(0)),
                            dtype=ll_input_mask.dtype,
                            device=device)
                        batch_mask[:ul_input_ids.size(0)] = 1.

                        outputs = model(
                            input_ids=torch.cat([ul_input_ids, ll_input_ids],
                                                0),
                            attention_mask=torch.cat(
                                [ul_input_mask, ll_input_mask], 0),
                            token_type_ids=torch.cat(
                                [ul_segment_ids, ll_segment_ids], 0),
                            masked_lm_labels=torch.cat(
                                [ul_lm_label_ids, ll_lm_label_ids], 0),
                            negated=True,
                            batch_neg_mask=batch_mask)

                        loss = outputs[1] * args.ul_w
                        loss_dict = outputs[0]

                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({
                                'UL/train': loss_dict['neg'].item(),
                                'LL/train': loss_dict['pos'].item()
                            })
                            ul_averagemeter.update(loss_dict['neg'].item())
                            ll_averagemeter.update(loss_dict['pos'].item())
                        neg_nb_it += 1

                    elif random.random() > 0.5 and not args.no_ul:
                        kr_step, kr_batch = next(kr_gen)
                        kr_batch = tuple(t.to(device) for t in kr_batch)
                        input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                        outputs = model(input_ids=input_ids,
                                        attention_mask=input_mask,
                                        token_type_ids=segment_ids,
                                        masked_lm_labels=lm_label_ids,
                                        negated=True)

                        loss = outputs[1] * args.ul_w

                        loss_dict = outputs[0]
                        nb_ul_tr_steps += 1

                        neg_loss += loss_dict['neg'].item()
                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({
                                'UL/train':
                                loss_dict['neg'].item(),
                                'KL/train':
                                loss_dict['kl'].item() * args.kl_w
                            })
                            ul_averagemeter.update(loss_dict['neg'].item())
                            kl_averagemeter.update(loss_dict['kl'].item() *
                                                   args.kl_w)

                        neg_nb_it += 1
                    elif not args.no_ll:
                        kr_step, kr_batch = next(pos_kr_gen)
                        kr_batch = tuple(t.to(device) for t in kr_batch)
                        input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                        outputs = model(input_ids=input_ids,
                                        attention_mask=input_mask,
                                        token_type_ids=segment_ids,
                                        masked_lm_labels=lm_label_ids,
                                        negated=False)
                        loss = outputs[1]
                        loss_dict = outputs[0]
                        nb_ll_tr_steps += 1

                        mlm_loss += loss_dict['mlm'].item()

                        mlm_nb_it += 1
                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({'LL/train': loss_dict['mlm'].item()})
                            ll_averagemeter.update(loss_dict['mlm'].item())

                        mlm_nb_ex += input_ids.size(0)
                    else:
                        continue

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()
                    if args.local_rank == 0 or args.local_rank == -1:
                        nb_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(
                            f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}"
                        )
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        scheduler.step()  # update the learning rate schedule after the optimizer step
                        optimizer.zero_grad()
                        global_step += 1
                if (n_gpu > 1 and args.local_rank == -1) or (
                        n_gpu <= 1 and args.local_rank == 0):
                    if False and (step + 1) % 100 == 0:  # periodic validation, currently disabled
                        neg_valid_res = validate(
                            model=model,
                            dataloader=neg_valid_dataloader,
                            device=device,
                            negated=True)
                        pos_valid_res = validate(
                            model=model,
                            dataloader=pos_valid_dataloader,
                            device=device,
                            negated=False)
                        wandb.log({
                            'neg/valid/p@1': neg_valid_res % 100.,
                            'pos/valid/p@1': pos_valid_res % 100.
                        })

    # Save a trained model
    if (n_gpu > 1 and args.local_rank == -1) or (
            n_gpu <= 1 and args.local_rank == 0):
        print("Saving model")
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model.module.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        print(str(wandb.run.id))
        pickle.dump(
            str(wandb.run.id),
            open(os.path.join(args.output_dir, 'wandb_run_id.pkl'), 'wb'))
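
A minimal, self-contained sketch of the update pattern in the training loop above: scale the loss by the accumulation factor, clip gradients, and only step the optimizer (and then the scheduler) once every few batches. The tiny model, data, and hyperparameter names below are placeholders for illustration, not taken from the script above.

import torch

torch.manual_seed(0)
tiny_model = torch.nn.Linear(8, 2)
optimizer = torch.optim.AdamW(tiny_model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 0.95 ** step)
accum_steps, max_grad_norm = 4, 1.0

for step in range(16):
    x = torch.randn(4, 8)
    y = torch.randint(0, 2, (4,))
    loss = torch.nn.functional.cross_entropy(tiny_model(x), y)
    (loss / accum_steps).backward()  # accumulate scaled gradients
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(tiny_model.parameters(), max_grad_norm)
        optimizer.step()       # update the weights first ...
        scheduler.step()       # ... then advance the learning-rate schedule
        optimizer.zero_grad()
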
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = BertForTagRankingLate.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = BertForTagRanking.from_config(config)

    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'

    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
        tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size)

    training_args.per_device_eval_batch_size = 1  # force the eval batch size to 1
    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        dataloader = trainer.get_eval_dataloader(eval_dataset)
        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", batch_size)
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(
                dataloader,
                [training_args.device]).per_device_loader(training_args.device)

        results = {}
        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            feature = convert_example_to_feature(example, tokenizer,
                                                 data_args.block_size)
            image_ids = torch.tensor([feature['image_ids']],
                                     dtype=torch.long).to(training_args.device)
            location_ids = torch.tensor([feature['location_ids']],
                                        dtype=torch.long).to(
                                            training_args.device)
            time_ids = torch.tensor([feature['time_ids']],
                                    dtype=torch.long).to(training_args.device)
            text_ids = torch.tensor([feature['text_ids']],
                                    dtype=torch.long).to(training_args.device)
            pid = feature['pid']
            inputs = {
                'image_ids': image_ids,
                'location_ids': location_ids,
                'time_ids': time_ids,
                'text_ids': text_ids
            }
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs[0]

                logit_for_cls = logits[0]

                orig_vocab_size = 30522
                added_special_toks_size = 3  # <img>, <loc>, <time>
                # Mask out the original vocabulary and the added special tokens
                # so that top-k can only return the appended hashtag tokens.
                logit_for_cls[:orig_vocab_size +
                              added_special_toks_size] = -float('inf')

                probabilities = F.softmax(logit_for_cls, 0).detach().cpu()

                probs, predicted_indices = torch.topk(probabilities, k=10)

                predicted_tokens = tokenizer.convert_ids_to_tokens(
                    predicted_indices)

                while pid in results:
                    pid = pid + '_'
                results[pid] = predicted_tokens

        results_save_path = os.path.join(training_args.output_dir,
                                         'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)
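
A minimal sketch of the prediction step above: positions belonging to the original vocabulary (plus the added special tokens) are set to -inf so that softmax and top-k can only select the appended hashtag tokens. The vocabulary sizes and hashtag strings below are made up for illustration.

import torch
import torch.nn.functional as F

orig_vocab_size = 6    # stands in for BERT's 30,522 original entries
added_special = 2      # stands in for <img>, <loc>, <time>
hashtags = ["#beach", "#sunset", "#food", "#travel"]

logits = torch.randn(orig_vocab_size + added_special + len(hashtags))
logits[:orig_vocab_size + added_special] = -float('inf')

probs = F.softmax(logits, dim=0)
top_probs, top_idx = torch.topk(probs, k=2)
predicted = [hashtags[int(i) - orig_vocab_size - added_special] for i in top_idx]
print(predicted, top_probs.tolist())
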
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Model parameters %s", model_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = BertConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    config.loss_fct = model_args.loss_fct

    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = BertForTagGeneration.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir)
    else:
        logger.info("Training new model from scratch")
        model = BertForTagGeneration.from_config(config)

    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'

    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
        tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    neptune_project_name = 'junmokang/bertinsta'
    neptune_experiment_name = 'bertinsta-generation'

    if not training_args.do_eval:
        if is_torch_tpu_available():
            if xm.get_ordinal() == 0:
                neptune.init(neptune_project_name)
                neptune.create_experiment(name=neptune_experiment_name,
                                          params=training_args.__dict__)
        else:
            neptune.init(neptune_project_name)
            neptune.create_experiment(name=neptune_experiment_name,
                                      params=training_args.__dict__)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        loss_fct=model_args.loss_fct) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size,
                                                 loss_fct=model_args.loss_fct)

    training_args.per_device_eval_batch_size = 1  # force the eval batch size to 1
    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      neptune=neptune,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        dataloader = trainer.get_eval_dataloader(eval_dataset)
        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", batch_size)
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(
                dataloader,
                [training_args.device]).per_device_loader(training_args.device)

        results = {}
        grouping_results = {}
        # interaction_matrix = np.zeros((6, 6)) # feature interaction
        beam_width = 1
        top_k = 10

        # tag to contexts mapping
        context_list = [
            'emotion', 'mood', 'location', 'time', 'object', 'activity',
            'event', 'others'
        ]
        context2ids = {c: [] for c in context_list}
        if data_args.tag2contexts:
            with open(data_args.tag2contexts) as f:
                tag2contexts = json.load(f)
                for tag, contexts in tag2contexts.items():
                    for c in contexts:
                        context2ids[c].append(tag)
                for c in context_list:
                    context2ids[c] = tokenizer.convert_tokens_to_ids(
                        context2ids[c])

        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            generated_tags = beam_decode(beam_width, top_k, model, example,
                                         tokenizer, data_args.block_size,
                                         training_args.device)
            # generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer, data_args.block_size, training_args.device, None, interaction_matrix) # feature interaction
            results[example['pid']] = generated_tags
            grouping_results[example['pid']] = {}
            grouping_results[example['pid']]['all'] = generated_tags
            # print('all:', str(generated_tags))

            # diverse generation (according to context)
            if data_args.tag2contexts:
                for context in context_list:
                    generated_tags = beam_decode(beam_width, top_k, model,
                                                 example, tokenizer,
                                                 data_args.block_size,
                                                 training_args.device,
                                                 context2ids[context])
                    grouping_results[example['pid']][context] = generated_tags
                    # print(context, ':', str(generated_tags))

        # with np.printoptions(precision=2, suppress=True): # feature interaction
        #         print(interaction_matrix)
        #         print(interaction_matrix.sum(1))
        #         print(interaction_matrix / interaction_matrix.sum(1))

        results_save_path = os.path.join(training_args.output_dir,
                                         'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)

        grouping_results_save_path = os.path.join(training_args.output_dir,
                                                  'grouping_results.json')
        with open(grouping_results_save_path, 'w') as f:
            logger.info("saved grouping_results.json into %s",
                        training_args.output_dir)
            json.dump(grouping_results, f)
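
Both scripts above extend BERT's vocabulary before training. A minimal sketch of that pattern, assuming network access to download bert-base-uncased; the tag list here is a stand-in for the file read from data_args.tag_list.

from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

new_tokens = ['<img>', '<loc>', '<time>', '#sunset', '#foodie']
num_added = tokenizer.add_tokens(new_tokens)
print('added', num_added, 'tokens; vocab size is now', len(tokenizer))

# Without this call the new token ids would index past the end of the
# model's embedding matrix.
model.resize_token_embeddings(len(tokenizer))
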
Example #27
0
def run_bert_mwa_torch(args):
    vocab_file_path = os.path.join(
        bert_mwa_config.get("bert_pretrained_model_path"),
        bert_mwa_config.get("vocab_file"))
    bert_config_file = os.path.join(
        bert_mwa_config.get("bert_pretrained_model_path"),
        bert_mwa_config.get("bert_config_path"))

    slot_file = os.path.join(
        bert_mwa_config.get("slot_list_root_path"),
        bert_mwa_config.get("bert_slot_complete_file_name"))
    data_loader = bertWordPrepareData(vocab_file_path, slot_file,
                                      bert_mwa_config, None, 384, None, None,
                                      False, False, True)
    label2id = data_loader.tokenizer.slot2id
    train_features = data_loader.load_cache_train_dev_data()
    train_dataset = create_dataset_for_torch(train_features)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    valid_features = data_loader.load_cache_train_dev_data(False)
    valid_dataset = create_dataset_for_torch(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.valid_batch_size)
    model = BertForMWA  # keep the class; it is instantiated below via from_pretrained or a config
    device = torch.device("cuda:0")

    if args.do_train:
        model = model.from_pretrained(
            bert_mwa_config.get("bert_pretrained_model_path"),
            device=device,
            label2ids=label2id)
        model = model.to(device)
        if data_loader.train_samples_nums % args.train_batch_size != 0:
            each_epoch_steps = int(
                data_loader.train_samples_nums / args.train_batch_size) + 1
        else:
            each_epoch_steps = int(data_loader.train_samples_nums /
                                   args.train_batch_size)
        train_steps_nums = each_epoch_steps * args.epochs
        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.001,
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        # optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-6)
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.lr,
                             max_grad_norm=args.clip_norm,
                             warmup=0.1,
                             t_total=train_steps_nums)
        # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", data_loader.train_samples_nums)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", train_steps_nums)

        trainer(model, optimizer, train_dataloader, valid_dataloader,
                args.epochs, train_steps_nums, each_epoch_steps,
                data_loader.tokenizer.id2slot, device, logger, args)
    if args.do_test:
        test_features = data_loader.load_cache_train_dev_data(False, True)
        test_dataset = create_dataset_for_torch(test_features)
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)
        bertconfig = BertConfig.from_pretrained(
            bert_mwa_config.get("bert_pretrained_model_path"))
        model = BertForMWA(bertconfig, label2id, device)
        model.load_state_dict(state_dict=torch.load(
            bert_mwa_config.get(args.model_checkpoint_dir) +
            "/pytorch_model.bin"))
        # model = model.from_pretrained(,device=device,label2ids=label2id)
        model.to(device)
        predict_all_and_evaluate(model, test_dataloader,
                                 data_loader.tokenizer.id2slot, device, logger,
                                 "data/orig_data_test.txt", args)
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    if torch.cuda.is_available():
        print("current device: ", torch.cuda.current_device())

    # special token
    SOPH = '<soph>'
    NSOPH = '<nsoph>'

    config = BertConfig.from_pretrained('bert-base-uncased')

    # constant the seed
    SEED = 1234

    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    num_added_token = tokenizer.add_tokens([SOPH, NSOPH])

    INPUT_DIM = len(tokenizer)  # len(SRC.vocab)
    OUTPUT_DIM = len(tokenizer)  # len(TRG.vocab)
    HID_DIM = 768
    DEC_LAYERS = 3
    DEC_HEADS = 8
    DEC_PF_DIM = 512
    ENC_DROPOUT = 0.1
    DEC_DROPOUT = 0.1
    SRC_PAD_IDX = 0
    TRG_PAD_IDX = 0
    BATCH_SIZE = 100
    MAX_SEQ_LEN = 50
    N_EPOCHS = 5
    CLIP = 1
    LEARNING_RATE = 0.0005
    SAVE_PATH = 'tut6-model.pt'
    LOAD_PATH = 'tut6-model.pt'

    unfreeze_bert = False
    do_load = False

    do_train = False
    do_eval = False
    do_generate = True

    dec = Decoder(OUTPUT_DIM,
                  HID_DIM,
                  DEC_LAYERS,
                  DEC_HEADS,
                  DEC_PF_DIM,
                  DEC_DROPOUT,
                  device)

    model = Seq2Seq(dec, SRC_PAD_IDX, TRG_PAD_IDX, config, device).to(device)

    # Resize tokenizer
    model.bert_encoder.resize_token_embeddings(len(tokenizer))

    model.decoder.apply(initialize_weights)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

    best_valid_loss = float('inf')

    processor = DiscoFuseProcessor()

    valid_iterator, num_val_ex = make_DataLoader(data_dir='./',
                                                 processor=processor,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=MAX_SEQ_LEN,
                                                 batch_size=BATCH_SIZE,
                                                 mode="dev",
                                                 SOPH=SOPH,
                                                 NSOPH=NSOPH,
                                                 domain="sports")

    if do_train:
        for param in model.bert_encoder.parameters():
            param.requires_grad = unfreeze_bert

        print(f'The model has {count_parameters(model):,} trainable parameters')

        train_iterator, num_tr_ex = make_DataLoader(data_dir='./',
                                                    processor=processor,
                                                    tokenizer=tokenizer,
                                                    max_seq_length=MAX_SEQ_LEN,
                                                    batch_size=BATCH_SIZE,
                                                    mode="train",
                                                    SOPH=SOPH,
                                                    NSOPH=NSOPH)

        print("---- Begin Training ----")
        if do_load and os.path.exists(LOAD_PATH):
            print("---- Loading model from {} ----".format(LOAD_PATH))
            model.load_state_dict(torch.load(LOAD_PATH))

        for epoch in range(N_EPOCHS):

            start_time = time.time()

            num_batches_in_epoch = int(num_tr_ex/BATCH_SIZE)  # 10000

            train_loss = train(model, train_iterator, optimizer, criterion, CLIP,  num_batches_in_epoch, device=device)
            valid_loss, valid_exact = evaluate(model, valid_iterator, criterion, device=device, tokenizer=tokenizer)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), SAVE_PATH)

            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
            print(f'\t Val. EXACT: {valid_exact:.2f}')

    elif do_eval:
        print("Doing only evaluation")
        model.load_state_dict(torch.load(LOAD_PATH))
        valid_loss, valid_exact = evaluate(model, valid_iterator, criterion, device=device, tokenizer=tokenizer)
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. EXACT: {valid_exact:3.3f}')

    elif do_generate:
        print("Doing only generation")
        model.load_state_dict(torch.load(LOAD_PATH))
        all_predictions, all_trgs, all_counter_predictions = generate(model, valid_iterator, device, tokenizer)
        all_counter_pred_str = [" ".join(a).replace(" ##", "") for a in all_counter_predictions]
        all_pred_str = [" ".join(a).replace(" ##", "") for a in all_predictions]
        all_trgs_str = [" ".join(a).replace(" ##", "") for a in all_trgs]
        with open("generated_fuse.txt", 'a') as fp:
            for i in range(len(all_predictions)):
                counter_pred_line = "Counter pred: " + all_counter_pred_str[i] + "\n"
                pred_line = "Origin pred:  " + all_pred_str[i] + "\n"
                trg_line = "origin trg:   " + all_trgs_str[i] + "\n\n"
                fp.writelines(counter_pred_line)
                fp.writelines(pred_line)
                fp.writelines(trg_line)

    else:
        raise ValueError("Error - must either train evaluate, or generate!")
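
A minimal sketch of the freeze/unfreeze switch used above: the pretrained encoder's parameters get requires_grad toggled, only the decoder is updated, and the trainable-parameter count reflects that. The two Linear layers below stand in for the BERT encoder and the Transformer decoder.

import torch

encoder = torch.nn.Linear(32, 32)   # stands in for the BERT encoder
decoder = torch.nn.Linear(32, 10)   # stands in for the Transformer decoder

unfreeze_encoder = False
for p in encoder.parameters():
    p.requires_grad = unfreeze_encoder

all_params = list(encoder.parameters()) + list(decoder.parameters())
trainable = sum(p.numel() for p in all_params if p.requires_grad)
print(f'The model has {trainable:,} trainable parameters')

# Frozen parameters receive no gradient; passing only the trainable ones
# to the optimizer makes the intent explicit.
optimizer = torch.optim.Adam((p for p in all_params if p.requires_grad), lr=5e-4)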