Example no. 1
0
    def __init__(self):
        super().__init__()
        self.bert = BertForMaskedLM.from_pretrained("bert-base-uncased")
Example no. 2
0
def init():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=False,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=False,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--do_evaluate",
        type=bool,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument("--do_train", type=bool, default=False, help="Run training.")
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=bool,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--encoder_model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder's weights with.",
    )
    parser.add_argument(
        "--decoder_model_name_or_path",
        default="/data/zhuoyu/semantic_parsing/models",
        type=str,
        help="The model checkpoint to initialize the decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--to_cpu", default=False, type=bool, help="Whether to force training on CPU."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for eval.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--input_block_size",
        default=256,
        type=int,
        help="Max seq length for input",
    )
    parser.add_argument(
        "--output_block_size",
        default=128,
        type=int,
        help="Max seq length for output",
    )

    parser.add_argument(
        "--trained_checkpoints",
        default="/data/zhuoyu/semantic_parsing/chemistry_bert_parser_binary",
        type=str,
        help="trained_checkpoints",
    )

    parser.add_argument(
        "--decoding_type",
        default="decoding",
        type=str,
        help="",
    )

    parser.add_argument(
        "--encoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--decoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--encoder_warmup",
        default=10,
        type=int,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--decoder_warmup",
        default=100,
        type=int,
        help="encoder's learning rate",
    )

    parser.add_argument("--seed", default=42, type=int)
    args = parser.parse_args()


    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        print(args.n_gpu)

    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
    tokenizer = AutoTokenizer.from_pretrained(
        args.encoder_model_name_or_path,
        never_split=['[unused0]', '[unused1]', '[unused2]', '[unused3]'])



    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )

    logger.info("Training/evaluation parameters %s", args)
    checkpoint = args.trained_checkpoints
    encoder_checkpoint = os.path.join(checkpoint, "encoder")
    decoder_checkpoint_question_varibles = os.path.join(checkpoint, "decoder_0")
    decoder_checkpoint_conditions = os.path.join(checkpoint, "decoder_1")

    decoder_models = [BertForMaskedLM.from_pretrained(decoder_checkpoint_question_varibles),
                      BertForMaskedLM.from_pretrained(decoder_checkpoint_conditions)]
    model = Model2Models.from_pretrained(
        encoder_checkpoint, decoder_model=decoder_models
    )

    model.to(args.device)
    model.eval()

    processor = ChemistryProcessor()

    return args, model, tokenizer, processor
Example no. 3
0
from flask import Flask, render_template, request, url_for, jsonify, make_response
from flask_cors import CORS
import json
import torch
import wordfreq
from transformers import BertTokenizer, BertForMaskedLM

app = Flask(__name__)

# https://stackoverflow.com/questions/37575089/disable-template-cache-jinja2
app.config['TEMPLATES_AUTO_RELOAD'] = True
CORS(app)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def prepareInputs(init_text):
    # Punctuation that marks the end of a segment
    punc_list = [".", "?", "!"]
    # Prepend the [CLS] tag
    prompt_text = "[CLS] " + init_text
    # Insert a [SEP] tag after every sentence-ending punctuation mark.
    # Build a new string so the inserted tags do not shift the positions
    # that still have to be inspected.
    pieces = []
    for ch in prompt_text:
        pieces.append(ch)
        if ch in punc_list:
            pieces.append(" [SEP]")
    prompt_text = "".join(pieces)

    return prompt_text
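
# Illustration (hypothetical input, not taken from the original script):
#   prepareInputs("Hello there. How are you?")
#   -> "[CLS] Hello there. [SEP] How are you? [SEP]"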

def createSegIDs(tokenized_text):
    currentSeg = 0
Example no. 4
0
def train_mlm(lr=3e-5,
              epoch=20,
              save_epoch_cnt=2,
              save_batch_cnt=500,
              mult_batch=True,
              mult_cnt=batch_mult_cnt):
    model = BertForMaskedLM.from_pretrained('../' + model_path)
    masked_ids, token_type_idss, attention_masks, input_idss = \
        pickle.load(open('../preprocess/train_data_for_mlm.pk', 'rb'))
    batch_cnt = len(masked_ids)
    print('data preparation finished, ' + str(len(masked_ids)) +
          ' batch in total')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    for i_epoch in range(epoch):
        model.train()
        epoch_total_loss = 0.0
        temp_batch_cnt = 0
        accu_loss = None
        epoch_run_time = 0
        for i_batch, masked_batch in enumerate(masked_ids):
            time_start = time.time()
            masked_batch = masked_batch.cuda()
            input_ids = input_idss[i_batch].cuda()
            token_type_ids = token_type_idss[i_batch].cuda()
            attention_mask = attention_masks[i_batch].cuda()
            if mult_batch:
                outputs = model(input_ids=masked_batch,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask,
                                labels=input_ids)
                if accu_loss is None:
                    accu_loss = outputs[0]
                else:
                    accu_loss = accu_loss + outputs[0]
                temp_batch_cnt += 1
                if temp_batch_cnt >= mult_cnt:
                    temp_batch_cnt = 0
                    accu_loss.backward()
                    optimizer.step()
                    model.zero_grad()
                    epoch_total_loss += accu_loss.float()
                    # record time
                    time_end = time.time()
                    epoch_run_time += time_end - time_start
                    speed = epoch_run_time / (i_batch + 1)
                    eta = int((batch_cnt - (i_batch + 1)) * speed)
                    print(
                        f'epoch:{i_epoch + 1} batch:{i_batch + 1} loss:{accu_loss.float()} avg loss:{epoch_total_loss / (i_batch + 1)} ETA:{eta // 3600}:{(eta % 3600) // 60}:{eta % 60}'
                    )
                    # Reset the accumulated loss: its graph has just been
                    # freed by backward(), so the next group of batches must
                    # start from scratch.
                    accu_loss = None
            else:
                model.zero_grad()
                outputs = model(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask,
                                labels=masked_batch)
                loss = outputs[0]
                loss.backward()
                optimizer.step()
                epoch_total_loss += loss.float()
                # record time
                time_end = time.time()
                epoch_run_time += time_end - time_start
                speed = epoch_run_time / (i_batch + 1)
                eta = int((batch_cnt - (i_batch + 1)) * speed)
                print(
                    f'epoch:{i_epoch + 1} batch:{i_batch + 1} loss:{loss.float()} avg loss:{epoch_total_loss / (i_batch + 1)} ETA:{eta // 3600}:{(eta % 3600) // 60}:{eta % 60}'
                )

            if (i_batch + 1) % save_batch_cnt == 0:
                save_name = '../' + model_path + '_ContTrain_epoch_' + str(
                    i_epoch + 1) + '_batch_' + str(i_batch +
                                                   1) + '_bsz_' + str(mlm_bsz)
                print('saving models as:' + save_name)
                model.bert.save_pretrained(save_name)

        if (i_epoch + 1) % save_epoch_cnt == 0:
            save_name = '../' + model_path + '_ContTrain_epoch_' + str(
                i_epoch + 1) + '_bsz_' + str(mlm_bsz)
            print('saving models as:' + save_name)
            model.bert.save_pretrained(save_name)
Example no. 5
0
df_lineage = pd.read_csv(lineage[1], sep='\t', names=["Repo"])[:20000]

print("CSVs loaded")

docstrings_avg_vec = np.load(docstrings_vecs[1], allow_pickle=True)

config = BertConfig.from_json_file(model_path[1] + '/config.json')
config.output_hidden_states = True

print("Tokenizer and model initialized")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

device = torch.device('cpu')

model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config)
model.load_state_dict(
    torch.load(model_path[1] + "/pytorch_model.bin", map_location=device))
model.eval()

# Initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(docstrings_avg_vec)
index.createIndex({'post': 2}, print_progress=True)

print("Index made")


# Routes:
@app.route('/hello')
def hello_world():
    return 'Hello, world!'  # placeholder response


def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--do_evaluate",
        type=bool,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument("--do_train",
                        type=bool,
                        default=False,
                        help="Run training.")
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=bool,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--encoder_model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder's weights with.",
    )
    parser.add_argument(
        "--decoder_model_name_or_path",
        default="/data/zhuoyu/semantic_parsing/models",
        type=str,
        help="The model checkpoint to initialize the decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--to_cpu",
                        default=False,
                        type=bool,
                        help="Whether to force training on CPU.")
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for eval.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--input_block_size",
        default=256,
        type=int,
        help="Max seq length for input",
    )
    parser.add_argument(
        "--output_block_size",
        default=64,
        type=int,
        help="Max seq length for output",
    )

    parser.add_argument(
        "--trained_checkpoints",
        default="",
        type=str,
        help="trained_checkpoints",
    )

    parser.add_argument(
        "--decoding_type",
        default="pnt",
        type=str,
        help="",
    )

    parser.add_argument(
        "--encoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--decoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--encoder_warmup",
        default=10,
        type=int,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--decoder_warmup",
        default=100,
        type=int,
        help="encoder's learning rate",
    )

    parser.add_argument("--seed", default=42, type=int)

    parser.add_argument(
        "--decoder_version",
        default="v1",
        type=str,
        help="",
    )

    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.do_overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite."
            .format(args.output_dir))

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        print(args.n_gpu)

    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
    tokenizer = AutoTokenizer.from_pretrained(
        args.encoder_model_name_or_path,
        never_split=['[unused0]', '[unused1]', '[unused2]', '[unused3]'])
    #config = BertConfig.from_pretrained(args.model_name_or_path)
    #config.num_hidden_layers=3
    #config.is_decoder=True
    #decoder_model = BertForMaskedLM(config)
    decoder_models = [
        BertForMaskedLM.from_pretrained(args.decoder_model_name_or_path),
        BertForMaskedLM.from_pretrained(args.decoder_model_name_or_path)
    ]
    model = Model2Models.from_pretrained(args.encoder_model_name_or_path,
                                         decoder_model=decoder_models)
    #model = Model2Model.from_pretrained(
    #    args.model_name_or_path, decoder_model=None
    #)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )

    logger.info("Training/evaluation parameters %s", args)

    # Train the model

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.do_train:
        model.to(args.device)
        global_step, tr_loss = train(args, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir,
                                      "training_arguments.bin"))

    # Evaluate the model
    results = {}
    if args.do_evaluate:
        checkpoints = [args.trained_checkpoints]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            encoder_checkpoint = os.path.join(checkpoint, "encoder")
            decoder_checkpoint_question_varibles = os.path.join(
                checkpoint, "decoder_0")
            decoder_checkpoint_conditions = os.path.join(
                checkpoint, "decoder_1")

            decoder_models = [
                BertForMaskedLM.from_pretrained(
                    decoder_checkpoint_question_varibles),
                BertForMaskedLM.from_pretrained(decoder_checkpoint_conditions)
            ]
            model = Model2Models.from_pretrained(encoder_checkpoint,
                                                 decoder_model=decoder_models)

            model.to(args.device)

            #model = PreTrainedEncoderDecoder.from_pretrained(
            #    encoder_checkpoint, decoder_checkpoint
            #)
            #model = Model2Model.from_pretrained(encoder_checkpoint)
            #model.to(args.device)
            results = "placeholder"

            evaluate(args, model, tokenizer, "test")

    return results
Example no. 7
0
        self.data = fact_ru + wiki_ru + lenta_ru

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.tokenizer.encode_plus(self.data[i],
                                          **self.tokenizer_params)


#check()

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased",
                                          max_len=128)
model_config = BertConfig.from_json_file("config.json")
model = BertForMaskedLM(model_config)
#model = BertForMaskedLM.from_pretrained("outputs/checkpoint-15000")

dataset = RuDataset(tokenizer)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

trainer = Trainer(
    model,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=tokenizer,
    # prediction_loss_only=True,
    args=TrainingArguments(output_dir="outputs",
Example no. 8
0
def main():
    cfg = args_parse()

    # If the training file does not exist, preprocess the data first
    if not os.path.exists(cfg.DATASETS.TRAIN):
        logger.debug('preprocess data')
        preprocess.main()
    logger.info(f'load model, model arch: {cfg.MODEL.NAME}')
    tokenizer = BertTokenizer.from_pretrained(cfg.MODEL.BERT_CKPT)
    collator = DataCollator(tokenizer=tokenizer)
    # Load the data
    train_loader, valid_loader, test_loader = make_loaders(
        collator,
        train_path=cfg.DATASETS.TRAIN,
        valid_path=cfg.DATASETS.VALID,
        test_path=cfg.DATASETS.TEST,
        batch_size=cfg.SOLVER.BATCH_SIZE,
        num_workers=4)
    if cfg.MODEL.NAME == 'softmaskedbert4csc':
        model = SoftMaskedBert4Csc(cfg, tokenizer)
    elif cfg.MODEL.NAME == 'macbert4csc':
        model = MacBert4Csc(cfg, tokenizer)
    else:
        raise ValueError("model not found.")
    # Load a previously saved checkpoint and continue training from it
    if cfg.MODEL.WEIGHTS and os.path.exists(cfg.MODEL.WEIGHTS):
        model = model.load_from_checkpoint(checkpoint_path=cfg.MODEL.WEIGHTS,
                                           cfg=cfg,
                                           map_location=device,
                                           tokenizer=tokenizer)
    # Configure model checkpoint saving
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    ckpt_callback = ModelCheckpoint(monitor='val_loss',
                                    dirpath=cfg.OUTPUT_DIR,
                                    filename='{epoch:02d}-{val_loss:.2f}',
                                    save_top_k=1,
                                    mode='min')
    # Train the model
    logger.info('train model ...')
    trainer = pl.Trainer(
        max_epochs=cfg.SOLVER.MAX_EPOCHS,
        gpus=None if device == torch.device('cpu') else cfg.MODEL.GPU_IDS,
        accumulate_grad_batches=cfg.SOLVER.ACCUMULATE_GRAD_BATCHES,
        callbacks=[ckpt_callback])
    # Run training
    # (only when train_loader actually contains data)
    torch.autograd.set_detect_anomaly(True)
    if 'train' in cfg.MODE and train_loader and len(train_loader) > 0:
        if valid_loader and len(valid_loader) > 0:
            trainer.fit(model, train_loader, valid_loader)
        else:
            trainer.fit(model, train_loader)
        logger.info('train model done.')
    # Convert the model so it can be loaded with transformers
    if ckpt_callback and len(ckpt_callback.best_model_path) > 0:
        ckpt_path = ckpt_callback.best_model_path
    elif cfg.MODEL.WEIGHTS and os.path.exists(cfg.MODEL.WEIGHTS):
        ckpt_path = cfg.MODEL.WEIGHTS
    else:
        ckpt_path = ''
    logger.info(f'ckpt_path: {ckpt_path}')
    if ckpt_path and os.path.exists(ckpt_path):
        model.load_state_dict(torch.load(ckpt_path)['state_dict'])
        # First save the original transformers BERT model
        tokenizer.save_pretrained(cfg.OUTPUT_DIR)
        bert = BertForMaskedLM.from_pretrained(cfg.MODEL.BERT_CKPT)
        bert.save_pretrained(cfg.OUTPUT_DIR)
        state_dict = torch.load(ckpt_path)['state_dict']
        new_state_dict = OrderedDict()
        if cfg.MODEL.NAME in ['macbert4csc']:
            for k, v in state_dict.items():
                if k.startswith('bert.'):
                    new_state_dict[k[5:]] = v
        else:
            new_state_dict = state_dict
        # Then save the fine-tuned weights, replacing the original pytorch_model.bin
        torch.save(new_state_dict,
                   os.path.join(cfg.OUTPUT_DIR, 'pytorch_model.bin'))
    # Testing follows the same logic as training
    if 'test' in cfg.MODE and test_loader and len(test_loader) > 0:
        trainer.test(model, test_loader)
Example no. 9
0
    SAVE_MODEL + "/merges.txt",
)

tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = BertTokenizer.from_pretrained(SAVE_MODEL, max_len=512)
model = BertForMaskedLM(config=config)

print(model.num_parameters())

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PATH + "/kant.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir=SAVE_MODEL,
    overwrite_output_dir=True,
Example no. 10
0
def main():
    parser = argparse.ArgumentParser()
    # Model and data are required
    parser.add_argument(
        "--dir_pretrained_model",
        type=str,
        required=True,
        help=
        "Dir containing pre-trained model (checkpoint), which may have been fine-tuned already."
    )

    # Required for certain modes (--resume, --do_train, --eval_during_training, --do_eval or --do_pred)
    parser.add_argument(
        "--dir_train",
        type=str,
        help=
        ("Dir containing training data (n files named <lang>.train containing unlabeled text)"
         ))
    parser.add_argument(
        "--dir_output",
        type=str,
        help=
        "Directory in which model will be written (required if --do_train (but not --resume) or --do_pred)"
    )
    parser.add_argument(
        "--path_dev",
        type=str,
        help="Path of 2-column TSV file containing labeled validation examples."
    )
    parser.add_argument(
        "--path_test",
        type=str,
        required=False,
        help="Path of text file containing unlabeled test examples.")
    # Execution modes
    parser.add_argument(
        "--resume",
        action="store_true",
        help=
        "Resume training model in --dir_pretrained_model (note: --dir_output will be ignored)"
    )
    parser.add_argument("--do_train", action="store_true", help="Run training")
    parser.add_argument("--eval_during_training",
                        action="store_true",
                        help="Run evaluation on dev set during training")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Evaluate model on dev set")
    parser.add_argument("--do_pred",
                        action="store_true",
                        help="Run prediction on test set")

    # Score to optimize on dev set (by early stopping)
    parser.add_argument(
        "--score_to_optimize",
        choices=["track1", "track2", "track3"],
        default="track3",
        help="Score to optimize on dev set during training (by early stopping)."
    )

    # Hyperparameters
    parser.add_argument(
        "--freeze_encoder",
        action="store_true",
        help=
        "Freeze weights of pre-trained encoder. (Note: in this case, we do not keep doing MLM.)"
    )
    parser.add_argument(
        "--no_mlm",
        action="store_true",
        help=
        "Do not keep doing masked language modeling (MLM) during fine-tuning.")
    parser.add_argument(
        "--sampling_alpha",
        type=float,
        default=1.0,
        help=
        "Dampening factor for relative frequencies used to compute language sampling probabilities"
    )
    parser.add_argument(
        "--weight_relevant",
        type=float,
        default=1.0,
        help=
        "Relative sampling frequency of relevant languages wrt irrelevant languages"
    )
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument(
        "--seq_len",
        default=128,
        type=int,
        help=
        "Length of input sequences. Shorter seqs are padded, longer ones are trucated"
    )
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for AdamW optimizer.")
    parser.add_argument("--equal_betas",
                        action='store_true',
                        help="Use beta1=beta2=0.9 for AdamW optimizer.")
    parser.add_argument(
        "--correct_bias",
        action='store_true',
        help=
        "Correct bias in AdamW optimizer (correct_bias=False is meant to reproduce BERT behaviour exactly."
    )
    parser.add_argument(
        "--max_train_steps",
        default=1000000,
        type=int,
        help=
        "Maximum number of training steps to perform. Note: # optimization steps = # train steps / # accumulation steps."
    )
    parser.add_argument(
        "--num_train_steps_per_epoch",
        default=1000,
        type=int,
        help=
        "Number of training steps that equals one epoch. Note: # optimization steps = # train steps / # accumulation steps."
    )
    parser.add_argument(
        '--grad_accum_steps',
        type=int,
        default=1,
        help=
        "Number of training steps (i.e. batches) to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        default=-1,
        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # Distributed or parallel?
    if args.local_rank != -1 or args.num_gpus > 1:
        raise NotImplementedError(
            "No distributed or parallel training available at the moment.")
    if torch.cuda.is_available():
        args.device = torch.device("cuda")
        args.n_gpu = 1
    else:
        args.device = torch.device("cpu")
        args.n_gpu = 0

    # Check execution mode
    assert args.resume or args.do_train or args.do_eval or args.do_pred
    if args.resume:
        assert not args.do_train
        assert not args.do_eval
        assert not args.do_pred

    # Load checkpoint. This contains a pre-trained model which may or
    # may not have been fine-tuned for language identification already
    logger.info("Loading checkpoint...")
    checkpoint_path = os.path.join(args.dir_pretrained_model, "checkpoint.tar")
    checkpoint_data = torch.load(checkpoint_path)
    if args.resume:
        # Check progress
        logger.info("Resuming training. Currently at training step %d" %
                    checkpoint_data["global_step"])
        # Replace args with initial args for this job, except for
        # num_gpus, seed and model directory
        current_num_gpus = args.n_gpu
        current_dir_pretrained_model = args.dir_pretrained_model
        args = deepcopy(checkpoint_data["initial_args"])
        args.num_gpus = current_num_gpus
        args.dir_pretrained_model = current_dir_pretrained_model
        args.resume = True
        logger.info("Args (most have been reloaded from checkpoint): %s" %
                    args)
    else:
        if args.eval_during_training:
            assert args.do_train
        if args.do_train or args.do_pred:
            assert args.dir_output is not None
            if os.path.exists(args.dir_output) and os.path.isdir(
                    args.dir_output) and len(os.listdir(args.dir_output)) > 1:
                msg = "%s already exists and is not empty" % args.dir_output
                raise ValueError(msg)
            if not os.path.exists(args.dir_output):
                os.makedirs(args.dir_output)
        if args.do_train:
            assert args.dir_train is not None
            train_paths = glob.glob(os.path.join(args.dir_train, "*.train"))
            assert len(train_paths) > 0
            checkpoint_data["initial_args"] = args
        if args.do_train and args.freeze_encoder and not args.no_mlm:
            logger.warning(
                "Setting --no_mlm to True since --freeze_encoder is True, therefore doing MLM would be pointless."
            )
            args.no_mlm = True
    if args.do_eval or args.eval_during_training:
        assert args.path_dev is not None
        assert os.path.exists(args.path_dev)
    if args.do_pred:
        assert args.path_test is not None
        assert os.path.exists(args.path_test)
    if args.grad_accum_steps < 1:
        raise ValueError(
            "Invalid grad_accum_steps parameter: {}, should be >= 1".format(
                args.grad_accum_steps))

    # Create list of languages we handle
    lang_list = sorted(ALL_LANGS)

    # Seed RNGs
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Load tokenizer
    logger.info("Loading tokenizer...")
    tokenizer_path = os.path.join(args.dir_pretrained_model, "tokenizer.pkl")
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    # Make encoder and model
    logger.info("Making encoder...")
    encoder_config = BertConfig.from_json_file(
        os.path.join(args.dir_pretrained_model, "config.json"))
    encoder = BertForMaskedLM(encoder_config)
    logger.info("Making model...")
    model = BertForLangID(encoder, lang_list)
    model.to(args.device)

    # Load model weights. First, check if we just have an encoder, or a previously fine-tuned model
    if "classifier.dense.weight" in checkpoint_data["model_state_dict"]:
        if "best_model_state_dict" in checkpoint_data and not args.resume:
            logger.info("Loading model weights from 'best_model_state_dict'")
            model.load_state_dict(checkpoint_data["best_model_state_dict"])
        else:
            logger.info("Loading model weights from 'model_state_dict'")
            model.load_state_dict(checkpoint_data["model_state_dict"])
    else:
        # Model has not previously been fine-tuned, so we only load encoder weights
        assert args.do_train
        logger.info("Loading encoder weights from 'model_state_dict'")
        model.encoder.load_state_dict(checkpoint_data["model_state_dict"])
    if (args.do_train or args.resume) and args.freeze_encoder:
        model.freeze_encoder()

    # Write encoder config and tokenizer in output directory
    if (not args.resume) and args.do_train:
        path_config = os.path.join(args.dir_output, "config.json")
        model.encoder.config.to_json_file(path_config)
        path_tokenizer = os.path.join(args.dir_output, "tokenizer.pkl")
        with open(path_tokenizer, "wb") as f:
            pickle.dump(tokenizer, f)

    # Log some info on the model
    logger.info("Encoder config: %s" % repr(model.encoder.config))
    logger.info("Model params:")
    for n, p in model.named_parameters():
        msg = "  %s" % n
        if not p.requires_grad:
            msg += " ***FROZEN***"
        logger.info(msg)
    logger.info("Nb model params: %d" % count_params(model))
    logger.info("Nb params in encoder: %d" % count_params(model.encoder))
    logger.info("Nb params in pooler: %d" % count_params(model.pooler))
    logger.info("Nb params in classifier: %d" % count_params(model.classifier))

    # Get data
    max_seq_length = args.seq_len + 2  # We add 2 for CLS and SEP
    if args.resume:
        # Reload training dataset(s)
        logger.info("Reloading training data from checkpoint")
        train_dataset = checkpoint_data["train_dataset"]
        train_dataset.prep_files_for_streaming()
        dev_dataset = checkpoint_data.get("dev_dataset", None)
        unk_dataset = checkpoint_data.get("unk_dataset", None)
        if unk_dataset:
            unk_dataset.prep_files_for_streaming()
    elif args.do_train:
        # Remove unk.train if present, and create a MLM dataset for it.
        path_unk = check_for_unk_train_data(train_paths)
        if path_unk is None:
            unk_dataset = None
        else:
            train_paths.remove(path_unk)
            logger.info("Creating MLM-only training set from %s..." % path_unk)
            unk_dataset = BertDatasetForMLM(
                [path_unk],
                tokenizer,
                max_seq_length,
                sampling_alpha=args.sampling_alpha,
                weight_relevant=args.weight_relevant,
                encoding="utf-8",
                seed=args.seed,
                verbose=DEBUG)

        logger.info("Creating training set from %s training files in %s..." %
                    (len(train_paths), args.dir_train))
        train_dataset = BertDatasetForClassification(
            train_paths,
            tokenizer,
            max_seq_length,
            include_mlm=True,
            sampling_alpha=args.sampling_alpha,
            weight_relevant=args.weight_relevant,
            encoding="utf-8",
            seed=args.seed,
            verbose=DEBUG)
        if path_unk is not None:
            assert len(unk_dataset) == len(train_dataset)
        # Check train_dataset.lang2id: keys should contain all langs, and nothing else, like that of the model
        assert train_dataset.lang2id == model.lang2id
    if not args.resume:
        dev_dataset = None
        if args.do_eval or args.eval_during_training:
            logger.info("Loading validation data from %s..." % args.path_dev)
            dev_dataset = BertDatasetForTesting(args.path_dev,
                                                tokenizer,
                                                model.lang2id,
                                                max_seq_length,
                                                require_labels=True,
                                                encoding="utf-8",
                                                verbose=DEBUG)
        if args.do_train and args.eval_during_training:
            checkpoint_data["dev_dataset"] = dev_dataset
        if args.do_pred:
            logger.info("Loading test data from %s..." % args.path_test)
            test_dataset = BertDatasetForTesting(args.path_test,
                                                 tokenizer,
                                                 model.lang2id,
                                                 max_seq_length,
                                                 require_labels=False,
                                                 encoding="utf-8",
                                                 verbose=DEBUG)

    # Compute number of epochs and steps, initialize number of training steps done.
    num_opt_steps_per_epoch = args.num_train_steps_per_epoch // args.grad_accum_steps
    if args.do_train and (not args.resume):
        checkpoint_data["global_step"] = 0
        checkpoint_data[
            "max_opt_steps"] = args.max_train_steps // args.grad_accum_steps
    # max_opt_steps must be set (either freshly or from the checkpoint)
    # before the number of epochs can be derived from it.
    args.num_epochs = math.ceil(checkpoint_data["max_opt_steps"] /
                                num_opt_steps_per_epoch)

    # Training
    if args.do_train or args.resume:
        # Prepare optimizer
        logger.info("Preparing optimizer...")
        np_list = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        opt_params = [{
            'params':
            [p for n, p in np_list if not any(nd in n for nd in no_decay)],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in np_list if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.equal_betas:
            betas = (0.9, 0.9)
        else:
            betas = (0.9, 0.999)
        optimizer = AdamW(
            opt_params,
            lr=args.learning_rate,
            betas=betas,
            correct_bias=args.correct_bias
        )  # To reproduce BertAdam specific behaviour, use correct_bias=False

        # Load optimizer state if resuming
        if args.resume:
            optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"])

        # Log some info before training
        logger.info("*** Training info: ***")
        logger.info("  Number of training steps completed: %d" %
                    checkpoint_data["global_step"])
        logger.info("  Max training steps: %d" % args.max_train_steps)
        logger.info("  Gradient accumulation steps: %d" %
                    args.grad_accum_steps)
        logger.info("  Max optimization steps: %d" %
                    checkpoint_data["max_opt_steps"])
        logger.info("  Training dataset size: %d" % len(train_dataset))
        logger.info("  Batch size: %d" % args.train_batch_size)
        logger.info("  # training steps/epoch: %d" %
                    (args.num_train_steps_per_epoch))
        logger.info("  # optimization steps/epoch: %d" %
                    num_opt_steps_per_epoch)
        logger.info("  # epochs to do: %d" % args.num_epochs)
        if args.eval_during_training:
            logger.info("Validation dataset size: %d" % len(dev_dataset))

        # Run training
        train(model,
              optimizer,
              train_dataset,
              args,
              checkpoint_data,
              dev_dataset=dev_dataset,
              unk_dataset=unk_dataset)
        # Reload model
        save_to_dir = args.dir_pretrained_model if args.resume else args.dir_output
        checkpoint_data = torch.load(
            os.path.join(save_to_dir, "checkpoint.tar"))
        if "best_model_state_dict" in checkpoint_data:
            model.load_state_dict(checkpoint_data["best_model_state_dict"])
        else:
            model.load_state_dict(checkpoint_data["model_state_dict"])

    # Evaluate model on dev set
    if args.do_eval:
        logger.info("*** Running evaluation... ***")
        scores = evaluate(model, dev_dataset, args)
        logger.info("***** Evaluation Results *****")
        for score_name in sorted(scores.keys()):
            logger.info("- %s: %.4f" % (score_name, scores[score_name]))

    # Get model's predictions on test set
    if args.do_pred:
        logger.info("*** Running prediction... ***")
        logits = predict(model, test_dataset, args)
        pred_class_ids = np.argmax(logits.cpu().numpy(), axis=1)
        pred_labels = [test_dataset.label_list[i] for i in pred_class_ids]
        path_pred = os.path.join(args.dir_output, "pred.txt")
        logger.info("Writing predictions in %s..." % path_pred)
        with open(path_pred, 'w', encoding="utf-8") as f:
            for x in pred_labels:
                f.write("%s\n" % x)
Example no. 11
0
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Loading the dataset from local csv file.
    data_files = {}
    data_files["train"] = args.train_file
    data_files["validation"] = args.validation_file
    raw_datasets = load_dataset("json", data_files=data_files)

    # Get the label list
    label_list = raw_datasets["train"].unique("label")
    label_list.sort()  # Let's sort it for determinism

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = MyBertTokenizer.from_pretrained("bert-base-cased")
    model = BertForMaskedLM.from_pretrained(args.model_name_or_path)

    # Preprocessing the datasets
    padding = "max_length" if args.pad_to_max_length else False

    # We have made sure that pos_token_id and neg_token_id are each a single token
    label_to_text = {0: "terrible", 1: "fantastic"}
    logger.info(f"Label to text mapping: {label_to_text}")

    mask_token_id = tokenizer.mask_token_id
    pos_token_id = tokenizer(label_to_text[1])["input_ids"][1]
    neg_token_id = tokenizer(label_to_text[0])["input_ids"][1]
    label_to_token_id = {0: neg_token_id, 1: pos_token_id}
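    # Sketch of the single-token assumption above (indices are illustrative,
    # not taken from the original run): tokenizer("fantastic")["input_ids"]
    # is expected to be [cls_id, word_id, sep_id], so index 1 picks out the
    # single sub-word id used as the verbalizer token.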

    def preprocess_function(examples):
        # Tokenize the source texts
        texts = examples["sentence"]
        result = tokenizer(texts,
                           padding=padding,
                           max_length=args.max_length,
                           truncation=True)

        # Add prompt
        prompt_token_ids = tokenizer("The movie is [MASK].")["input_ids"][1:]
        input_ids = result["input_ids"]
        input_ids = [ids[:-1] + prompt_token_ids for ids in input_ids]
        result["input_ids"] = input_ids

        # Net number of tokens added: the sentence's trailing [SEP] was
        # dropped above, and the prompt brings its own [SEP].
        additional_len = len(prompt_token_ids) - 1

        # Add attention mask
        attention_mask = result["attention_mask"]
        attention_mask_with_prompt = [
            x + [1] * additional_len for x in attention_mask
        ]
        result["attention_mask"] = attention_mask_with_prompt

        # Add token type
        token_type_ids = result["token_type_ids"]
        token_type_ids_with_prompt = [
            x + [0] * additional_len for x in token_type_ids
        ]
        result["token_type_ids"] = token_type_ids_with_prompt

        # Important!!! Since we use padding, the mask position is not fixed in the end
        mask_positions = [ids.index(mask_token_id) for ids in input_ids]
        result["mask_positions"] = mask_positions

        # Prepare labels
        sentiments = [label_to_token_id[l] for l in examples["label"]]
        labels = []
        for x, y, z in zip(input_ids, sentiments, mask_positions):
            label = [-100] * len(x)
            label[z] = y
            labels.append(label)

        # This is for all tokens
        result["labels"] = labels
        #
        # This is the true label for each sample
        result["targets"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)

    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Get the initial zero-shot accuracy
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        # Extract targets
        targets = batch.pop("targets")
        mask_positions = batch.pop("mask_positions")
        outputs = model(**batch)

        predictions = []
        for x, y in zip(outputs.logits,
                        mask_positions):  # logits size [8 X 49 X 30522]
            pos_logit = x[y][pos_token_id]
            neg_logit = x[y][neg_token_id]
            if pos_logit > neg_logit:
                predictions.append(1)
            else:
                predictions.append(0)
        predictions = torch.tensor(predictions).to(targets.device)

        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(targets),
        )

    eval_metric = metric.compute(
    )  # After this, all batches in metric will be cleared.
    logger.info(f"zero-shot accuracy: {eval_metric}")

    exit(0)
    # Only show the progress bar once on each machine.
    completed_steps = 0

    best_acc = 0.0
    start = time.time()
    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Extract targets
            batch.pop("targets")
            batch.pop("mask_positions")
            outputs = model(**batch)

            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                completed_steps += 1
                if completed_steps % 100 == 0:
                    logger.info(
                        f"Completed {completed_steps} steps, time passed: {time.time() - start}s."
                    )

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            # Extract targets
            targets = batch.pop("targets")
            mask_positions = batch.pop("mask_positions")
            outputs = model(**batch)

            predictions = []
            for x, y in zip(outputs.logits,
                            mask_positions):  # logits size [8 X 49 X 30522]
                pos_logit = x[y][pos_token_id]
                neg_logit = x[y][neg_token_id]
                if pos_logit > neg_logit:
                    predictions.append(1)
                else:
                    predictions.append(0)
            predictions = torch.tensor(predictions).to(targets.device)

            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(targets),
            )

        eval_metric = metric.compute(
        )  # After this, all batches in metric will be cleared.
        logger.info(f"epoch {epoch}: {eval_metric}")

        if eval_metric["accuracy"] > best_acc:
            best_acc = eval_metric["accuracy"]
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir,
                                            save_function=accelerator.save)
        attention_mask=train_data_feature['attention_mask'],
        masked_lm_labels=train_data_feature['masked_lm_labels'])
    test_dataset = makeDataset(
        token_embeddings=test_data_feature['token_embeddings'],
        segement_embeddings=test_data_feature['segement_embeddings'],
        attention_mask=test_data_feature['attention_mask'],
        masked_lm_labels=test_data_feature['masked_lm_labels'])
    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

    # Set up the model config.
    # type_vocab_size: must be changed if token_type_ids uses more than 2 types.
    # In addition, lines 469-471 of the local transformers/modeling_utils.py need to be
    # commented out, because PyTorch otherwise raises an error (RuntimeError: Error(s) in
    # loading state_dict for BertForMaskedLM: size mismatch for
    # bert.embeddings.token_type_embeddings.weight: copying a param with shape
    # torch.Size([2, 768]) from checkpoint, the shape in current model is
    # torch.Size([3, 768]).). The original BERT already has 16 types available, so using
    # 3 should be fine.
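    # Note (assumption, not from the original script): recent transformers
    # releases accept ignore_mismatched_sizes=True in from_pretrained(), which
    # skips the mismatched token_type_embeddings weight without patching
    # modeling_utils.py.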
    config = BertConfig.from_pretrained('bert-base-chinese', type_vocab_size=3)
    model = BertForMaskedLM.from_pretrained(
        'bert-base-chinese',
        from_tf=bool('.ckpt' in 'bert-base-chinese'),
        config=config)
    model.to(device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }, {
        'params': [
            p for n, p in model.named_parameters()
Example no. 13
0
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold > 0:
            break
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            trn_df = trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  #  + additional_tokens

        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[SEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')

        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue

            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(trn_loss)
            else:
                histories['val_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(trn_loss)
            else:
                histories['val_metric'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(trn_loss)
            else:
                histories['val_metric_raws'][fold] = [
                    trn_loss,
                ]

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                [],
                [],
                [],
                fold,
                epoch,
                trn_loss,
                trn_loss,
            )
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    send_line_notification('fini!')

    sel_log('now saving best checkpoints...', logger)
Esempio n. 14
0
                    interval = end - start
                    param.data[start:end, :].copy_(
                        pretrained_weight.data[:interval])
                    start = end
            elif "decoder.cls.predictions.bias" in name or "cls.predictions.bias" in name:
                param.data[:pretrained_weight.shape[0]].copy_(
                    pretrained_weight.data)
            else:
                param.data.copy_(pretrained_weight.data)
        else:
            print(name)


if __name__ == "__main__":
    config = AutoConfig.from_pretrained("bert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # pretrained_model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
    pretrained_model = AutoModelForCausalLM.from_pretrained(
        "bert-base-uncased")
    add_tokens(tokenizer)
    config.max_position_embeddings = 1024 + 2
    config.vocab_size = len(tokenizer.get_vocab())

    # config = EncoderDecoderConfig.from_encoder_decoder_configs(config, config)
    # model = EncoderDecoderModel(config=config)
    model = BertForMaskedLM(config)
    load_weights(model, pretrained_model)

    model.save_pretrained('bert-base-uncased-itokens')
    tokenizer.save_pretrained('bert-base-uncased-itokens')
Esempio n. 15
0
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens),
                                       model.config.hidden_size)

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
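# The upstream quick-start asserts that this resolves to 'henson'; uncomment to verify:
# assert predicted_token == 'henson'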
Esempio n. 16
0
                        help="Location of the model",
                        type=str,
                        required=True)
    parser.add_argument("--tokenizer",
                        help="Location of the tokenizer",
                        type=str,
                        required=True)
    parser.add_argument("--mode",
                        type=str,
                        choices=["mask_first", "mask_last"],
                        required=True)
    parser.add_argument("--metric",
                        help="Which metric to calculate ?",
                        choices=["rank", "probability"],
                        required=True)
    parser.add_argument("--metrics-output-path", type=str)
    args = parser.parse_args()

    tokenizer = BertTokenizer.from_pretrained(args.tokenizer)
    model = BertForMaskedLM.from_pretrained(args.model).cuda().eval()

    metrics_output_path = args.metrics_output_path if args.metrics_output_path is not None else args.model

    import os
    metrics_output_path = os.path.join(
        metrics_output_path,
        f"first_name_given_last_name/{args.mode}_{args.metric}")
    os.makedirs(metrics_output_path, exist_ok=True)

    evaluate(model, tokenizer, args.mode, args.metric, metrics_output_path)
import torch
from transformers import BertTokenizer, BertForMaskedLM
import sys
sys.path.append(".")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(
    "shibing624/macbert4csc-base-chinese")
model = BertForMaskedLM.from_pretrained("shibing624/macbert4csc-base-chinese")
model = model.to(device)
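# Test sentence for the Chinese spelling-correction model; "高心" is an intentional
# typo of "高兴" ("happy").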
sentence = "你找到你最喜欢的工作,我也很高心"
tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
print(tokens)
# with torch.no_grad():
# outputs = model(**tokenizer(texts, padding=True, return_tensors='pt').to(device))

for i in range(1, len(tokens) - 1):
    tmp = tokens[:i] + ['[MASK]'] + tokens[i + 1:]
    masked_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tmp)])
    segment_ids = torch.tensor([[0] * len(tmp)])
    attention_mask = torch.tensor([[1] * len(tmp)])

    outputs = model(masked_ids,
                    attention_mask=attention_mask,
                    token_type_ids=segment_ids)
    prediction_scores = outputs[0]
    print(tmp)
    # Print the predicted character
    prediction_index = torch.argmax(prediction_scores[0, i]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([prediction_index])[0]
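    # A minimal follow-up check (sketch): flag positions where the model's top
    # prediction disagrees with the original character, which is how this
    # mask-and-predict pass can surface corrections such as "高心" -> "高兴".
    if predicted_token != tokens[i]:
        print(f"possible correction at position {i}: {tokens[i]} -> {predicted_token}")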
Esempio n. 18
0
 def __init__(self, model_name_or_path: str):
     super(BertPretrain, self).__init__()
     self.bert_model = BertForMaskedLM.from_pretrained(model_name_or_path)
def main(): 
    print('start of main')
    parser = argparse.ArgumentParser(
        description = '''This script computes probabilities for a masked token
                         with words from the words file, and
                         stores the result in CSV format to the output file''')
    
    parser.add_argument("-s", type = str, required=True, dest = "sent_type", help = 'class name: "sv_agreement" or "anaphora"')
    parser.add_argument("-t", type = str, required=True, dest = "template", help = 'template name (see templates.txt)')
    parser.add_argument("-g", type = int, required=False, default=None, dest = "gpu_num", help = 'which gpu to run this on')
    parser.add_argument("-m", type = str, required=False, default='bert-base-uncased', dest = "model_path_or_name", help = 'path to the model or name of the model')


    args = parser.parse_args()

    if args.sent_type not in ['sv_agreement', 'anaphora']:
        parser.error("invalid sent_type argument for -s")

    print('creating results path')
    use_wug = args.model_path_or_name != 'bert-base-uncased'

    number = None

    if use_wug: 
        model_type = args.model_path_or_name.split('/')
        if model_type[-1]=='': 
            model_type = model_type[:-1]
        number = model_type[-3].lower()
        model_path = '/'.join(model_type[-3:])
        
        results_path = FINE_TUNE_RESULTS_PATH[:-7] % model_path
        if not os.path.isdir(results_path): 
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = FINE_TUNE_RESULTS_PATH[:-4] % (model_path, args.sent_type)
        if not os.path.isdir(results_path): 
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = FINE_TUNE_RESULTS_PATH % (model_path, args.sent_type, args.template)
    else: 
        results_path = RESULTS_PATH[:-4] % args.sent_type
        if not os.path.isdir(results_path): 
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = RESULTS_PATH % (args.sent_type, args.template)

    results_filename = RESULTS_FILENAME % args.template

    outfilename = os.path.join(str(ABS_PATH), results_path, results_filename)

    if not os.path.isdir(results_path):
        print('creating directory %s' % results_path)
        os.mkdir(results_path)
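    # (The repeated isdir/mkdir checks above could be replaced with a single
    # os.makedirs(results_path, exist_ok=True), which also creates any missing
    # parent directories.)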

    print('getting consts')

    sent_types = csp_consts.SENT_TYPES[args.sent_type]
    batch_sizes = csp_consts.BERT_BATCH_SIZES[args.sent_type]
    masked_types = csp_consts.MASKED_TYPE[args.sent_type]
    max_len_types = csp_consts.BERT_MAX_TYPE[args.sent_type]

    try:
        template_name = sent_types[args.template]
        batch_size_dict = batch_sizes[args.template]
        masked_type = masked_types[args.template]
        max_len = max_len_types[args.template]
    except KeyError:
        parser.error("Incompatible template for the given sentence type")
        sys.exit()
    
    print('loading model at', datetime.now())


    bert_tokenizer = BertTokenizer.from_pretrained(args.model_path_or_name)
    bert_model = BertForMaskedLM.from_pretrained(args.model_path_or_name)
    bert_model.eval()

    if args.gpu_num is not None: 
        device = torch.device('cuda:'+str(args.gpu_num) if torch.cuda.is_available() else 'cpu')
        print('running on GPU: %d' % args.gpu_num)
    else: 
        device = torch.device('cpu')

    bert_model.to(device)

    batch_size = batch_size_dict['pairs']
    num_sents = batch_size_dict['sents']
    if use_wug: 
        batch_size *= 2
        num_sents //= 2

    print('starting all computations at', datetime.now())
    eval_from_file(bert_model, bert_tokenizer, template_name, outfilename, masked_type, batch_size, num_sents, max_len, device=device, use_wug=use_wug, number=number)
    print('completed all computations at', datetime.now())
Esempio n. 20
0
def aspect_extractor_trainer(data_itr,
                             model_name,
                             bert_tokenizer,
                             linguistic_vocab,
                             required_features_list,
                             lang,
                             lowercase_data,
                             H,
                             lr,
                             scheduler_patience_steps,
                             scheduler_decay_factor,
                             scheduler_min_lr,
                             epochs,
                             max_norm,
                             no_improvement_tolerance=5000,
                             save_model_name="project_sublayers.pt",
                             relative_sizing=False,
                             resolution_strategy="first",
                             report_every=5000):
    """
    Implementation of the sub-layer model trainer which pre-trains the transformer heads using the BERT vectors.
    """
    assert len(required_features_list) > 0, "You have to select some features"
    assert linguistic_vocab is not None and len(linguistic_vocab) > 0

    Hs = []
    for rfl in required_features_list:
        if rfl in linguistic_vocab:
            if relative_sizing:
                print(
                    "This might not be supported in the multi-head implementation"
                )
                Hs.append(len(linguistic_vocab[rfl]))
            else:
                # TODO consider hierarchical encoding of features here
                Hs.append(1.0)
    assert len(Hs) > 0
    Hs.append(max(Hs))
    weight_ratio = int(float(H) / sum(Hs))
    assert weight_ratio > 1
    Hs = [int(weight_ratio * ind) for ind in Hs]
    Hs[-1] += max(0, (H - sum(Hs)))
    print(
        "Loading the pre-trained BertForMaskedLM model: {}".format(model_name))
    bert_lm = BertForMaskedLM.from_pretrained(
        model_name, output_hidden_states=True).to(device)
    number_of_bert_layers = len(bert_lm.bert.encoder.layer) + 1
    D_in = D_out = bert_lm.bert.config.hidden_size
    reverse_linguistic_vocab = create_reverse_linguistic_vocab(
        linguistic_vocab)
    print("Loading Spacy Tokenizers")
    spacy_tokenizer_1, spacy_tokenizer_2 = SpacyTokenizer(
        lang, lowercase_data), SpacyTokenizer(lang, lowercase_data)
    spacy_tokenizer_2.overwrite_tokenizer_with_split_tokenizer()
    print("Creating the model")
    model = AspectExtractor(
        D_in, Hs, D_out,
        [len(linguistic_vocab[f]) + 1
         for f in required_features_list], number_of_bert_layers,
        required_features_list, reverse_linguistic_vocab).to(device)
    model.apply(weight_init)
    opt = optim.SGD(model.parameters(), lr=float(lr), momentum=0.9)
    scheduler = ReduceLROnPlateau(opt,
                                  mode='min',
                                  patience=scheduler_patience_steps,
                                  factor=scheduler_decay_factor,
                                  threshold=0.001,
                                  verbose=False,
                                  min_lr=scheduler_min_lr)
    print("Starting to train ...")
    break_condition = False
    for t in range(epochs):
        if break_condition:
            print(
                "Minimum {} batches have been observed without any accuracy improvements in classifiers, ending the training ..."
                .format(no_improvement_tolerance))
            break
        all_loss = 0.0
        all_tokens_count = 0.0
        feature_pred_corrects = [0 for _ in range(len(required_features_list))]
        feature_pred_correct_all = 0.0
        all_prediction = [[] for _ in required_features_list]
        all_actual = [[] for _ in required_features_list]
        # TODO use the actual dataset object instead of this iterator
        itr = data_itr()
        tolerance_counts = [0 for _ in required_features_list]
        tolerance_bests = [0.0 for _ in required_features_list]
        for batch_id, input_sentences in enumerate(itr):
            sequences = [
                torch.tensor(
                    bert_tokenizer.tokenizer.encode(input_sentence,
                                                    add_special_tokens=True),
                    device=device) for input_sentence in input_sentences
            ]
            features, feature_weights = map_sentences_to_vocab_ids(
                input_sentences, required_features_list, linguistic_vocab,
                spacy_tokenizer_1, spacy_tokenizer_2, bert_tokenizer)
            input_ids = torch.nn.utils.rnn.pad_sequence(
                sequences,
                batch_first=True,
                padding_value=bert_tokenizer.tokenizer.pad_token_id)
            if input_ids.size(1) > bert_lm.config.max_position_embeddings:
                continue
            outputs = bert_lm(input_ids, masked_lm_labels=input_ids)[
                2]  # (batch_size * [input_length + 2] * 768)
            all_layers_embedded = torch.cat(
                [o.detach().unsqueeze(0) for o in outputs], dim=0)
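            # The next few lines compute a numerically stable softmax over the learned
            # per-layer weights, used to average-pool the stacked BERT layer outputs.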
            maxes = torch.max(model.bert_weights_for_average_pooling,
                              dim=-1,
                              keepdim=True)[0]
            x_exp = torch.exp(model.bert_weights_for_average_pooling - maxes)
            x_exp_sum = torch.sum(x_exp, dim=-1, keepdim=True)
            output_custom = x_exp / x_exp_sum
            embedded = torch.matmul(all_layers_embedded.permute(1, 2, 3, 0),
                                    output_custom)
            # sequence_length, batch_size, len(feats)
            predictions = torch.zeros(embedded.size(1), embedded.size(0),
                                      len(required_features_list))
            for s in range(1, embedded.size(1) - 1):
                x = embedded.select(1, s)
                features_selected = []
                feature_weights_selected = []
                permitted_to_continue = True
                for f, fw in zip(features, feature_weights):
                    if s < f.size(1):
                        features_selected.append(f.select(1, s))
                        feature_weights_selected.append(fw.select(1, s))
                    else:
                        permitted_to_continue = False
                if not permitted_to_continue:
                    continue
                _, loss, feature_pred_correct, feat_predictions = model(
                    x, features_selected, feature_weights_selected)
                predictions[s] = feat_predictions
                for ind, score in enumerate(feature_pred_correct):
                    feature_pred_corrects[ind] += score.sum().item()
                feature_pred_correct_all += feature_pred_correct[0].size(0)
                model.zero_grad()
                loss.backward(retain_graph=True)
                nn.utils.clip_grad_norm_(model.parameters(), max_norm)
                opt.step()
                all_loss += loss.item()
                all_tokens_count += x.size(0)
                _classification_report_ = [
                    "{}:{:.2f}%".format(
                        feat.upper(),
                        float(feature_pred_corrects[ind] * 100) /
                        feature_pred_correct_all)
                    for ind, feat in enumerate(required_features_list)
                ]
                itr.set_description(
                    "Epoch: {}, Average Loss: {:.2f}, [{}]".format(
                        t, all_loss / all_tokens_count,
                        "; ".join(_classification_report_)))
            # if model has not had any improvements in any of the classifier scores after {no_improvement_tolerance} batches, the training will stop.
            for ind, feat in enumerate(required_features_list):
                feat_score = round(
                    float(feature_pred_corrects[ind] * 100) /
                    feature_pred_correct_all, 3)
                if tolerance_bests[ind] < feat_score:
                    tolerance_bests[ind] = feat_score
                    tolerance_counts[ind] = 0
                else:
                    tolerance_counts[ind] = tolerance_counts[ind] + 1
            break_condition = sum([
                1 if tolerance_counts[ind] >= no_improvement_tolerance else 0
                for ind, feat in enumerate(required_features_list)
            ]) == len(required_features_list)
            if break_condition:
                break
            scheduler.step(all_loss / all_tokens_count)
            predictions = predictions.transpose(0, 1)
            for b in range(predictions.size(0)):
                for l in range(1, predictions.size(1) - 1):
                    classes = predictions[b][l]
                    for idx in range(len(required_features_list)):
                        pred_id = int(classes[idx].item()) - 1
                        if idx >= len(features) or b >= features[idx].size(
                                0) or l >= features[idx].size(1):
                            # print("WARNING: skipping access to index out of bounds for a tensor with size "
                            #      "({}, {}, {}) with indices [{}, {}, {}]".format(len(features), features[idx].size(0),
                            #                                                      features[idx].size(1), idx, b, l))
                            continue
                        actual_id = int(features[idx][b][l].item()) - 1
                        predicted_label = reverse_linguistic_vocab[
                            required_features_list[idx]][
                                pred_id] if pred_id > -1 else '__PAD__'
                        actual_label = reverse_linguistic_vocab[
                            required_features_list[idx]][
                                actual_id] if actual_id > -1 else '__PAD__'
                        # predicted_bis, predicted_label = separate_bis_label(predicted_label)
                        # actual_bis, actual_label = separate_bis_label(actual_label)
                        if actual_label != '__PAD__':
                            all_actual[idx].append(actual_label)
                            all_prediction[idx].append(predicted_label)
                        # print(pred_tag, actual_label, actual_bis, predicted_label, predicted_bis, predicted_label == actual_label)
            if batch_id and batch_id % report_every == 0:
                print("Creating report/persisting trained model ...")
                create_train_report_and_persist_modules(
                    model, save_model_name, all_actual, all_prediction,
                    feature_pred_correct_all, feature_pred_corrects,
                    required_features_list, resolution_strategy)
                print(
                    "Cleaning up the collected actual/prediction labels [done due to prevent application getting killed for memory limits]"
                )
                for idx in range(len(required_features_list)):
                    del all_actual[idx][:]
                    del all_prediction[idx][:]
        create_train_report_and_persist_modules(model, save_model_name,
                                                all_actual, all_prediction,
                                                feature_pred_correct_all,
                                                feature_pred_corrects,
                                                required_features_list,
                                                resolution_strategy)
    print("Training done.")
    basedir = os.path.dirname(__file__)
    return os.path.join(basedir, path)


if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


# Load a trained model and vocabulary that you have fine-tuned
model = BertForMaskedLM.from_pretrained(output_dir,
                                        output_attentions=False,  # Whether the model returns attentions weights.
                                        output_hidden_states=True,  # Whether the model returns all hidden-states.
                                        )
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
# model.to(device)
model.eval()


def clean(sent):
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    sent = sent.lower().split()
    sent = [word for word in sent if word not in words]
    sent = ' '.join(sent)
    return sent
Esempio n. 22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--extract_data", action="store_true", help="extract data from scratch (otherwise, load data from saved file)")
    parser.add_argument("--all_langs", action="store_true", help="Translate from all langs to all langs")
    parser.add_argument("--eval_pos", help="evaluate a specific pos tag: NOUN, VERB or ADJ")
    args = parser.parse_args()

    random_seed = 10
    num_langs = 15
    sent_per_lang = 5000
    num_classifiers = 20
    num_to_eval = 1000
    random.seed(random_seed)

    print("random_seed", random_seed)
    print("num_langs", num_langs)
    print("sent_per_lang", sent_per_lang)
    print("num_classifiers", num_classifiers)
    print("num_to_eval", num_to_eval)
    print(args)

    # load mBERT
    pretrained_weights = 'bert-base-multilingual-uncased'
    tokenizer_mlm = BertTokenizer.from_pretrained(pretrained_weights)
    model_mlm = BertForMaskedLM.from_pretrained(pretrained_weights, output_hidden_states=True)
    output_embeddings = model_mlm.cls.predictions.decoder.weight.detach().cpu().numpy()
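    # The MLM decoder weight matrix is the output embedding table (tied to the input
    # word embeddings in BERT), with one row per vocabulary entry.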

    # collect data (<sent_per_lang> sentences) from <num_langs> most frequent languages, from TED
    data, langs = collect_data_per_lang(num_langs=num_langs, sent_per_lang=sent_per_lang)

    # extract representations of random tokens from each sentence
    random.seed(random_seed)
    data_filename = "../data/data_with_states_{}lang_{}perlang_embeddings".format(num_langs, sent_per_lang)

    if args.extract_data:
        # extract the representations and dump them to a file
        data_with_states = extract_repr_random_token_bert_mlm(tokenizer_mlm,
                                                              copy.deepcopy(data)[:num_langs * sent_per_lang],
                                                              output_embeddings)
        with open(data_filename, "wb") as f:
            pickle.dump(data_with_states, f)
        print("extracted data")
    else:
        # load the representations instead of extracting them
        with open(data_filename, "rb") as f:
            data_with_states = pickle.load(f)
        print("loaded data")

    # data for lang_repr
    vecs, labels_lang = data_for_lang_repr(data_with_states, random_seed)

    # create a vector representation for each language, and save to file
    random.seed(random_seed)
    lang_repr = create_repr_per_lang(vecs, labels_lang)
    lang_repr_filename = "../data/lang_repr_{}lang_no_inlp".format(num_langs)

    with open(lang_repr_filename, "wb") as f:
        pickle.dump(lang_repr, f)


    # evaluation using northeuralex

    # extract north_euralex data and save
    # 'zh-tw', 'zh-cn' (chinese) and 'pt-br' (brazilian portuguese) are not in this data
    ted2eur = {"en": "eng", "ar": "arb", "he": "heb", "ru": "rus", "ko": "kor", "it": "ita", "ja": "jpn",
               "es": "spa", "fr": "fra", "nl": "nld", "ro": "ron", "tr": "tur"}

    eur2ted = {v: k for k, v in ted2eur.items()}
    all_translations, map_word_pos = extract_north_euralex(eur2ted)

    with open("../data/all_translations_north_euralex", "wb") as f:
        pickle.dump(all_translations, f)

    # names of files
    details_filename = "../data/north_euralex_details"
    repr_filename = "../data/representations_embed_{}lang_no_inlp".format(num_langs)

    # evaluate on north_euralex
    if args.all_langs:
        all_langs_1 = []
        all_langs_5 = []
        all_langs_10 = []
        for source_lang in ["eng", "rus", "nld", "fra", "spa", "ita", "ron", "tur", "kor", "jpn", "arb", "heb"]:
            for target_lang in ["eng", "rus", "nld", "fra", "spa", "ita", "ron", "tur", "kor", "jpn", "arb", "heb"]:
                print(source_lang, "to", target_lang)
                rank_before, rank_after = evaluate_northeuralex_all_langs(source_lang, target_lang, lang_repr,
                                                                          eur2ted, all_translations,
                                                                          output_embeddings, tokenizer_mlm)
                acc1, acc5, acc10 = print_evals(rank_before, rank_after, return_accs=True)
                all_langs_1.append(acc1)
                all_langs_5.append(acc5)
                all_langs_10.append(acc10)
        all_langs_1 = np.array(all_langs_1)
        all_langs_5 = np.array(all_langs_5)
        all_langs_10 = np.array(all_langs_10)
        print(all_langs_1)
        print(all_langs_5)
        print(all_langs_10)
        np.save("../data/all_langs_1", all_langs_1.reshape(12, 12))
        np.save("../data/all_langs_5", all_langs_5.reshape(12, 12))
        np.save("../data/all_langs_10", all_langs_10.reshape(12, 12))

    else:
        rank_before, rank_after = evaluate_northeuralex(lang_repr, eur2ted, all_translations, map_word_pos, output_embeddings,
                                                            tokenizer_mlm,
                                                            args.eval_pos,
                                                            repr_filename=repr_filename,
                                                            details_filename=details_filename)


        # print evaluations
        for lang in rank_before:
            print("\nlang:", lang)
            print_evals(rank_before[lang], rank_after[lang])

        rank_before_all = []
        rank_after_all = []
        for lang in rank_before:
            rank_before_all += rank_before[lang]
            rank_after_all += rank_after[lang]
        print("\nall together\n")
        print_evals(rank_before_all, rank_after_all)
Esempio n. 23
0
def AE(df):

    model_type = 'bert-base-uncased'

    tokenizer = BertTokenizer.from_pretrained(model_type)
    model = BertModel.from_pretrained(model_type, return_dict=True)
    mask_model = BertForMaskedLM.from_pretrained(model_type, return_dict=True)

    sep_token = '[SEP]'
    mask_token = '[MASK]'

    mask_id = tokenizer(mask_token)['input_ids'][1]
    sep_id = tokenizer(sep_token)['input_ids'][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    auxiliary_tokens = ['the', 'aspect', 'term', 'is']

    df['mask_tokens'] = 0
    df['auxiliary_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        #for j in range(len(df['aspect_terms'].iloc[i])):
        auxiliary_sents = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            aspect_terms = df['aspect_terms'].iloc[i][j]
            auxiliary_sent = auxiliary_tokens + [aspect_terms] + [
                sep_token
            ] + df['tokens'].iloc[i]
            auxiliary_sents.append(auxiliary_sent)

        mask_sent = auxiliary_tokens + [mask_token] + [sep_token
                                                       ] + df['tokens'].iloc[i]
        df['mask_tokens'].iloc[i] = mask_sent
        df['auxiliary_tokens'].iloc[i] = auxiliary_sents

    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokenized = tokenizer.encode(df['mask_tokens'].iloc[i])

        sep_index = tokenized.index(sep_id)
        mask_index = tokenized.index(mask_id)

        tokenized = pd.Series([tokenized])

        padded = pad_sequences(tokenized,
                               maxlen=MAX_LEN,
                               dtype="long",
                               value=0,
                               truncating="post",
                               padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids,
                                       attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu(
        ).numpy()

        distance = []

        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   value=0,
                                   truncating="post",
                                   padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][
                    pertubed_index] != sep_id:
                #print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(
            np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold and df[
                    'labels'].iloc[i][dis_index] != 'B' and df['labels'].iloc[
                        i][dis_index] != 'I':
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        augment_tokenizeds = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            tokenized = torch.Tensor(tokenized).unsqueeze(0).to(
                torch.int64).to(device)
            augment_tokenized = tokenizer.encode(
                df['auxiliary_tokens'].iloc[i][j])

            for k in range(len(df['perturbed_mask_index'].iloc[i])):
                mask_tokenized = tokenizer.encode(
                    df['auxiliary_tokens'].iloc[i][j])
                sep_index = mask_tokenized.index(sep_id)
                perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                    k] + sep_index + 1
                mask_tokenized[perturbed_mask_index] = mask_id

                mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(
                    torch.int64).to(device)

                outputs = mask_model(mask_tokenized, labels=tokenized)
                augment_tokenized[perturbed_mask_index] = int(
                    outputs.logits[:, perturbed_mask_index, :].argmax().cpu(
                    ).numpy())

            augment_tokenizeds.append(augment_tokenized)

        df['augment_token_id'].iloc[i] = augment_tokenizeds

    df['augment_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokens_lists = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokens_list = []

            for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1):

                tokens_list.append(
                    tokenizer.decode([df['augment_token_id'].iloc[i][j][k]]))

            sep_index = tokens_list.index(sep_token)
            tokens_list = tokens_list[sep_index + 1:]
            tokens_lists.append(tokens_list)

        df['augment_tokens'].iloc[i] = tokens_lists

    return df
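
# The helpers called inside AE (mask_embedding, dist, calculate_threshold) are not part
# of this snippet. A minimal sketch of two of them, assuming the perturbation distance
# is a plain Euclidean distance between [MASK] embeddings (the original may differ):
def mask_embedding(model, padded, mask_index):
    # Re-encode the perturbed ids and return the [MASK] position's last hidden state.
    input_ids = torch.tensor(padded).to(next(model.parameters()).device)
    with torch.no_grad():
        last_hidden_states = model(input_ids)
    return last_hidden_states[0][:, mask_index, :].cpu().numpy()


def dist(a, b):
    # Euclidean distance between two embedding vectors.
    return float(np.linalg.norm(a - b))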
Esempio n. 24
0
from transformers import BertTokenizer, BertForMaskedLM, GPT2Tokenizer, GPT2LMHeadModel

bert_name = 'bert-base-uncased'
gpt2_name = 'gpt2'
bert_dir = './models/bert'
gpt2_dir = './models/gpt2'

bert_model = BertForMaskedLM.from_pretrained(bert_name)
bert_tokenizer = BertTokenizer.from_pretrained(bert_name)
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_name)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name)

bert_model.save_pretrained(bert_dir)
bert_tokenizer.save_pretrained(bert_dir)
gpt2_model.save_pretrained(gpt2_dir)
gpt2_tokenizer.save_pretrained(gpt2_dir)
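# The saved copies can later be reloaded offline from the local directories, e.g.:
# bert_model = BertForMaskedLM.from_pretrained(bert_dir)
# gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_dir)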
        description=
        "Extracts some layers of the full BertForMaskedLM or RobertaForMaskedLM for Transfer Learned Distillation"
    )
    parser.add_argument("--model_type",
                        default="bert",
                        choices=["bert", "roberta"])
    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
    parser.add_argument(
        "--dump_checkpoint",
        default='serialization_dir/tf_bert-base-uncased_0247911.pth',
        type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()

    if args.model_type == 'bert':
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = 'bert'
    elif args.model_type == 'roberta':
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
        prefix = 'roberta'

    state_dict = model.state_dict()
    compressed_sd = {}

    for w in ['word_embeddings', 'position_embeddings']:
        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
            state_dict[f'{prefix}.embeddings.{w}.weight']
    for w in ['weight', 'bias']:
        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
            state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
    all_samples = [
        sample for sentence_samples in samples for sample in sentence_samples
    ]
    sample_sentences, sample_names, sample_masks = list(zip(*all_samples))
    sample_sentences, sample_names, sample_masks = (
        list(sample_sentences),
        list(sample_names),
        list(sample_masks),
    )

    print(len(sample_sentences))

    sys.exit(0)

    tokenizer = BertTokenizerFast.from_pretrained(args.tokenizer)
    model = BertForMaskedLM.from_pretrained(args.model).eval().cuda()
    cmp_model = BertForMaskedLM.from_pretrained(args.comparator).eval().cuda()

    metrics_output_path = args.model if args.metrics_output_path is None else args.metrics_output_path
    print(f"Saving results to {metrics_output_path}")

    torch.cuda.empty_cache()
    losses_under_model = batched_perplexity(model, tokenizer, sample_sentences,
                                            sample_masks)
    torch.cuda.empty_cache()
    losses_under_comparator = batched_perplexity(cmp_model, tokenizer,
                                                 sample_sentences,
                                                 sample_masks)

    loss_diff = losses_under_comparator - losses_under_model
Esempio n. 27
0
# https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb

# In[6]:

from transformers import BertForMaskedLM, BertConfig

configuration = BertConfig(
    vocab_size=80000,
    #     max_position_embeddings=512, # 512 + 2 more special tokens
    #     num_attention_heads=12,
    #     num_hidden_layers=12,
    #     type_vocab_size=1,
)
# configuration.vocab_size = 20000

model = BertForMaskedLM(config=configuration)
# model = RobertaForMaskedLM.from_pretrained('./Roberta/checkpoint-200000')

# Accessing the model configuration
# model.config

# # Initializing Tokenizer

# ## Rewrite Tokenizer of bert_itos_80k with special tokens in front

# In[9]:

from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
from fastai.text.data import TokenizeProcessor, NumericalizeProcessor
Esempio n. 28
0
def get_model(vocab_size):
    config = get_config(vocab_size)
    if transformer_type == 'roberta':
        return RobertaForMaskedLM(config=config)
    return BertForMaskedLM(config=config)
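# Example usage (assuming get_config and transformer_type are module-level, as the
# snippet implies, and that `tokenizer` is the matching tokenizer):
# model = get_model(vocab_size=tokenizer.vocab_size)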
Esempio n. 29
0
 def __init__(self, cfg, tokenizer):
     super().__init__(cfg)
     self.cfg = cfg
     self.bert = BertForMaskedLM.from_pretrained(cfg.MODEL.BERT_CKPT)
     self.tokenizer = tokenizer
Esempio n. 30
0
    def __init__(self):
        self.src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.tgt_tokenizer.bos_token = '<s>'
        self.tgt_tokenizer.eos_token = '</s>'

        #hidden_size and intermediate_size are both wrt all the attention heads. 
        #Should be divisible by num_attention_heads
        encoder_config = BertConfig(vocab_size=self.src_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12)

        decoder_config = BertConfig(vocab_size=self.tgt_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12,
                                    is_decoder=True)

        #Create encoder and decoder embedding layers.
        encoder_embeddings = torch.nn.Embedding(self.src_tokenizer.vocab_size, config.hidden_size, padding_idx=self.src_tokenizer.pad_token_id)
        decoder_embeddings = torch.nn.Embedding(self.tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=self.tgt_tokenizer.pad_token_id)

        encoder = BertModel(encoder_config)
        encoder.set_input_embeddings(encoder_embeddings.cpu())

        decoder = BertForMaskedLM(decoder_config)
        decoder.set_input_embeddings(decoder_embeddings.cpu())

        input_dirs = config.model_output_dirs

        suffix = "pytorch_model.bin"
        decoderPath = os.path.join(input_dirs['decoder'], suffix)
        encoderPath = os.path.join(input_dirs['encoder'], suffix)

        decoder_state_dict = torch.load(decoderPath)
        encoder_state_dict = torch.load(encoderPath)
        decoder.load_state_dict(decoder_state_dict)
        encoder.load_state_dict(encoder_state_dict)
        self.model = TranslationModel(encoder, decoder, None, None, self.tgt_tokenizer, config)
        self.model.cpu()


        #model.eval()
        self.model.encoder.eval()
        self.model.decoder.eval()