def train_process(config, train_load, train_sampler, model_name):
    # load source bert weights
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json"
        .format(model_name))
    # model_config = BertConfig()
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)

    checkpoint = torch.load(
        '../user_data/save_bert/{}_checkpoint.pth.tar'.format(model_name),
        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['status'], strict=False)
    print('*********** loaded pretrained MLM {} weights *************'.format(
        model_name))

    for param in model.parameters():
        param.requires_grad = True

    # 4) Move the model to its GPU before wrapping it with DDP
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
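    # bias and LayerNorm weights are excluded from weight decay (standard BERT fine-tuning practice)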

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    #     t_total = len(train_load) * config.num_train_epochs
    #     scheduler = get_linear_schedule_with_warmup(
    #         optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    #     )

    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model with DistributedDataParallel
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
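    # FGM (Fast Gradient Method): adversarial training that perturbs the word embeddings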
    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
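        # make the DistributedSampler reshuffle its shards differently at each epoch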
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()

        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)

            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            #             torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                fgm.attack()  # add an adversarial perturbation to the embeddings
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                loss_adv.backward()  # backprop, accumulating the adversarial gradients on top of the normal ones
                fgm.restore()  # restore the original embedding parameters

            optimizer.step()
        #             scheduler.step()

        # dev_auc = model_evaluate(config, model, valid_load)

        # synchronize all processes (and compute the distributed loss)
        torch.distributed.barrier()
        # reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        # if reduce_dev_auc > best_dev_auc:
        #     best_dev_auc = reduce_dev_auc
        #     is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'model_name:{},time:{},epoch:{}/{}'

        if config.local_rank in [0, -1]:
            print(
                msg.format(model_name, now, epoch + 1,
                           config.num_train_epochs))
            checkpoint = {"status": model.module.state_dict()}
            torch.save(
                checkpoint, '../user_data/save_model' + os.sep +
                '{}_checkpoint.pth.tar'.format(model_name))
            del checkpoint

    torch.distributed.barrier()
Example 2
def inference(args):
    # Check for CUDA
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained(args.bert_name)
    # Prepare jsons
    ind2organ = json.load(
        open(os.path.join(args.organs_dir_path, "ind2organ.json")))
    organ2label = json.load(
        open(os.path.join(args.organs_dir_path, "organ2label.json")))
    organ2voxels = json.load(
        open(os.path.join(args.organs_dir_path, "organ2voxels.json")))
    test_dataset = VoxelSentenceMappingTestRegDataset(args.test_json_path,
                                                      tokenizer, ind2organ)
    test_loader = DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        collate_fn=collate_pad_sentence_reg_test_batch,
    )
    # Create model
    config = BertConfig.from_pretrained(args.bert_name)
    model = nn.DataParallel(
        RegModel(args.bert_name, config, final_project_size=3)).to(device)
    # Load model
    model.load_state_dict(torch.load(args.checkpoint_path,
                                     map_location=device))
    # Set model in evaluation mode
    model.train(False)
    # Create evaluator
    evaluator = InferenceEvaluatorPerOrgan(
        ind2organ,
        organ2label,
        organ2voxels,
        args.voxelman_images_path,
        test_dataset.organ2count,
        len(test_dataset),
    )
    center = torch.from_numpy(VOXELMAN_CENTER)
    # Restart counters
    evaluator.reset_counters()
    for input_batch, organs_indices, _ in tqdm(test_loader):
        input_batch = {key: val.to(device) for key, val in input_batch.items()}
        output_mappings = (model(
            input_ids=input_batch["sentences"],
            attention_mask=input_batch["attn_mask"],
        ).cpu() * center)
        for output_mapping, organ_indices in zip(output_mappings,
                                                 organs_indices):
            evaluator.update_counters(output_mapping.numpy(),
                                      organ_indices.numpy())

    # report aggregate metrics once the full test set has been processed
    print(
        "The avg IOR on the test set is: "
        f"{evaluator.get_current_ior()} +/- {evaluator.get_ior_error_bar()}"
    )
    print(
        "The avg distance on the test set is: "
        f"{evaluator.get_current_distance()} +/- {evaluator.get_distance_error_bar()}"
    )
    print(
        "The avg miss distance on the test set is: "
        f"{evaluator.get_current_miss_distance()} +/- {evaluator.get_miss_distance_error_bar()}"
    )
    print("============================================")
    for organ_name in evaluator.organ2count.keys():
        if evaluator.get_current_ior_for_organ(organ_name) > -1:
            print(f"The avg IOR for {organ_name} is: "
                  f"{evaluator.get_current_ior_for_organ(organ_name)} +/- "
                  f"{evaluator.get_ior_error_bar_for_organ(organ_name)}")
            print(
                f"The avg NVD {organ_name} is: "
                f"{evaluator.get_current_distance_for_organ(organ_name)} +/- "
                f"{evaluator.get_distance_error_bar_for_organ(organ_name)}"
            )
            print(
                f"The avg NVD-O {organ_name} is: "
                f"{evaluator.get_current_miss_distance_for_organ(organ_name)} +/- "
                f"{evaluator.get_miss_distance_error_bar_for_organ(organ_name)}"
            )
            print("============================================")
Example 3
    mode="test",
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    sampler=SequentialSampler(val_dataset),
    collate_fn=nlpbook.data_collator,
    drop_last=False,
    num_workers=0,
)
# %% initialize model

from transformers import BertConfig, BertForSequenceClassification

pt_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=corpus.num_labels,
)

model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config=pt_model_config,
)

# %% prepare training
from ratsnlp.nlpbook.classification import ClassificationTask
task = ClassificationTask(model, args)

# %%
trainer = nlpbook.get_trainer(args)

# %%
Example 4
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, required=True,
                        choices=["rbert", "bert_em_cls", "bert_em_es", "bert_em_all"],
                        help="Model type")
    parser.add_argument("--model_dir", type=str, required=True, help="Path to model directory")
    parser.add_argument("--input_file", type=str, required=True, help="Path to input file")
    parser.add_argument("--output_file", type=str, required=True, help="Path to output file (to store predicted labels)")
    parser.add_argument("--eval_batch_size", type=int, default=32, help="Batch size for evaluation.")
    parser.add_argument("--no_cuda", action="store_true", help="Whether to use GPU for evaluation.")
    parser.add_argument("--overwrite_cache", action="store_true", help="Whether to overwrite cached feature file.")
    args = parser.parse_args()
    
    init_logger()
    logger.info("%s" % args)
    config = BertConfig.from_pretrained(args.model_dir)
    
    train_args = torch.load(os.path.join(args.model_dir, "training_args.bin"))
    logger.info("Training args: {}".format(train_args))
    train_args.eval_batch_size = args.eval_batch_size
    train_args.overwrite_cache = args.overwrite_cache
    
    # For BERT-EM, we have to use GPU because we fix device="cuda" in the code
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"

    # Check whether model exists
    if not os.path.exists(args.model_dir):
        raise Exception("Model doesn't exist! Train first!")

    # Load tokenizer
    tokenizer = load_tokenizer(train_args)
try:
    # detect an attached TPU cluster (standard TF pattern; the try line was missing from this excerpt)
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

model_name = 'bert-base-multilingual-cased'

config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

train = pd.read_csv('/kaggle/input/dataset/train.csv')
valid = pd.read_csv('/kaggle/input/dataset/val.csv')
test = pd.read_csv('/kaggle/input/dataset/test.csv')

#IMP DATA FOR CONFIG

AUTO = tf.data.experimental.AUTOTUNE


# Configuration
EPOCHS = 3
Example 6
def create_long_model(save_model_to, attention_window, max_pos,
                      pretrained_config, pretrained_checkpoint,
                      pretrained_tokenizer):
    """
    Convert RoBERTa into Long-Version
    :param save_model_to: the model save path
    :param attention_window: the long-attention defined above
    :param max_pos: extend the position embedding to max_pos=4096
    :return: modified model and tokenizer
    """
    config = BertConfig.from_pretrained(pretrained_config)
    model = BertForMaskedLM.from_pretrained(pretrained_checkpoint,
                                            config=config)
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_tokenizer,
                                                  model_max_length=max_pos)

    # extend position embedding
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape
    # RoBERTa reserves positions 0 and 1, so its embedding size is max_pos + 2
    # max_pos += 2  # not applied here: does this fit the BERT-based RoBERTa_zh? (see note below)
    """
    RoBERTa reserves positions 0 and 1;
    the BERT-based RoBERTa_zh, however, does not.
    """
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # init by duplication
    k = 0
    step = current_max_pos
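    # tile the original position-embedding table until the extended table is filled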
    while k < max_pos - 1:
        new_pos_embed[k:(
            k + step)] = model.bert.embeddings.position_embeddings.weight[0:]
        k += step
    model.bert.embeddings.position_embeddings.weight.data = new_pos_embed

    # Note: the BERT-based RoBERTa has no `position_ids` attribute on `bert.embeddings`, so the line below stays commented out
    # model.bert.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the modeling_bert.BertSelfAttention obj with LongformerSelfAttention
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.bert.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

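        # the global-attention projections start as independent copies of the local ones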
        longformer_self_attn.query_global = copy.deepcopy(
            layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(
            layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(
            layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
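
# A minimal usage sketch (the paths and model name below are assumptions, not taken from the original script):
# model, tokenizer = create_long_model(
#     save_model_to='./roberta_zh_long',
#     attention_window=512,
#     max_pos=4096,
#     pretrained_config='hfl/chinese-roberta-wwm-ext',
#     pretrained_checkpoint='hfl/chinese-roberta-wwm-ext',
#     pretrained_tokenizer='hfl/chinese-roberta-wwm-ext')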
Example 7
from transformers import BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader
import torch
from tools import start_debugger_on_exception
from dataset import DataSetBert
import numpy as np
start_debugger_on_exception()
train_dataset = DataSetBert(data_file='./data/data_train/train.csv')
val_dataset = DataSetBert(data_file='./data/data_train/val.csv')
test_dataset = DataSetBert(data_file='./data/data_train/test.csv')
from torch.utils.data import DataLoader
device = torch.device('cuda:6')
train_dataloader = DataLoader(train_dataset, batch_size=11, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=11, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=11, shuffle=True)
model_config = BertConfig.from_pretrained('bert-base-chinese')
model_config.num_hidden_layers = 3
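# NOTE: built from the config alone, so the encoder weights are randomly initialized (no pretrained weights are loaded)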
model = BertForSequenceClassification(model_config)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.config.max_position_embeddings = 1024
model.to(device)
model.train()
model.to(device)
import pdb
pdb.set_trace()
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
Example 8
def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args.seed)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s :: %(levelname)s :: %(message)s')

    if args.numnet_model is not None:
        config = BertConfig.from_pretrained(
            args.model_name, num_labels=1)  # 1 label for regression
        # if args.contrastive:
        #     model = ContrastiveElectra.from_pretrained(args.model_name, config=config)
        # else:
        model = BertForSequenceClassification.from_pretrained(args.model_name,
                                                              config=config)
        state_dicts = torch.load(args.numnet_model)
        if "model" in state_dicts:
            logging.info("Loading in mutual electra format state_dicts.")
            model.load_state_dict(state_dicts["model"], strict=False)
        else:
            logging.info("Loading model weights only.")
            model.load_state_dict(state_dicts, strict=False)
    else:
        config = ElectraConfig.from_pretrained(
            args.model_name, num_labels=1)  # 1 label for regression
        model = ElectraForSequenceClassification.from_pretrained(
            args.model_name, config=config)
        if args.local_model_path is not None:
            state_dicts = torch.load(args.local_model_path)
            model.load_state_dict(state_dicts["model"])

    tokenizer = ElectraTokenizer.from_pretrained(args.model_name,
                                                 do_lower_case=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # TODO enable multi-gpu training if necessary
    pretrain_train_dataset = DapoDataset(args.data_dir, "train",
                                         tokenizer) if args.pretrain else None
    pretrain_dev_dataset = DapoDataset(args.data_dir, "dev",
                                       tokenizer) if args.pretrain else None

    if args.train:
        if args.contrastive:
            train_dataset = ContrastiveDataset(args.data_dir, "train",
                                               tokenizer)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=False,
                                          num_workers=8,
                                          collate_fn=mutual_contrast_collate)
            dev_dataset = ContrastiveDataset(
                args.data_dir, "dev",
                tokenizer) if args.eval or args.test else None
            dev_dataloader = DataLoader(dev_dataset,
                                        batch_size=args.train_batch_size,
                                        shuffle=False,
                                        num_workers=8,
                                        collate_fn=mutual_contrast_collate
                                        ) if dev_dataset is not None else None
        else:
            train_dataset = MutualDataset(args.data_dir, "train", tokenizer)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=True,
                                          num_workers=8,
                                          collate_fn=mutual_collate)
            dev_dataset = MutualDataset(
                args.data_dir, "dev",
                tokenizer) if args.eval or args.test else None
            dev_dataloader = DataLoader(
                dev_dataset,
                batch_size=args.train_batch_size,
                shuffle=False,
                num_workers=8,
                collate_fn=mutual_collate) if dev_dataset is not None else None

    else:
        train_dataset, train_dataloader = None, None

    # TODO: add test_dataset if we want to submit to leaderboard

    pretrain_train_dataloader = DataLoader(
        pretrain_train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        num_workers=8,
        collate_fn=dapo_collate
    ) if pretrain_train_dataset is not None else None
    pretrain_dev_dataloader = DataLoader(
        pretrain_dev_dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        num_workers=8,
        collate_fn=dapo_collate) if pretrain_dev_dataset is not None else None

    # currently eval_batch_size = train_batch_size

    if args.pretrain:
        logging.info("Start pretraining...")
        args.eval = True
        trainer = Trainer(args, model, device, pretrain_train_dataloader,
                          pretrain_dev_dataloader)
        trainer.train()
        return  # fine-tuning should be done separately

    if args.train:
        logging.info("Start training...")
        trainer = Trainer(args, model, device, train_dataloader,
                          dev_dataloader)
        trainer.train()

    # TODO: currently testing is on the dev set
    if args.test:
        logging.info("Start testing...")
        tester = Tester(args, model, device, dev_dataset, dev_dataloader)
        tester.test()
torch.backends.cudnn.benchmark = False

DEVICE: str = "cuda" if torch.cuda.is_available() and USE_GPU else "cpu"

# Some path for the training phase
DATASET_PATH: str = '../../data/train.json'
DATASET_DEV_PATH: str = '../../data/dev.json'
DATASET_TEST_PATH: str = '../../data/test.json'
GLOVE_PATH: str = "../../model/glove.6B.300d.txt"  # pre-trained glove embeddings path

# read the dataset
sentences, labels = read_dataset(DATASET_PATH)
sentences_dev, labels_dev = read_dataset(DATASET_DEV_PATH)

# -- Initialize bert --
bert_config = BertConfig.from_pretrained(model_name, output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name, config=bert_config)

# -- net configuration -- it improves code modularity
net_configuration: dict = net_configurator(
    use_bert_embeddings=USE_BERT_EMBEDDINGS,
    use_crf=USE_CRF,
    use_biaffine_layer=USE_BIAFFINE_LAYER,
    use_pretrained=USE_GLOVE,
    use_dependecy_heads=USE_DEPENDENCY_HEADS,
    use_predicates=False,
    use_syntagnet=USE_SYNTAGNET)

dataset_train: SRL_Dataset = SRL_Dataset(sentences,
                                         labels,
def zero_percent_no_finetuning():
    parser = argparse.ArgumentParser()

    parser.add_argument("--test_data_path", required=True, type=str)
    parser.add_argument("--output_dir", required=True, type=str)
    parser.add_argument("--data_column", required=True, type=str)
    parser.add_argument("--label_column", required=True, type=str)
    parser.add_argument("--model_type", required=True)

    #parser.add_argument("--eval_split",
    #                    default=0.1,
    #                    type=float)
    #parser.add_argument("--test_split",
    #                    default=0.1,
    #                    type=float)
    parser.add_argument("--max_len", default=256, type=int)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--num_epochs", default=4, type=int)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)

    args = parser.parse_args()

    print("Setting the random seed...")
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print("Reading data...")
    df_test_data = pd.read_csv(args.test_data_path, sep="\t")
    test_data = df_test_data[args.data_column].tolist()
    test_labels = df_test_data[args.label_column].tolist()
    label_set = sorted(list(set(df_test_data[args.label_column].values)))
    test_labels = encode_labels(test_labels, label_set)

    print("loading model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if args.model_type == "shebert":
        tokenizer = BertTokenizer.from_pretrained(
            "../models/crosloengual-bert-pytorch/vocab.txt")
        config = BertConfig.from_pretrained(
            "../models/crosloengual-bert-pytorch/bert_config.json",
            num_labels=len(label_set))
        model = BertForSequenceClassification.from_pretrained(
            "../models/crosloengual-bert-pytorch/pytorch_model.bin",
            config=config)
    elif args.model_type == "mbert":
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=True)
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-multilingual-cased', num_labels=len(label_set))
    else:
        print("Wrong argument value for model type")
        sys.exit()

    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    log_path = os.path.join(args.output_dir, "log")

    print("Evaluating on the test set...")
    test_dataloader = prepare_labeled_data(test_data, test_labels, tokenizer,
                                           args.max_len, args.batch_size)
    metrics = bert_evaluate(model, test_dataloader, device)

    with open(log_path, 'a') as f:
        f.write("Acc: " + str(metrics['accuracy']) + "\n")
        f.write("F1: " + str(metrics['f1']) + "\n")

    print("Done.")
def zero_percent():
    parser = argparse.ArgumentParser()

    parser.add_argument("--test_data_path", required=True, type=str)
    parser.add_argument("--output_dir", required=True, type=str)
    parser.add_argument("--data_column", required=True, type=str)
    parser.add_argument("--label_column", required=True, type=str)
    parser.add_argument("--offensive_label", required=True, type=str)

    parser.add_argument("--tokenizer_file", type=str)
    parser.add_argument("--config_file", type=str, required=True)
    parser.add_argument("--model_file", type=str, required=True)

    #parser.add_argument("--eval_split",
    #                    default=0.1,
    #                    type=float)
    #parser.add_argument("--test_split",
    #                    default=0.1,
    #                    type=float)
    parser.add_argument("--max_len", default=256, type=int)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--num_epochs", default=4, type=int)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)

    args = parser.parse_args()

    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    log_path = os.path.join(args.output_dir, "log")

    print("Reading data...")
    df_test_data = pd.read_csv(args.test_data_path, sep="\t")
    df_test_data = consolidate_dataset_modified(df_test_data, args.data_column,
                                                args.label_column,
                                                args.offensive_label)
    test_data = df_test_data["data"].tolist()
    test_labels = df_test_data["labels"].tolist()
    print(test_labels)

    label_set = sorted(list(set(df_test_data["labels"].values)))
    test_labels = encode_labels(test_labels, label_set)
    print(test_labels)

    print("loading model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if args.tokenizer_file is not None:
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_file)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=True)
    config = BertConfig.from_pretrained(args.config_file,
                                        num_labels=len(label_set))
    model = BertForSequenceClassification.from_pretrained(args.model_file,
                                                          config=config)

    print("Evaluating on the test set...")
    test_dataloader = prepare_labeled_data(test_data, test_labels, tokenizer,
                                           args.max_len, args.batch_size)
    metrics = bert_evaluate(model, test_dataloader, device)

    with open(log_path, 'a') as f:
        f.write("Acc: " + str(metrics['accuracy']) + "\n")
        f.write("F1: " + str(metrics['f1']) + "\n")

    print("Done.")
Example 12
    def run(self):
        num_train_epochs = 3
        gradient_accumulation_steps = 1
        weight_decay = 0.0
        learning_rate = 5e-5
        adam_epsilon = 1e-8
        warmup_steps = 0
        seed = 42
        logging_steps = 50

        train_batch_size = 16 * max(1, torch.cuda.device_count())
        train_dataloader = DataLoader(SnliDataset(config_file=self.config_file,
                                                  mode=TRAIN).get_dataset(),
                                      batch_size=train_batch_size,
                                      shuffle=True)
        dev_loader = DataLoader(SnliDataset(config_file=self.config_file,
                                            mode=DEV).get_dataset(),
                                batch_size=train_batch_size,
                                shuffle=True)
        test_loader = DataLoader(SnliDataset(config_file=self.config_file,
                                             mode=TEST).get_dataset(),
                                 batch_size=train_batch_size,
                                 shuffle=True)
        t_total = len(
            train_dataloader) // gradient_accumulation_steps * num_train_epochs
        if self.mode == 'train':
            self.logger.info('Loading pretrained model')
            config = BertConfig.from_pretrained(BERT_MODEL, num_labels=3)
            model = BertForSequenceClassification.from_pretrained(
                BERT_MODEL, config=config)
        else:
            self.logger.info('Loading trained model from local directory')
            config = BertConfig.from_json_file(
                f'{self.path}/checkpoint-best/config.json')
            model = BertForSequenceClassification.from_pretrained(
                f'{self.path}/checkpoint-best/pytorch_model.bin',
                config=config)

        if torch.cuda.device_count() == 1:
            model = model.cuda()
            self.logger.info('GPUs used: 1')
        elif torch.cuda.device_count() > 1:
            model = model.cuda()
            model = torch.nn.DataParallel(model)
            self.logger.info(f'GPUs used: {torch.cuda.device_count()}')
        else:
            self.logger.warn('No GPUs used!')

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=learning_rate,
                          eps=adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)

        global_step, accuracy = 0, 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(num_train_epochs), desc="Epoch")
        # Added here for reproducibility (even between python 2 and 3)
        self.set_seed(seed)
        if self.mode == 'train':
            self.logger.info('Running training')
            for _ in train_iterator:
                epoch_iterator = tqdm(train_dataloader, desc="Train Iteration")
                for step, batch in enumerate(epoch_iterator):
                    model.train()
                    batch = tuple(t.cuda() for t in batch)
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2],
                        'labels': batch[3]
                    }
                    outputs = model(**inputs)
                    # model outputs are always tuple in transformers (see doc)
                    loss = outputs[0]

                    if torch.cuda.device_count() > 1:
                        loss = loss.mean()  # average the loss over multi-GPU parallel training
                    if gradient_accumulation_steps > 1:
                        loss = loss / gradient_accumulation_steps

                    loss.backward()

                    tr_loss += loss.item()
                    if (step + 1) % gradient_accumulation_steps == 0:
                        optimizer.step()
                        scheduler.step()  # Update learning rate schedule
                        model.zero_grad()
                        global_step += 1
                        if global_step % logging_steps == 0:
                            epoch_iterator.set_description(
                                f'Loss: {(tr_loss - logging_loss)/logging_steps}'
                            )
                            logging_loss = tr_loss
                eval_acc = self.evaluate(dev_loader, model)
                self.logger.info(f'Dev accuracy: {eval_acc}')
                if accuracy < eval_acc:
                    accuracy = eval_acc
                    output_dir = os.path.join(self.path,
                                              'checkpoint-{}'.format('best'))
                    self.logger.info(f'Saving best model to {output_dir}')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    # torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        else:
            eval_acc = self.evaluate(train_dataloader, model)
            self.logger.info(f'Train Accuracy: {eval_acc}')
            eval_acc = self.evaluate(dev_loader, model)
            self.logger.info(f'Dev Accuracy: {eval_acc}')
            eval_acc = self.evaluate(test_loader, model)
            self.logger.info(f'Test Accuracy: {eval_acc}')
Example 13
def write_results(dataset, lvl, tokens, epochs, batch, test_labels, train_in,
                  test_in, model):
    """
    Evaluate all runs for a model and generate the corresponding row of the results table (for the flat and per-level approaches).
    :param dataset: dataset to test on
    :param lvl: lvl to test
    :param tokens: maximal token length
    :param epochs: maximal epochs the model was trained on
    :param batch: batch size for evaluating, same as for training
    :param test_labels: labels to used for testing
    :param train_in: what was used for training
    :param test_in: what will be used for testing
    :param model: path to model
    :return: the result table row corresponding to the analysis of the given model
    """
    # Simulate config file
    arguments = {
        'model_name': 'bert-base-uncased',
        'max_length': tokens,
        'epochs': epochs,
        'batch_size': batch,
        'data_path': dataset,
        'lvl': lvl,
        'test_labels': test_labels
    }

    # Prepare tokenization for evaluation
    model_name = arguments['model_name']
    config = BertConfig.from_pretrained(model_name)
    config.output_hidden_states = False
    data, trunest_class_names, test_target = BERT_per_lvl.get_test_data(
        arguments)  # Get test data
    x = BERT_per_lvl.get_tokenized(model_name, config, data,
                                   tokens)  # Tokenize test data
    runs = [
        filename
        for filename in glob.iglob(model + "/**/model", recursive=True)
    ]  # get the 3 runs for each model

    res_list = []
    for run in runs:  # for each run evaluate
        res_list.append(evaluate(run, x, batch,
                                 test_target))  # f1_score, accuracy_score

    # Mean and std for the 3 runs
    f1_mean, accu_mean = np.mean(res_list, axis=0)
    f1_std, accu_std = np.std(res_list, axis=0)
    f1_string = '{:.3f}({:.3f})'.format(f1_mean, f1_std)
    acc_string = '{:.3f}({:.3f})'.format(accu_mean, accu_std)
    # For the levels not predicted by this model give "-" out
    aux = ['-'] * 6
    aux[(lvl - 1) * 2] = acc_string
    aux[(lvl - 1) * 2 + 1] = f1_string

    # Get the maximum of how many epochs the runs trained before early stopping kicked in
    _, _, leng, _ = get_model_plot(model)
    used_ep = len(leng[0])

    # Format data to generate a row of the results table
    table_data = [
        "Per_lvl", dataset, '{}({})'.format(epochs, used_ep), tokens, batch,
        len(runs), train_in, "Cat" + str(lvl), test_in
    ] + aux
    return table_data
Example 14
import torch
from collections import Counter
from transformers import BertTokenizer, BertConfig, BertForQuestionAnswering


model = BertForQuestionAnswering.from_pretrained("./final_model_split")
tokenizer = BertTokenizer.from_pretrained("./final_model_split")
config = BertConfig.from_pretrained("./final_model_split")


def f1_score(pred, ref):
    pred_tokens = list(pred)
    ref_tokens = list(ref)
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(ref_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def evaluate(predictions, references):
    f1 = total = 0
    for ref, pred in zip(references, predictions):
        total += 1
        f1 += f1_score(pred, ref)
    f1 = 100.0 * f1 / total
    return f1
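
# Quick sanity check of the character-level F1 above (hypothetical strings, not from the original script):
# f1_score("北京大学", "北京天学")  -> precision = recall = 3/4, F1 = 0.75
# evaluate(["北京大学"], ["北京天学"])  -> 75.0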
Example 15
    start_loss = nn.CrossEntropyLoss(ignore_index=-1)(start_preds, start_labels)
    end_loss = nn.CrossEntropyLoss(ignore_index=-1)(end_preds, end_labels)
    class_loss = nn.CrossEntropyLoss()(class_preds, class_labels)
    return start_loss + end_loss + class_loss


def loss_fn_classifier(preds, labels):
    _, _, class_preds = preds
    _, _, class_labels = labels

    class_loss = nn.CrossEntropyLoss()(class_preds, class_labels)

    return class_loss


config = BertConfig.from_pretrained(bert_model)
config.num_labels = 5
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained(
    '/data/sv/CS230_Spring-2020/Guanshuo_TFQA_1stplace/code', config=config)
model = model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
Example 16
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--adv_type',
                        default='fgm',
                        type=str,
                        choices=['fgm', None])

    # /home/bert/ernie1.0_base_zh/torch
    # /home/bert/bert_base_zh/torch
    # /home/bert/chinese_roberta_wwm_large_ext_pytorch
    # /home/bert/roberta_wwm_base_ext_zh/torch
    parser.add_argument(
        "--model_name_or_path",
        default='/home/bert/chinese_roberta_wwm_large_ext_pytorch',
        type=str,
        help="Path to pre-trained model ",
    )

    parser.add_argument(
        "--data_dir",
        default='../data/Dataset/',
        type=str,
        help="Path to data ",
    )
    parser.add_argument(
        "--task_name",
        default='pair',
        type=str,
        help="The name of the task to train selected in the list: " +
        ", ".join(PROCESSORS.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default='../user_data/tmp_data/checkpoints',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--max_seq_length",
        default=64,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_eval_during_train",
        action="store_true",
        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Set this flag if you are using an uncased model.",
    )

    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=32,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=32,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_rate",
                        default=0.1,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")

    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir",
        type=bool,
        default=True,
        help="Overwrite the content of the output directory",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument(
        "--threads",
        type=int,
        default=10,
        help="multiple threads for converting example to features")

    args = parser.parse_args()

    # args.output_dir = os.path.join(args.output_dir, args.task_name)
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in PROCESSORS:
        raise ValueError("Task not found: %s" % (args.task_name))

    processor = PROCESSORS[args.task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        config = BertConfig.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case,
        )
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config)

        if args.local_rank == 0:
            torch.distributed.barrier(
            )  # Make sure only the first process in distributed training will download model & vocab

        model.to(args.device)
        global_step = train(args, tokenizer, model)
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        # Save the trained model and the tokenizer
        if (args.local_rank == -1 or torch.distributed.get_rank()
                == 0) and (not args.do_eval_during_train):
            output_dir = args.output_dir
            if not os.path.exists(output_dir) and args.local_rank in [-1, 0]:
                os.makedirs(output_dir)
            logger.info("Saving model checkpoint to %s", output_dir)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

    if args.do_eval and args.local_rank in [-1, 0]:
        output_dir = args.output_dir
        model = BertForSequenceClassification.from_pretrained(output_dir)
        model.to(args.device)
        tokenizer = BertTokenizer.from_pretrained(
            output_dir, do_lower_case=args.do_lower_case)
        result, _ = evaluate(args, model, tokenizer=tokenizer)
        output_eval_file = os.path.join(output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for k, v in result.items():
                logger.info("  {} : {}".format(k, v))
                writer.write("{} : {}\n".format(k, v))
Example 17
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    if getattr(args, "in_huggingface", False):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(
            args.in_huggingface, config=model_config
        )
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if getattr(args, "in_model", None) is not None:
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df, open_fn, batch_size=batch_size, num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch_input in enumerate(dataloader):
            batch_input = utils.any2device(batch_input, device)
            batch_output = model(**batch_input)
            mask = (
                batch_input["attention_mask"].unsqueeze(-1)
                if args.mask_for_max_length
                else None
            )

            if utils.check_ddp_wrapped(model):
                # using several GPUs
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states

            else:
                # using CPU or a single GPU
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            batch_features = process_bert_output(
                bert_output=batch_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for layer_name, layer_value in batch_features.items():
                    layer_name = (
                        layer_name
                        if isinstance(layer_name, str)
                        else f"{layer_name:02d}"
                    )
                    _, embedding_size = layer_value.shape
                    features[layer_name] = np.memmap(
                        f"{args.out_prefix}.{layer_name}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(
                idx * batch_size, min((idx + 1) * batch_size, num_samples)
            )
            for layer_name2, layer_value2 in batch_features.items():
                layer_name2 = (
                    layer_name2
                    if isinstance(layer_name2, str)
                    else f"{layer_name2:02d}"
                )
                features[layer_name2][indices] = _detach(layer_value2)
Example 18
from transformers import BertTokenizer, BertModel, BertConfig
from torch.nn.utils.rnn import pad_sequence

if len(sys.argv) < 4:
    print("Usage: python makecache.py cacheword_file processed_dir output_file", file=sys.stderr)
    sys.exit(-1)

cachewordfile = sys.argv[1]
processed_dir = sys.argv[2]
outputfile = sys.argv[3]

print('Initializing model', file=sys.stderr)

# adjust your model here
config = BertConfig.from_pretrained("bert-base-german-cased", output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")
model = BertModel.from_pretrained("bert-base-german-cased", config=config)
assert model.config.output_hidden_states
model.to('cuda')

model.eval()
torch.set_grad_enabled(False)

with open(cachewordfile) as f:
    cachewords = [x.strip() for x in f.readlines()]

word_index_map = dict(zip(cachewords, range(len(cachewords))))
tokenized_docs = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir)]

Example 19
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model name")
    parser.add_argument("--vocab_file",
                        default="",
                        type=str,
                        help="vocab file path if not the same as model name")
    # parser.add_argument("--tokenizer_name", default="", type=str)
    parser.add_argument("--max_query_len", default=64, type=int)
    parser.add_argument("--max_seq_len", default=512, type=int)
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_eval", action="store_true")

    parser.add_argument("--epoch", default=10, type=int)
    parser.add_argument("--train_batch_size", default=32, type=int)
    parser.add_argument("--eval_batch_size", default=32, type=int)
    parser.add_argument("--learning_rate", default=1e-6, type=float)
    parser.add_argument("--num_training_steps", default=10000, type=int)
    parser.add_argument("--num_labels", default=2, type=int)
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device

    params = {
        'batch_size': args.train_batch_size,
        'shuffle': True,
        'num_workers': 8,
        'collate_fn': my_collate_fn
    }

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    config = BertConfig.from_pretrained(args.config_name,
                                        num_labels=args.num_labels)
    config.num_labels = 2
    print(config)
    model = BertForPassageRerank.from_pretrained(args.model_name_or_path,
                                                 config=config)

    model.to(args.device)

    if args.do_train:
        print("training...")
        params = {
            'batch_size': args.train_batch_size,
            'shuffle': True,
            'num_workers': 2,
            'collate_fn': my_collate_fn
        }
        # tokenizer = BertTokenizer.from_pretrained(
        #   args.tokenizer_name, do_lower_case=True)

        train_set = PassageData(args.train_file, tokenizer, args.max_query_len,
                                args.max_seq_len)
        dataloader = DataLoader(train_set, **params)
        num_train_each_epoch = len(dataloader)
        print("step: ", num_train_each_epoch)
        num_training_steps = len(dataloader) * args.epoch
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps)
        for ep in tqdm(range(args.epoch)):

            running_loss = 0.0
            step = 0
            for data in tqdm(dataloader):
                step += 1
                # data = data.to(args.device)
                inputs_ids, masks, segments_ids, \
                    labels = [x.to(args.device) for x in data]
                outputs = model(inputs_ids, masks, segments_ids, labels)
                loss = outputs

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                if step % 500 == 0:
                    print("loss:", loss)
                running_loss += loss.item()

    elif args.do_eval:

        print("evaling...")
        model.eval()

        idmap = "./data/ids_map.json"
        with open(idmap, 'r') as f:
            id_map = json.load(f)
        q_ids = id_map['q_id']
        qid_to_pid = id_map['qid_to_pid']
        params = {
            'batch_size': args.eval_batch_size,
            'shuffle': False,
            'num_workers': 4,
            'collate_fn': my_collate_fn
        }
        eval_set = PassageData(args.eval_file, tokenizer, args.max_query_len,
                               args.max_seq_len)
        dataloader = DataLoader(eval_set, **params)
        results = []
        count = 0
        fw = open('./data/output.tsv', 'w')
        i = 0
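        # score each batch of passages; every 1000 accumulated scores belong to
        # one query, whose candidates are then ranked and written as qid/pid/rank rows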
        for data in dataloader:
            if count == 2:
                break
            i += 1
            print(i)
            inputs_ids, masks, \
                segments_ids = [x.to(args.device) for x in data]
            with torch.no_grad():
                result = model(inputs_ids, masks, segments_ids)
            # print("result: ", result)

            for res in result:
                results.append(res[1])
                if len(results) == 1000:
                    # all 1000 candidate scores for the current query collected
                    q_id = q_ids[count]
                    scores = torch.stack(results)
                    pred_passages = torch.argsort(scores,
                                                  descending=True,
                                                  dim=-1)
                    rank = 1
                    for idx in pred_passages:
                        p_id = qid_to_pid[q_id][idx.item()]
                        if p_id != '000000':
                            fw.write(q_id + '\t' + p_id + '\t' +
                                     str(rank) + '\n')
                            rank += 1
                    count += 1
                    results = []
        fw.close()
Example no. 20
    label = torch.tensor(data=label).type(torch.LongTensor)
    return input_ids, token_type_ids, attention_mask, label


print("***********load test data*****************")

config = roBerta_Config()
vocab = Vocab()
train_data, valid_data, test_data = vocab.get_train_dev_test()
test_dataset = BuildDataSet(test_data)
test_load = DataLoader(dataset=test_dataset,
                       batch_size=config.batch_size,
                       shuffle=False,
                       collate_fn=collate_fn)

print("***********load model weight*****************")

model_config = BertConfig.from_pretrained(
    pretrained_model_name_or_path="bert_source/bert_config.json")
model = BertForSequenceClassification(config=model_config)
model.load_state_dict(torch.load('save_bert/best_model.pth.tar'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
config.device = device

print("***********make predict for test file*****************")

predict = model_infer(model, config, test_load)
submit_result(predict)
print("***********done*****************")
Example no. 21
import csv

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (WEIGHTS_NAME, BertConfig,
                          BertForSequenceClassification, BertTokenizer)
# WarmupLinearSchedule only exists in older transformers releases; it was later
# replaced by get_linear_schedule_with_warmup
from transformers import AdamW, WarmupLinearSchedule

testDataPath = './Data/Bert/bert_end_to_end.csv'
maxLen = 512

pretrained_weights = 'bert-base-uncased'
config = BertConfig.from_pretrained('./Models/Bert', num_labels=2)
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertForSequenceClassification.from_pretrained('./Models/Bert',
                                                      config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Q, T, A, L = [], [], [], []
with open(testDataPath) as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            line_count += 1
        else:
Example no. 22
def inference(onnx_model, model_dir, examples, fast_tokenizer, num_threads):
    quantized_str = ''
    if 'quantized' in onnx_model:
        quantized_str = 'quantized'
    onnx_inference = []
    pytorch_inference = []
    # onnx session
    options = ort.SessionOptions()
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.intra_op_num_threads = 1
    ort_session = ort.InferenceSession(onnx_model, options)
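    # a single ONNX Runtime session is reused for both the batch-tokenized and
    # the one-by-one benchmarks below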
    # pytorch pretrained model and tokenizer
    if fast_tokenizer:
        tokenizer = BertTokenizerFast.from_pretrained(model_dir)
        tokenizer_str = "BertTokenizerFast"

    else:
        tokenizer = BertTokenizer.from_pretrained(model_dir)
        tokenizer_str = "BertTokenizer"
    config = BertConfig.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir,
                                                          config=config)
    #model.to("cpu")
    print(
        "**************** {} ONNX inference with batch tokenization and with {} tokenizer****************"
        .format(quantized_str, tokenizer_str))
    start_onnx_inference_batch = time.time()
    start_batch_tokenization = time.time()
    tokens_dict = tokenizer.batch_encode_plus(examples, max_length=128)
    total_batch_tokenization_time = time.time() - start_batch_tokenization
    total_inference_time = 0
    total_build_label_time = 0
    for i in range(len(examples)):
        """
        Onnx inference with batch tokenization
        """
        tokens = get_tokens(tokens_dict, i)
        #inference
        start_inference = time.time()
        ort_outs = ort_session.run(None, tokens)
        total_inference_time = total_inference_time + (time.time() -
                                                       start_inference)
        #build label
        start_build_label = time.time()
        torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32)
        onnx_logits = F.softmax(torch_onnx_output, dim=1)
        logits_label = torch.argmax(onnx_logits, dim=1)
        label = logits_label.detach().cpu().numpy()
        onnx_inference.append(label[0])
        total_build_label_time = total_build_label_time + (time.time() -
                                                           start_build_label)
    end_onnx_inference_batch = time.time()
    print("Total batch tokenization time (in seconds): ",
          total_batch_tokenization_time)
    print("Total inference time (in seconds): ", total_inference_time)
    print("Total build label time (in seconds): ", total_build_label_time)
    print(
        "Duration ONNX inference (in seconds) with {} and batch tokenization: "
        .format(tokenizer_str),
        end_onnx_inference_batch - start_onnx_inference_batch)

    print(
        "****************{} ONNX inference without batch tokenization and with {} tokenizer****************"
        .format(quantized_str, tokenizer_str))
    start_onnx_inference_no_batch = time.time()
    total_tokenization_time = 0
    total_inference_time = 0
    total_build_label_time = 0
    for example in examples:
        """
        Onnx inference without batch tokenization 
        """
        #input_ids, input_mask, segment_ids = preprocess(tokenizer, example)
        #tokenization
        start_tokenization = time.time()
        tokens = tokenizer.encode_plus(example)
        tokens = {name: np.atleast_2d(value) for name, value in tokens.items()}
        total_tokenization_time = total_tokenization_time + (
            time.time() - start_tokenization)
        #inference
        start_inference = time.time()
        ort_outs = ort_session.run(None, tokens)
        total_inference_time = total_inference_time + (time.time() -
                                                       start_inference)
        #build_label
        start_build_label = time.time()
        torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32)
        onnx_logits = F.softmax(torch_onnx_output, dim=1)
        logits_label = torch.argmax(onnx_logits, dim=1)
        label = logits_label.detach().cpu().numpy()
        onnx_inference.append(label[0])
        total_build_label_time = total_build_label_time + (time.time() -
                                                           start_build_label)

    end_onnx_inference_no_batch = time.time()
    print("One-by-one total tokenization time (in seconds): ",
          total_tokenization_time)
    print("Total inference time (in seconds): ", total_inference_time)
    print("Total build label time (in seconds): ", total_build_label_time)
    print(
        "Duration ONNX inference (in seconds) with {} and one-by-one tokenization: "
        .format(tokenizer_str),
        end_onnx_inference_no_batch - start_onnx_inference_no_batch)

    print(
        "****************Torch inference without batch tokenization, without quantization and with {} tokenizer****************"
        .format(tokenizer_str))
    start_torch_inference_no_quantization = time.time()
    total_tokenization_time = 0
    total_inference_time = 0
    total_build_label_time = 0
    for example in examples:
        """
        Pretrained bert pytorch model
        """
        # tokenization
        start_tokenization = time.time()
        input_ids, input_mask, segment_ids = preprocess(tokenizer, example)
        total_tokenization_time = total_tokenization_time + (
            time.time() - start_tokenization)
        # inference
        start_inference = time.time()
        torch_out = inference_pytorch(model,
                                      input_ids,
                                      input_mask,
                                      segment_ids,
                                      quantization=False,
                                      num_threads=num_threads)
        total_inference_time = total_inference_time + (time.time() -
                                                       start_inference)
        # build label
        start_build_label = time.time()
        logits_label = torch.argmax(torch_out, dim=1)
        label = logits_label.detach().cpu().numpy()
        pytorch_inference.append(label[0])
        total_build_label_time = total_build_label_time + (time.time() -
                                                           start_build_label)

    end_torch_inference_no_quantization = time.time()
    print("One-by-one total tokenization time (in seconds): ",
          total_tokenization_time)
    print("Total inference time (in seconds): ", total_inference_time)
    print("Total build label time (in seconds): ", total_build_label_time)
    print(
        "Duration PyTorch inference (in seconds) with {}, without quantization and with {} threads: "
        .format(tokenizer_str,
                num_threads), end_torch_inference_no_quantization -
        start_torch_inference_no_quantization)

    print(
        "****************Torch inference without batch tokenization, with quantization and with {} tokenizer****************"
        .format(tokenizer_str))

    start_torch_inference_w_quantization = time.time()
    total_tokenization_time = 0
    total_inference_time = 0
    total_build_label_time = 0
    for example in examples:
        """
        Pretrained bert pytorch model
        """
        # tokenization
        start_tokenization = time.time()
        input_ids, input_mask, segment_ids = preprocess(tokenizer, example)
        total_tokenization_time = total_tokenization_time + (
            time.time() - start_tokenization)
        # inference
        start_inference = time.time()
        torch_out = inference_pytorch(model,
                                      input_ids,
                                      input_mask,
                                      segment_ids,
                                      quantization=True,
                                      num_threads=num_threads)
        total_inference_time = total_inference_time + (time.time() -
                                                       start_inference)
        # build label
        start_build_label = time.time()
        logits_label = torch.argmax(torch_out, dim=1)
        label = logits_label.detach().cpu().numpy()
        pytorch_inference.append(label[0])
        total_build_label_time = total_build_label_time + (time.time() -
                                                           start_build_label)

    end_torch_inference_w_quantization = time.time()
    print("One-by-one total tokenization time (in seconds): ",
          total_tokenization_time)
    print("Total inference time (in seconds): ", total_inference_time)
    print("Total build label time (in seconds): ", total_build_label_time)
    print(
        "Duration PyTorch inference (in seconds) with {} and with quantization and with {} threads: "
        .format(tokenizer_str,
                num_threads), end_torch_inference_w_quantization -
        start_torch_inference_w_quantization)

    #
    # # compare ONNX Runtime and PyTorch results
    # np.testing.assert_allclose(to_numpy(torch_out), onnx_logits, rtol=1e-03, atol=1e-05)
    #
    # print("Exported model has been tested with ONNXRuntime, and the result looks good!")
    return onnx_inference, pytorch_inference
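For reference, a minimal and entirely hypothetical driver for the benchmark above; the model paths and sample sentences are placeholders, not part of the original example.

if __name__ == "__main__":
    sample_texts = ["the movie was great", "the plot made no sense"]
    onnx_preds, torch_preds = inference(
        onnx_model="models/bert.quantized.onnx",  # assumed path
        model_dir="models/bert-finetuned",        # assumed path
        examples=sample_texts,
        fast_tokenizer=True,
        num_threads=1)
    print("ONNX labels:", onnx_preds)
    print("PyTorch labels:", torch_preds)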
Example no. 23
    def __init__(self):
        super().__init__()
        config = BertConfig.from_pretrained("bert-base-uncased")
        self.model = BertModel(config)
Example no. 24
def train_and_test():

    visual_features = pkl.load(
        open('tf_features/visual_features_facenet.pkl', 'rb'))
    audio_features = pkl.load(open('tf_features/audio_features.pkl', 'rb'))
    x = pkl.load(open('tf_features/linguistic_features.pkl', 'rb'))
    token_type_ids = pkl.load(open('tf_features/token_type_ids.pkl', 'rb'))
    attention_mask = pkl.load(open('tf_features/attention_mask.pkl', 'rb'))
    labels = pkl.load(open('tf_features/labels.pkl', 'rb'))
    cv5_ids = pkl.load(open('tf_features/cv5_ids.pkl', 'rb'))
    visual_dim = visual_features.shape[-1]
    audio_dim = audio_features.shape[-1]
    print(visual_dim, audio_dim)

    sp = cv5_ids[0]
    train_l, train_labels = x[sp[0]], labels[sp[0]]
    train_v = visual_features[sp[0]]
    train_a = audio_features[sp[0]]

    test_l, test_labels = x[sp[1]], labels[sp[1]]
    test_v = visual_features[sp[1]]
    test_a = audio_features[sp[1]]
    print(train_v.shape)

    train_token_type_ids, test_token_type_ids, train_attention_mask, test_attention_mask = \
        token_type_ids[sp[0]], token_type_ids[sp[1]], attention_mask[sp[0]], attention_mask[sp[1]]

    # shuffle training data for batch reading
    n_train = len(train_v)
    n_eval = len(test_v)
    perm = np.random.permutation(n_train)
    train_l, train_a, train_v = train_l[perm], train_a[perm], train_v[perm]
    print(train_l.shape, train_a.shape, train_v.shape)
    train_labels = np.array(train_labels)[perm]
    train_token_type_ids, train_attention_mask = train_token_type_ids[
        perm], train_attention_mask[perm]

    train_l, test_l = torch.LongTensor(train_l), torch.LongTensor(test_l)
    train_labels, test_labels = torch.LongTensor(train_labels), torch.LongTensor(test_labels)
    train_token_type_ids = torch.LongTensor(train_token_type_ids)
    test_token_type_ids = torch.LongTensor(test_token_type_ids)

    train_a, test_a = torch.FloatTensor(train_a), torch.FloatTensor(test_a)
    train_v, test_v = torch.FloatTensor(train_v), torch.FloatTensor(test_v)

    train_attention_mask = torch.FloatTensor(train_attention_mask)
    test_attention_mask = torch.FloatTensor(test_attention_mask)

    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=3)
    config.visual_dim = visual_dim
    config.audio_dim = audio_dim
    bert_external = BertModel.from_pretrained('bert-base-uncased').to('cuda')
    bert_insert = mBertModel(config)
    bert_insert.embeddings = bert_external.embeddings
    bert_insert.encoder = bert_external.encoder
    bert_insert.pooler = bert_external.pooler
    model = mBertModel(config).to('cuda')
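    # note: bert_insert carries the pretrained embeddings/encoder/pooler, but the
    # model trained below is a freshly initialized mBertModel; presumably the
    # pretrained submodules are meant to be wired into `model` as well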

    eval_every = 5
    batch_size = 32
    test_batch_size = 4
    max_epochs = 500
    t_total = math.ceil(n_train / batch_size) * max_epochs
    lr = 2e-5
    epsilon = 1e-8
    max_grad_norm = 1.0
    weight_decay = 0.0

    optimizer, scheduler = get_optimizers(model,
                                          learning_rate=lr,
                                          adam_epsilon=epsilon,
                                          weight_decay=weight_decay,
                                          num_training_steps=t_total)

    # loss_fn = torch.nn.CrossEntropyLoss().cuda()
    model.train()
    model.zero_grad()

    day = time.localtime().tm_mday
    minute = time.localtime().tm_min
    hour = time.localtime().tm_hour
    save_dir = 'fine_tuning_checkpoints/' + '-%d-%d-%d/' % (day, hour, minute)
    # os.mkdir(save_dir)

    for ep in range(max_epochs):
        idx = 0
        avg_loss = 0
        n_batch = 0
        model.train()
        while idx < n_train:
            optimizer.zero_grad()
            batch_l = train_l[idx:(idx + batch_size)].to('cuda')
            batch_v = train_v[idx:(idx + batch_size)].to('cuda')
            batch_a = train_a[idx:(idx + batch_size)].to('cuda')
            batch_ty = train_token_type_ids[idx:(idx + batch_size)].to('cuda')
            batch_am = train_attention_mask[idx:(idx + batch_size)].to('cuda')
            ans = train_labels[idx:(idx + batch_size)].to('cuda')
            idx += batch_size
            preds = model(input_ids=batch_l,
                          input_visual=batch_v,
                          input_audio=batch_a,
                          token_type_ids=batch_ty,
                          attention_mask=batch_am,
                          labels=ans)
            loss = preds[0]
            # print(preds, ans)
            loss.backward()
            # print(loss.data.cpu().numpy())
            avg_loss += loss.data.cpu().numpy()
            n_batch += 1.

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            torch.cuda.empty_cache()

        avg_loss = avg_loss / n_batch
        print("epoch: %d avg_loss: %f" % (ep + 1, avg_loss))

        del batch_l, batch_v, batch_a, batch_ty, batch_am, ans
        torch.cuda.empty_cache()
        # time.sleep(20)

        if ep % eval_every == 0:
            idx = 0
            model.eval()
            eval_preds = np.array([])
            while idx < n_eval:
                test_batch_v = test_v[idx:(idx + test_batch_size)].to('cuda')
                test_batch_l = test_l[idx:(idx + test_batch_size)].to('cuda')
                test_batch_a = test_a[idx:(idx + test_batch_size)].to('cuda')
                test_batch_ty = test_token_type_ids[idx:(
                    idx + test_batch_size)].to('cuda')
                test_batch_am = test_attention_mask[idx:(
                    idx + test_batch_size)].to('cuda')
                test_ans = test_labels[idx:(idx + test_batch_size)].to('cuda')
                # time.sleep(20)
                # exit()
                test_pred = model(input_ids=test_batch_l,
                                  input_visual=test_batch_v,
                                  input_audio=test_batch_a,
                                  token_type_ids=test_batch_ty,
                                  attention_mask=test_batch_am,
                                  labels=test_ans)
                scores = test_pred[1]
                _, batch_eval_preds = scores.data.cpu().max(1)
                eval_preds = np.concatenate((eval_preds, batch_eval_preds),
                                            axis=-1)
                idx += test_batch_size
                torch.cuda.empty_cache()

            del test_batch_l, test_batch_v, test_batch_a, test_batch_ty, test_batch_am, test_ans
            torch.cuda.empty_cache()
            # metrics
            precision, recall, fscore, support = precision_recall_fscore_support(
                test_labels.cpu().numpy(),
                eval_preds,
                labels=[0, 1, 2],
                average=None)

            print(
                float(sum(eval_preds == test_labels.cpu().numpy())) /
                len(eval_preds))
            print(precision, recall, fscore, support)
            print('saving:')
            '''model_dir = save_dir + '%d' % (ep+1)
Example no. 25
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="数据文件目录,因当有train.text dev.text")

    parser.add_argument("--vob_file",
                        default=None,
                        type=str,
                        required=True,
                        help="词表文件")
    parser.add_argument("--model_config",
                        default=None,
                        type=str,
                        required=True,
                        help="模型配置文件json文件")
    parser.add_argument("--pre_train_model",
                        default=None,
                        type=str,
                        required=True,
                        help="预训练的模型文件,参数矩阵。如果存在就加载")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="输出结果的文件")

    # Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="输入到bert的最大长度,通常不应该超过512")
    parser.add_argument("--do_train", action='store_true', help="是否进行训练")
    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        help="训练集的batch_size")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="验证集的batch_size")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="梯度累计更新的步骤,用来弥补GPU过小的情况")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="学习率")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="权重衰减")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="最大的梯度更新")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="epoch 数目")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="让学习增加到1的步数,在warmup_steps后,再衰减到0")

    args = parser.parse_args()
    assert os.path.exists(args.data_dir)
    assert os.path.exists(args.vob_file)
    assert os.path.exists(args.model_config)
    assert os.path.exists(args.pre_train_model)

    args.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    #filename = './output/bert-sim.log',

    processor = SimProcessor()
    tokenizer_inputs = ()
    tokenizer_kwards = {
        'do_lower_case': False,
        'max_len': args.max_seq_length,
        'vocab_file': args.vob_file
    }
    tokenizer = BertTokenizer(*tokenizer_inputs, **tokenizer_kwards)

    train_dataset = load_and_cache_example(args, tokenizer, processor, 'train')
    eval_dataset = load_and_cache_example(args, tokenizer, processor, 'dev')
    test_dataset = load_and_cache_example(args, tokenizer, processor, 'test')
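    # the three splits above are built once and (judging by the helper's name)
    # cached by load_and_cache_example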

    bert_config = BertConfig.from_pretrained(args.model_config)
    bert_config.num_labels = len(processor.get_labels())
    model_kwargs = {'config': bert_config}

    model = BertForSequenceClassification.from_pretrained(
        args.pre_train_model, **model_kwargs)
    model = model.to(args.device)

    if args.do_train:
        trains(args, train_dataset, eval_dataset, model)
Example no. 26
import json
import pickle
from pprint import pprint

import torch
from transformers import AutoModel, AutoTokenizer, BertConfig

# CAIPDataset, DecoderWithLoss, EncoderDecoderWithLoss and beam_search come from
# the craftassist codebase; their imports were omitted in the original snippet

model = "python/craftassist/models/semantic_parser/ttad_bert_updated/caip_test_model.pth"
args_path = "python/craftassist/models/semantic_parser/ttad_bert_updated/caip_test_model_args.pk"
args = pickle.load(open(args_path, "rb"))

tokenizer = AutoTokenizer.from_pretrained(args.pretrained_encoder_name)
full_tree, tree_i2w = json.load(open(args.tree_voc_file))
dataset = CAIPDataset(tokenizer,
                      args,
                      prefix="",
                      full_tree_voc=(full_tree, tree_i2w))

enc_model = AutoModel.from_pretrained(args.pretrained_encoder_name)
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.is_decoder = True
bert_config.vocab_size = len(tree_i2w) + 8
bert_config.num_hidden_layers = args.num_decoder_layers
dec_with_loss = DecoderWithLoss(bert_config, args, tokenizer)
encoder_decoder = EncoderDecoderWithLoss(enc_model, dec_with_loss, args)
encoder_decoder.load_state_dict(torch.load(model))

encoder_decoder = encoder_decoder.cuda()
_ = encoder_decoder.eval()


def get_beam_tree(chat, noop_thres=0.95, beam_size=5, well_formed_pen=1e2):
    btr = beam_search(chat, encoder_decoder, tokenizer, dataset, beam_size,
                      well_formed_pen)
    if btr[0][0].get("dialogue_type",
Example no. 27
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of task is selected in [imdb, amazon]")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir.")
    parser.add_argument("--cache_dir",
                        default='../cache',
                        type=str,
                        help="The cache data dir.")
    parser.add_argument(
        '--model_type',
        default=None,
        type=str,
        required=True,
        help="Model type selected in [bert, xlnet, xlm, cnn, lstm]")
    parser.add_argument(
        '--model_name_or_path',
        default='bert-base-uncased',
        type=str,
        help="Shortcut name is selected in [bert-base-uncased, ]")
    parser.add_argument(
        '--output_dir',
        default='../out',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    parser.add_argument(
        "--skip",
        default=20,
        type=int,
        help="Evaluate one testing point every skip testing point.")
    parser.add_argument("--num_random_sample",
                        default=5000,
                        type=int,
                        help="The number of random samples of each texts.")
    parser.add_argument("--similarity_threshold",
                        default=0.8,
                        type=float,
                        help="The similarity constraint to be "
                        "considered as synonym.")
    parser.add_argument("--perturbation_constraint",
                        default=100,
                        type=int,
                        help="The maximum size of perturbation "
                        "set of each word.")
    parser.add_argument(
        "--mc_error",
        default=0.01,
        type=float,
        help="Monte Carlo Error based on concentration inequality.")
    parser.add_argument("--train_type",
                        default='normal',
                        type=str,
                        help="Train type is selected in [normal, rs].")
    # other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--ckpt",
                        default=-1,
                        type=int,
                        help="Which ckpt to load.")
    parser.add_argument("--seed",
                        default=42,
                        type=int,
                        help="Random seed for initializaiton.")

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("model type: %s, task name: %s, device: %s, train_type: %s",
                   args.model_type, args.task_name, device, args.train_type)

    set_seed(args)
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)

    task_class = processors[args.task_name]()
    label_list = task_class.get_labels()
    num_labels = len(label_list)
    args.num_labels = num_labels
    # load vocab.
    word2index = None
    if args.model_type != 'bert':
        with open(
                args.cache_dir + '/{}_vocab_train.pkl'.format(args.task_name),
                'rb') as f:
            vocab = pickle.load(f)
        index2word = vocab['index2word']
        word2index = vocab['word2index']
        word_mat = vocab['word_mat']
        args.word_mat = word_mat
        args.vocab_size = len(index2word)

    tokenizer = None
    if args.model_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                                  do_lower_case=True)
        args.vocab_size = tokenizer.vocab_size
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
    elif args.model_type == 'bow':
        args.embed_size = 300
        args.hidden_size = 100
        model = BOWModel(word_mat,
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
    elif args.model_type == 'decom_att':  # not used
        args.embed_size = 300
        args.hidden_size = 100
        model = DecompAttentionModel(word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
    elif args.model_type == 'esim':
        args.embed_size = 300
        args.hidden_size = 100
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=torch.tensor(word_mat).float(),
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
    else:
        raise ValueError('model type is not found!')
    model.to(device)

    similarity_threshold = args.similarity_threshold
    perturbation_constraint = args.perturbation_constraint

    perturbation_file = args.cache_dir + '/' + args.task_name + '_perturbation_constraint_pca' + str(
        similarity_threshold) + "_" + str(perturbation_constraint) + '.pkl'
    with open(perturbation_file, 'rb') as f:
        perturb = pickle.load(f)
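    # perturb maps each vocabulary word to its precomputed perturbation set:
    # synonyms within the similarity threshold, capped at the constraint size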

    # random smooth
    random_smooth = WordSubstitute(perturb)
    # generate randomized data
    randomize_testset(args, random_smooth, similarity_threshold,
                      perturbation_constraint)
    # calculate total variation
    calculate_tv_perturb(args, perturb)
    # Evaluation
    if args.ckpt < 0:
        checkpoints = glob.glob(
            args.output_dir + '/{}_{}_{}_checkpoint-*'.format(
                args.train_type, args.task_name, args.model_type))
        checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
        checkpoint = checkpoints[-1]
    else:
        checkpoint = os.path.join(
            args.output_dir,
            '{}_{}_{}_checkpoint-{}'.format(args.train_type, args.task_name,
                                            args.model_type, args.ckpt))
    print("Evaluation result, load model from {}".format(checkpoint))
    model = load(args, checkpoint)
    randomized_evaluate(args, model, tokenizer, word2index)
Example no. 28
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")

    args = parser.parse_args()

    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.device = device

    seed = 30004
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


    # prepare input
    import pickle
    with open('../1_1/distribution_dict1.pickle', 'rb') as f:
        distribution_dict1 = pickle.load(f)  
    with open('../1_1/distribution_dict2.pickle', 'rb') as f:
        distribution_dict2 = pickle.load(f)
    with open('../1_1/distribution_dict3.pickle', 'rb') as f:
        distribution_dict3 = pickle.load(f)
    with open('../1_1/distribution_dict4.pickle', 'rb') as f:
        distribution_dict4 = pickle.load(f)

    json_dir = '../../input/simplified-nq-train.jsonl'
    max_data = 9999999999

    id_list = []
    neg_id_list = []
    data_dict = {}
    neg_data_dict = {}
    with open(json_dir) as f:
        for n, line in tqdm(enumerate(f)):
            if n > max_data:
                break
            data = json.loads(line)

            is_pos = False
            annotations = data['annotations'][0]
            if annotations['yes_no_answer'] == 'YES':
                is_pos = True
            elif annotations['yes_no_answer'] == 'NO':
                is_pos = True
            elif annotations['short_answers']:
                is_pos = True
            elif annotations['long_answer']['candidate_index'] != -1:
                is_pos = True

            if is_pos and len(data['long_answer_candidates'])>1:
                data_id = data['example_id']
                id_list.append(data_id)

                # random sampling
                if data_id in distribution_dict1:
                    candidate_index_list = np.array(distribution_dict1[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict1[data_id]['prob_list']),1)
                elif data_id in distribution_dict2:
                    candidate_index_list = np.array(distribution_dict2[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict2[data_id]['prob_list']),1)
                elif data_id in distribution_dict3:
                    candidate_index_list = np.array(distribution_dict3[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict3[data_id]['prob_list']),1)
                else:
                    candidate_index_list = np.array(distribution_dict4[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict4[data_id]['prob_list']),1)
                prob_list /= sum(prob_list)
                negative_candidate_index = random_sample_negative_candidates(candidate_index_list, prob_list)

                #
                doc_words = data['document_text'].split()
                # negative
                candidate = data['long_answer_candidates'][negative_candidate_index]
                negative_candidate_words = doc_words[candidate['start_token']:candidate['end_token']]  
                negative_candidate_start = candidate['start_token']
                negative_candidate_end = candidate['end_token']
                # positive
                candidate = data['long_answer_candidates'][annotations['long_answer']['candidate_index']]
                positive_candidate_words = doc_words[candidate['start_token']:candidate['end_token']]
                positive_candidate_start = candidate['start_token']
                positive_candidate_end = candidate['end_token']

                # initialize data_dict
                data_dict[data_id] = {
                                      'question_text': data['question_text'],
                                      'annotations': data['annotations'],  
                                      'positive_text': positive_candidate_words,
                                      'positive_start': positive_candidate_start,  
                                      'positive_end': positive_candidate_end,   
                                      'negative_text': negative_candidate_words,       
                                      'negative_start': negative_candidate_start,  
                                      'negative_end': negative_candidate_end,               
                                     }

            elif (not is_pos) and len(data['long_answer_candidates'])>=1:
                data_id = data['example_id']
                neg_id_list.append(data_id)

                # random sampling
                if data_id in distribution_dict1:
                    candidate_index_list = np.array(distribution_dict1[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict1[data_id]['prob_list']),1)
                elif data_id in distribution_dict2:
                    candidate_index_list = np.array(distribution_dict2[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict2[data_id]['prob_list']),1)
                elif data_id in distribution_dict3:
                    candidate_index_list = np.array(distribution_dict3[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict3[data_id]['prob_list']),1)
                else:
                    candidate_index_list = np.array(distribution_dict4[data_id]['candidate_index_list'])
                    prob_list = np.power(np.array(distribution_dict4[data_id]['prob_list']),1)
                prob_list /= sum(prob_list)
                negative_candidate_index = random_sample_negative_candidates(candidate_index_list, prob_list)

                #
                doc_words = data['document_text'].split()
                # negative
                candidate = data['long_answer_candidates'][negative_candidate_index]
                negative_candidate_words = doc_words[candidate['start_token']:candidate['end_token']]  
                negative_candidate_start = candidate['start_token']
                negative_candidate_end = candidate['end_token']

                # initialize data_dict
                neg_data_dict[data_id] = {
                                          'question_text': data['question_text'],  
                                          'negative_text': negative_candidate_words,       
                                          'negative_start': negative_candidate_start,  
                                          'negative_end': negative_candidate_end,               
                                         }


    print(len(id_list), len(neg_id_list))
    random.shuffle(id_list)
    random.shuffle(neg_id_list)  # neg_id_list must be longer than id_list, otherwise the data generator will raise an error


    # hyperparameters
    max_seq_len = 360
    max_question_len = 64
    learning_rate = 0.000002
    batch_size = 3
    ep = 0


    # build model
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model_path = 'model/'
    config = BertConfig.from_pretrained(model_path)
    config.num_labels = 5
    config.vocab_size = 30531
    tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
    model = BertForQuestionAnswering.from_pretrained('weights/epoch2/', config=config)

    # add new tokens
    new_token_dict = {
                      '<P>':'qw1',
                      '<Table>':'qw2',
                      '<Tr>':'qw3',
                      '<Ul>':'qw4',
                      '<Ol>':'qw5',
                      '<Fl>':'qw6',
                      '<Li>':'qw7',
                      '<Dd>':'qw8',
                      '<Dt>':'qw9',
                     }
    new_token_list = [
                      'qw1',
                      'qw2',
                      'qw3',
                      'qw4',
                      'qw5',
                      'qw6',
                      'qw7',
                      'qw8',
                      'qw9',
                     ]

    num_added_toks = tokenizer.add_tokens(new_token_list)
    print('We have added', num_added_toks, 'tokens')
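    # grow the embedding matrix so the newly added placeholder tokens
    # (qw1-qw9, standing in for the HTML tags above) get embedding rows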
    model.resize_token_embeddings(len(tokenizer))

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
    model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )


    # training

    # iterator for training
    train_datagen = TFQADataset(id_list=id_list, neg_id_list=neg_id_list)
    train_sampler = DistributedSampler(train_datagen)
    train_collate = Collator(id_list=id_list,
                             neg_id_list=neg_id_list,
                             data_dict=data_dict, 
                             neg_data_dict=neg_data_dict,
                             new_token_dict=new_token_dict,
                             tokenizer=tokenizer, 
                             max_seq_len=max_seq_len, 
                             max_question_len=max_question_len)
    train_generator = DataLoader(dataset=train_datagen,
                                 sampler=train_sampler,
                                 collate_fn=train_collate,
                                 batch_size=batch_size,
                                 num_workers=3,
                                 pin_memory=True)

    # train
    losses1 = AverageMeter() # start
    losses2 = AverageMeter() # end
    losses3 = AverageMeter() # class
    accuracies1 = AverageMeter() # start
    accuracies2 = AverageMeter() # end
    accuracies3 = AverageMeter() # class
    model.train()
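    # joint objective: logits1/logits2 predict the answer start/end positions,
    # logits3 predicts the answer type; the three losses are simply summed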
    for j,(batch_input_ids, batch_attention_mask, batch_token_type_ids, batch_y_start, batch_y_end, batch_y) in enumerate(train_generator):
        batch_input_ids = batch_input_ids.cuda()
        batch_attention_mask = batch_attention_mask.cuda()
        batch_token_type_ids = batch_token_type_ids.cuda()
        labels1 = batch_y_start.cuda()
        labels2 = batch_y_end.cuda()
        labels3 = batch_y.cuda()

        logits1, logits2, logits3 = model(batch_input_ids, batch_attention_mask, batch_token_type_ids)
        y_true = (batch_y_start, batch_y_end, batch_y)
        loss1, loss2, loss3 = loss_fn((logits1, logits2, logits3), (labels1, labels2, labels3))
        loss = loss1+loss2+loss3
        acc1, n_position1 = get_position_accuracy(logits1, labels1)
        acc2, n_position2 = get_position_accuracy(logits2, labels2)
        acc3, n_position3 = get_position_accuracy(logits3, labels3)

        losses1.update(loss1.item(), n_position1)
        losses2.update(loss2.item(), n_position2)
        losses3.update(loss3.item(), n_position3)
        accuracies1.update(acc1, n_position1)
        accuracies2.update(acc2, n_position2)
        accuracies3.update(acc3, n_position3)

        optimizer.zero_grad()

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

    if args.local_rank == 0:
        print('epoch: {}, train_loss1: {}, train_loss2: {}, train_loss3: {}, train_acc1: {}, train_acc2: {}, train_acc3: {}'.format(ep,losses1.avg,losses2.avg,losses3.avg,accuracies1.avg,accuracies2.avg,accuracies3.avg), flush=True)

        out_dir = 'weights/epoch3/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        torch.save(model.module.state_dict(), out_dir+'pytorch_model.bin')
Example no. 29
import math
import random
import re
import time
from random import shuffle

import nltk
import numpy as np
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertForMaskedLM, BertConfig

from EncDecStructure import *

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
model_version = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_version, output_hidden_states=False)
model = BertForMaskedLM.from_pretrained(model_version, config=config)
model.train()
cuda = torch.cuda.is_available()
if cuda:
    model = model.cuda()

tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=model_version.endswith("uncased"))
CLS = '[CLS]'
SEP = '[SEP]'
MASK = '[MASK]'
mask_id = tokenizer.convert_tokens_to_ids([MASK])[0]
sep_id = tokenizer.convert_tokens_to_ids([SEP])[0]
cls_id = tokenizer.convert_tokens_to_ids([CLS])[0]
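# a second, frozen sentence-embedding model, presumably used for sentence-similarity scoring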
model2 = SentenceTransformer('bert-base-nli-mean-tokens')
model2.eval()
Example no. 30
    def train(self, train_path: str, valid_path: str, types_path: str,
              input_reader_cls: BaseInputReader):
        args = self.args
        train_label, valid_label = 'train', 'valid'

        self._logger.info("Datasets: %s, %s" % (train_path, valid_path))
        self._logger.info("Model type: %s" % args.model_type)

        # create log csv files
        self._init_train_logging(train_label)
        self._init_eval_logging(valid_label)

        # read datasets
        input_reader = input_reader_cls(types_path, self._tokenizer,
                                        args.neg_term_count,
                                        args.neg_relation_count,
                                        args.max_span_size, self._logger)
        input_reader.read({train_label: train_path, valid_label: valid_path})
        self._log_datasets(input_reader)

        train_dataset = input_reader.get_dataset(train_label)
        train_sample_count = train_dataset.document_count
        updates_epoch = train_sample_count // args.train_batch_size
        updates_total = updates_epoch * args.epochs

        validation_dataset = input_reader.get_dataset(valid_label)

        self._logger.info("Updates per epoch: %s" % updates_epoch)
        self._logger.info("Updates total: %s" % updates_total)

        # create model
        model_class = models.get_model(self.args.model_type)

        # load model
        config = BertConfig.from_pretrained(self.args.model_path,
                                            cache_dir=self.args.cache_path)
        util.check_version(config, model_class, self.args.model_path)

        config.model_version = model_class.VERSION
        model = model_class.from_pretrained(
            self.args.model_path,
            config=config,
            cls_token=self._tokenizer.convert_tokens_to_ids('[CLS]'),
            relation_types=input_reader.relation_type_count - 1,
            term_types=input_reader.term_type_count,
            max_pairs=self.args.max_pairs,
            prop_drop=self.args.prop_drop,
            size_embedding=self.args.size_embedding,
            freeze_transformer=self.args.freeze_transformer,
            args=self.args,
            beta=self.args.beta,
            alpha=self.args.alpha,
            sigma=self.args.sigma)

        model.to(self._device)

        # create optimizer
        optimizer_params = self._get_optimizer_params(model)
        optimizer = AdamW(optimizer_params,
                          lr=args.lr,
                          weight_decay=args.weight_decay,
                          correct_bias=False)
        # create scheduler
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.lr_warmup * updates_total,
            num_training_steps=updates_total)
        # create loss function
        rel_criterion = torch.nn.BCEWithLogitsLoss(reduction='none')
        term_criterion = torch.nn.CrossEntropyLoss(reduction='none')
        compute_loss = SynFueLoss(rel_criterion, term_criterion, model,
                                  optimizer, scheduler, args.max_grad_norm)

        # eval validation set
        if args.init_eval:
            self._eval(model, validation_dataset, input_reader, 0,
                       updates_epoch)

        # train
        best_f1 = 0.0
        for epoch in range(args.epochs):
            # train epoch
            self._train_epoch(model, compute_loss, optimizer, train_dataset,
                              updates_epoch, epoch)

            # eval validation sets
            if not args.final_eval or (epoch == args.epochs - 1):
                rel_nec_eval = self._eval(model, validation_dataset,
                                          input_reader, epoch + 1,
                                          updates_epoch)
                if best_f1 < rel_nec_eval[-1]:
                    # save final model
                    best_f1 = rel_nec_eval[-1]
                    extra = dict(epoch=args.epochs,
                                 updates_epoch=updates_epoch,
                                 epoch_iteration=0)
                    global_iteration = args.epochs * updates_epoch
                    self._save_model(self._save_path,
                                     model,
                                     self._tokenizer,
                                     global_iteration,
                                     optimizer=optimizer
                                     if self.args.save_optimizer else None,
                                     save_as_best=True,
                                     extra=extra,
                                     include_iteration=False)

        self._logger.info("Logged in: %s" % self._log_path)
        self._logger.info("Saved in: %s" % self._save_path)
        self._close_summary_writer()