def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
     config.num_labels = self.num_labels
     model = BertForSequenceClassification(config)
     model.eval()
     loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
     result = {
         "loss": loss,
         "logits": logits,
     }
     self.parent.assertListEqual(
         list(result["logits"].size()),
         [self.batch_size, self.num_labels])
     self.check_loss_output(result)
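
A minimal standalone sketch of the same forward pass, assuming the current transformers API (older pytorch_transformers releases return a plain tuple instead of an output object):

import torch
from transformers import BertConfig, BertForSequenceClassification

# Tiny randomly-initialised config so the sketch runs quickly on CPU
config = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, num_labels=2)
model = BertForSequenceClassification(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (4, 16))   # batch of 4 sequences, length 16
labels = torch.randint(0, config.num_labels, (4,))
outputs = model(input_ids, labels=labels)
print(outputs.loss.item(), outputs.logits.shape)           # logits shape: (4, 2)
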
def main():

    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
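
The three counting loops above are identical; a small helper, sketched here with torch's numel(), computes the same totals:

def count_trainable_parameters(model):
    # Sum the element counts of every parameter that receives gradients
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# e.g. print('bert-base-uncased:', count_trainable_parameters(bert_base_model))
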
Example #3
    def __init__(self,
                 checkpoint_path='logs/checkpoint.pth',
                 eval_report_path='logs/report.txt',
                 is_training=True,
                 train_path='train.csv',
                 test_path='test.csv',
                 log_dir='drive/My Drive/dm/logs/',
                 batch_size=16):

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.learning_rate = 5e-5
        self.num_epochs = 6
        self.batch_size = batch_size
        self.log_interval = 1000
        self.is_training = is_training
        self._plot_server = None
        self.log_dir = log_dir

        self.checkpoint_path = checkpoint_path
        self.best_model_path = checkpoint_path + '.best'
        self.eval_report = eval_report_path
        self.train_data_path = train_path
        self.test_data_path = test_path
        self.train_loader = QQPLoader(self.device, self.train_data_path, self.batch_size)
        self.test_loader = QQPLoader(self.device, self.test_data_path, self.batch_size)

        self.model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self._maybe_load_checkpoint()
        self.model.to(self.device)
Example #4
    def save_and_reload(self, path, model_name):

        torch.cuda.empty_cache()
        self.model.to('cpu')
        # Save a trained model
        model_to_save = self.model.module if hasattr(
            self.model,
            'module') else self.model  # Only save the model itself
        output_model_file = os.path.join(path, "{}.bin".format(model_name))
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        if self.multi_label:
            self.model = BertForMultiLabelSequenceClassification.from_pretrained(
                self.pretrained_model_path,
                num_labels=len(self.data.labels),
                state_dict=model_state_dict)
        else:
            self.model = BertForSequenceClassification.from_pretrained(
                self.pretrained_model_path,
                num_labels=len(self.data.labels),
                state_dict=model_state_dict)

        if self.is_fp16:
            self.model.half()
        torch.cuda.empty_cache()
        self.model.to(self.device)
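
For the same round trip, the library's save_pretrained/from_pretrained pair avoids handling the state dict by hand (a sketch; the directory name is illustrative):

model_dir = "saved_model"                    # hypothetical output directory
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(model_dir)     # writes config.json + pytorch_model.bin
model = BertForSequenceClassification.from_pretrained(model_dir)
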
Example #5
 def __init__(self, pretrain_path, max_length):
     nn.Module.__init__(self)
     # self.bert = BertModel.from_pretrained(pretrain_path)
     self.bert = BertForSequenceClassification.from_pretrained(
         pretrain_path, num_labels=2)
     self.max_length = max_length
     self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
def load_model(model_name: str, task_name: str):
    if model_name not in cache:
        cache[model_name] = dict()
    if task_name not in cache[model_name]:
        model_path = str(Path(f"models/{model_name}/{task_name}/"))
        model = BertForSequenceClassification.from_pretrained(model_path,
                                                              config=config)
        cache[model_name][task_name] = model
    return cache[model_name][task_name]
 def create_model(self):
     if self.model_configuration.bert_model in ("xlnet-base-cased",):
         model = XLNetForSequenceClassification.from_pretrained(self.model_configuration.bert_model,
                                                                num_labels=self.model_configuration.num_labels)
     else:
         model = BertForSequenceClassification.from_pretrained(self.model_configuration.bert_model,
                                                               num_labels=self.model_configuration.num_labels)
     model.to(device)
     return model
Example #8
 def __init__(self, pretrain_path, max_length): 
     nn.Module.__init__(self)
     # self.bert = BertModel.from_pretrained(pretrain_path)
     self.bert = BertForSequenceClassification.from_pretrained(
             pretrain_path,
             num_labels=2)
     self.max_length = max_length
     self.tokenizer = BertTokenizer.from_pretrained(os.path.join(
         pretrain_path, 'bert_vocab.txt'))
     self.modelName = 'Bert'
Example #9
def main(text):
    tokenizer = BertTokenizer.from_pretrained('./', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('./')
    model.to(device)
    texts = []
    preds = []
    texts.append("[CLS] " + text[:509] + " [SEP]")
    tokenized_texts = [tokenizer.tokenize(sent) for sent in texts]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(
        input_ids,
        maxlen=100,
        dtype="long",
        truncating="post",
        padding="post"
    )
    attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
  
    prediction_data = TensorDataset(
        prediction_inputs,
        prediction_masks
    )

    prediction_dataloader = DataLoader(
      prediction_data, 
      sampler=SequentialSampler(prediction_data),
      batch_size=1
    )
    model.eval()
    preds = []

    for batch in prediction_dataloader:
        # Move the batch to the GPU for computation
        batch = tuple(t.to(device) for t in batch)
    
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
    
        # With .no_grad() the model does not compute or store gradients.
        # This speeds up label prediction for the test data.
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move the logits to the CPU for further processing
        logits = logits[0].detach().cpu().numpy()

        # Save the predicted classes
        batch_preds = np.argmax(logits, axis=1) 
        preds.extend(batch_preds)
    return preds
Example #10
def get_model(model, pretrained, resume, n_classes, dataset, log_dir):
    if resume:
        model = torch.load(os.path.join(log_dir, "last_model.pth"))
        d = train_data.input_size()[0]
    elif model_attributes[model]["feature_type"] in (
            "precomputed",
            "raw_flattened",
    ):
        assert pretrained
        # Load precomputed features
        d = train_data.input_size()[0]
        model = nn.Linear(d, n_classes)
        model.has_aux_logits = False
    elif model == "resnet50":
        model = torchvision.models.resnet50(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif model == "resnet34":
        model = torchvision.models.resnet34(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif model == "wideresnet50":
        model = torchvision.models.wide_resnet50_2(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif model.startswith('bert'):
        if dataset == "MultiNLI":

            assert dataset == "MultiNLI"

            from pytorch_transformers import BertConfig, BertForSequenceClassification

            config_class = BertConfig
            model_class = BertForSequenceClassification

            config = config_class.from_pretrained("bert-base-uncased",
                                                  num_labels=3,
                                                  finetuning_task="mnli")
            model = model_class.from_pretrained("bert-base-uncased",
                                                from_tf=False,
                                                config=config)
        elif dataset == "jigsaw":
            from transformers import BertForSequenceClassification
            model = BertForSequenceClassification.from_pretrained(
                model, num_labels=n_classes)
            print(f'n_classes = {n_classes}')
        else:
            raise NotImplementedError
    else:
        raise ValueError(f"{model} Model not recognized.")

    return model
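
A usage sketch under the snippet's own signature (the argument values are illustrative only):

model = get_model(model="bert-base-uncased",
                  pretrained=True,
                  resume=False,
                  n_classes=3,
                  dataset="MultiNLI",
                  log_dir="./logs")
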
Example #11
def train(
    root=True,
    binary=False,
    bert="bert-large-uncased",
    epochs=30,
    batch_size=8,
    save=False,
):
    trainset = SSTDataset("train", root=root, binary=binary)
    devset = SSTDataset("dev", root=root, binary=binary)
    testset = SSTDataset("test", root=root, binary=binary)

    config = BertConfig.from_pretrained(bert)
    if not binary:
        config.num_labels = 5
    model = BertForSequenceClassification.from_pretrained(bert, config=config)

    model = model.to(device)
    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(1, epochs):
        train_loss, train_acc = train_one_epoch(model,
                                                lossfn,
                                                optimizer,
                                                trainset,
                                                batch_size=batch_size)
        val_loss, val_acc = evaluate_one_epoch(model,
                                               lossfn,
                                               optimizer,
                                               devset,
                                               batch_size=batch_size)
        test_loss, test_acc = evaluate_one_epoch(model,
                                                 lossfn,
                                                 optimizer,
                                                 testset,
                                                 batch_size=batch_size)
        logger.info(f"epoch={epoch}")
        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        if save:
            label = "binary" if binary else "fine"
            nodes = "root" if root else "all"
            torch.save(model, f"{bert}__{nodes}__{label}__e{epoch}.pickle")

    logger.success("Done!")
Example #12
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    ##Load Models
    config = BertConfig.from_pretrained(args.config_name)
    config.num_labels = 1
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)
    # num_labels is already carried by `config`, so it must not be passed again here
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, config=config)

    model.to(args.device)
    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
Example #13
def predict_bert(text):
    import torch
    from keras.preprocessing.sequence import pad_sequences
    import pandas as pd
    import numpy as np
    from pytorch_transformers import BertTokenizer, BertForSequenceClassification

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == 'cpu':
        print('cpu')
    else:
        n_gpu = torch.cuda.device_count()
        print(torch.cuda.get_device_name(0))

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                          num_labels=2)
    model.load_state_dict(torch.load(DIR_DATA_MODELS / 'BERT_model.h5'))

    sentences = '[CLS] ' + str(text) + ' [SEP]'

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)

    tok = tokenizer.tokenize(sentences)

    input_ids = tokenizer.convert_tokens_to_ids(tok)
    input_ids = pad_sequences([input_ids, ''],
                              maxlen=100,
                              dtype="long",
                              truncating="post",
                              padding="post")

    attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

    train_inputs = torch.tensor(input_ids[0]).long().to(device)
    train_masks = torch.tensor(attention_masks[0]).long().to(device)

    train_inputs = train_inputs.unsqueeze_(0)
    train_masks = train_masks.unsqueeze_(0)

    model.to(device)
    model.eval()  # disable dropout for inference

    with torch.no_grad():
        logits = model(train_inputs,
                       token_type_ids=None,
                       attention_mask=train_masks)

    return logits
    def forward(self, captions, position_ids, region_features, attention_mask):

        # batch_size = region_features.shape[0]

        embeddings = self.embedding_layer(region_features, captions,
                                          position_ids)

        # print(self.classifier);exit(0)

        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        attention_mask = (1.0 - attention_mask) * -10000.0

        model_checkpoint = "distilbert-base-uncased"
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2)

        print(model)
        exit(0)

        model = BertForSequenceClassification(BertConfig())
        encoder = model.bert.encoder
        pooler = model.bert.pooler
        dropout = model.dropout
        classifier = model.classifier

        output = encoder(embeddings, attention_mask, head_mask=self.head_mask)
        # print(type(output))
        # print(len(output));exit(0)
        output = pooler(output[0])
        output = dropout(output)
        output = classifier(output)

        print(output.shape)
        print(output)
        exit(0)

        hidden_states = self.encoder(embeddings, attention_mask,
                                     self.head_mask)[0]

        output = self.classifier(hidden_states)

        print(hidden_states.shape, output.shape)
        exit(0)

        return self.classifier(hidden_states)
Example #15
def test(ckpt):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=5)
    load_checkpoint(model, ckpt)
    model.eval()

    #  while(1):
    #  sentence = input("Enter Sentence: ")
    sentence = sys.argv[1]
    encode = tokenizer.encode(sentence, add_special_tokens=True)
    padded = [encode + [0] * (512 - len(encode))]
    sentence = torch.tensor(padded)
    label = torch.tensor([0])

    results = model(sentence, labels=label)
    _softmax = F.softmax(results[1], dim=1)  # results = (loss, logits)
    pred = torch.argmax(_softmax).item()
    print(f"{pred+1}")
Example #16
    def _load_dnli_model(self):
        # Download pretrained weight
        dnli_model_fname = os.path.join(self.opt['datapath'], 'dnli_model.bin')
        if not os.path.exists(dnli_model_fname):
            print(f"[ Download pretrained dnli model params to {dnli_model_fname}]")
            download_from_google_drive(
                '1Qawz1pMcV0aGLVYzOgpHPgG5vLSKPOJ1',
                dnli_model_fname
            )

        # Load pretrained weight
        print(f"[ Load pretrained dnli model from {dnli_model_fname}]")
        model_state_dict = torch.load(dnli_model_fname)
        dnli_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', state_dict=model_state_dict, num_labels=3)
        if self.use_cuda:
            dnli_model.cuda()
        dnli_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        return dnli_model, dnli_tokenizer
def predict_model(args, save=True):
    dataset_name = args.dataset_name[0]
    model_type = args.model_type
    test_dataset = path_tensor_dataset / f"{model_type}_{dataset_name}.pkl"
    test_dataset = pickle_load(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 pin_memory=True,
                                 num_workers=4,
                                 shuffle=False)

    model_dir = path_model / f"{args.model_type}_{args.model_name}/checkpoint_epoch{args.epoch_num}"
    if model_type == "bert":
        model = BertForSequenceClassification.from_pretrained(model_dir,
                                                              num_labels=126)
    elif model_type == "xlnet":
        model = XLNetForSequenceClassification.from_pretrained(model_dir,
                                                               num_labels=126)
    else:
        raise ValueError("")
    model.zero_grad()
    model.eval()
    model = model.cuda(args.gpu_device_ids[0])
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=args.gpu_device_ids)

    res = []
    for batch in tqdm(test_dataloader, desc="Iteration"):
        batch = tuple(x.cuda(args.gpu_device_ids[0]) for x in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2]
        }
        with torch.no_grad():
            outputs = model(**inputs)[0]
        res.append(outputs)
    res = torch.cat(res, 0).cpu()
    if save:
        filename = f"{model_type}_{dataset_name}_epoch{args.epoch_num}_res.pkl"
        pickle_save(res, path_model_output / filename)
    return res
Example #18
    def __init__(self,
                 device=torch.device(
                     "cuda" if torch.cuda.is_available() else "cpu"),
                 is_paralleled=False,
                 BATCH_SIZE=128,
                 CPU_COUNT=1,
                 CHUNKSIZE=1):
        self.device = device if isinstance(
            device, torch.device) else torch.device(device)
        model_path = os.path.join(os.path.dirname(__file__),
                                  "support_model.bin")
        self.model_type = 'supportr'

        if not os.path.isfile(model_path):
            logger.info(
                f'Model {self.model_type} does not exist at {model_path}. Try to download it now.'
            )
            model = 'support_model'
            fetch_pretrained_model(model, model_path)

        if self.device.type == "cpu":
            model_state_dict = torch.load(model_path,
                                          map_location=self.device.type)
        else:
            model_state_dict = torch.load(model_path)
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-cased', state_dict=model_state_dict, num_labels=1)
        if is_paralleled:
            if self.device.type == "cpu":
                print("Data parallel is not available with cpus")
            else:
                self.model = torch.nn.DataParallel(self.model)

        self.model.to(device)
        self.model.eval()
        self.batch_size = BATCH_SIZE
        self.cpu_count = CPU_COUNT
        self.chunksize = CHUNKSIZE
Example #19
import random

import numpy as np
import pandas as pd
import torch
from joblib import dump
from pytorch_transformers import BertForSequenceClassification  # or `from transformers import ...`, depending on the environment
from utils.dataset import MLBERT

seed = 500
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Network Hyperparameters

num_classes = 1588
batch_size = 128
model_type = 'bert-base-multilingual-cased'
model = BertForSequenceClassification.from_pretrained('model/')
checkpoint = torch.load(
    'model/checkpoints/metric_learning/bert/model_5epochs.pt',
    map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda else "cpu")

if cuda:
    print("Cuda available")
    model.cuda()

kwargs = {'num_workers': 0, 'pin_memory': True} if cuda else {}

ml_test = MLBERT(train=False, file='processed_data/ml_test_bert.pt')
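
A plausible continuation of the truncated snippet above (a sketch; MLBERT is assumed to be a standard torch Dataset):

test_loader = torch.utils.data.DataLoader(ml_test,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          **kwargs)
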
Example #20
def main():
    torch.manual_seed(42)

    # Random
    #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 128, 'learning_rate': 0.01, 'num_epochs': 5, 'num_layers': 2, 'oversample': False, 'soft_labels': False}
    # Glove
    params = {
        'batch_size': 32,
        'dropout': 0,
        'hidden_dim': 128,
        'learning_rate': 0.001,
        'num_epochs': 5,
        'num_layers': 2,
        'oversample': False,
        'soft_labels': False
    }
    # Random
    #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 256, 'learning_rate': 0.0001, 'num_epochs': 5, 'num_layers': 3, 'oversample': False, 'soft_labels': False}

    #some params
    experiment_number = 1
    test_percentage = 0.1
    val_percentage = 0.2
    batch_size = params["batch_size"]
    num_epochs = 5  #params["num_epochs"]
    dropout = params["dropout"]
    embedding_dim = 300
    model_name = "CNN"  #'Bert' #"CNN" #"LSTM"
    unsupervised = True
    embedding = "Glove"  #"Random" ##"Glove" # "Both" #
    soft_labels = False
    combine = embedding == "Both"

    # LSTM parameters
    if model_name == "LSTM":
        hidden_dim = params["hidden_dim"]
        num_layers = params["num_layers"]

    # Bert parameter
    num_warmup_steps = 100
    num_total_steps = 1000
    if model_name == "Bert":
        embedding = "None"
    if embedding == "Both":
        combine = True
        embedding = "Random"
    else:
        combine = False
    learning_rate = params["learning_rate"]  #5e-5, 3e-5, 2e-5
    oversample_bool = False
    weighted_loss = True
    # load data
    dataset = Dataset("../data/cleaned_tweets_orig.csv",
                      use_embedding=embedding,
                      embedd_dim=embedding_dim,
                      combine=combine,
                      for_bert=(model_name == "Bert"))

    #dataset.oversample()
    train_data, val_test_data = split_dataset(dataset,
                                              test_percentage + val_percentage)
    val_data, test_data = split_dataset(
        val_test_data, test_percentage / (test_percentage + val_percentage))

    # print(len(train_data))
    #save_data(train_data, 'train')
    #save_data(test_data, 'test')

    #define loaders
    if oversample_bool:
        weights, targets = get_loss_weights(train_data, return_targets=True)
        class_sample_count = [
            1024 / 20, 13426, 2898 / 2
        ]  # dataset has 10 class-1 samples, 1 class-2 samples, etc.
        oversample_weights = 1 / torch.Tensor(class_sample_count)
        oversample_weights = oversample_weights[targets]
        # oversample_weights = torch.tensor([0.9414, 0.2242, 0.8344]) #torch.ones((3))-
        sampler = torch.utils.data.sampler.WeightedRandomSampler(
            oversample_weights, len(oversample_weights))
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   collate_fn=my_collate,
                                                   sampler=sampler)
    else:
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   collate_fn=my_collate)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=batch_size,
                                             collate_fn=my_collate)

    #define model
    if model_name == "CNN":
        vocab_size = len(dataset.vocab)
        model = CNN(vocab_size, embedding_dim, combine=combine)
    elif model_name == "LSTM":
        vocab_size = len(dataset.vocab)
        model = LSTM(vocab_size,
                     embedding_dim,
                     batch_size=batch_size,
                     hidden_dim=hidden_dim,
                     lstm_num_layers=num_layers,
                     combine=combine,
                     dropout=dropout)

    elif model_name == "Bert":
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=3)
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   collate_fn=bert_collate)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=batch_size,
                                                 collate_fn=bert_collate)

    #device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #LOSS : weighted cross entropy loss, by class counts of other classess
    if weighted_loss:
        weights = torch.tensor([0.9414, 0.2242, 0.8344], device=device)
    else:
        weights = torch.ones(3, device=device)
    #weights = torch.tensor([1.0, 1.0, 1.0], device = device) #get_loss_weights(train_data).to(device) # not to run again
    criterion = nn.CrossEntropyLoss(weight=weights)
    if soft_labels:
        criterion = weighted_soft_cross_entropy
    #latent model
    if unsupervised:
        vocab_size = len(dataset.vocab)
        criterion = nn.CrossEntropyLoss(weight=weights, reduction='none')
        model = Rationalisation_model(vocab_size,
                                      embedding_dim=embedding_dim,
                                      model=model_name,
                                      batch_size=batch_size,
                                      combine=combine,
                                      criterion=criterion)

    if not model_name == "Bert":
        model.embedding.weight.data.copy_(dataset.vocab.vectors)
        if combine:
            model.embedding_glove.weight.data.copy_(dataset.glove.vectors)

    #model to device
    model.to(device)

    #optimiser
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if model_name == "Bert":
        optimizer = AdamW(model.parameters(),
                          lr=learning_rate,
                          correct_bias=False)
        # Linear scheduler for adaptive lr
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=num_warmup_steps,
                                         t_total=num_total_steps)
    else:
        scheduler = None

    plot_log = defaultdict(list)
    for epoch in range(num_epochs):
        #train and validate
        epoch_loss, epoch_acc = train_epoch(model,
                                            train_loader,
                                            optimizer,
                                            criterion,
                                            device,
                                            soft_labels=soft_labels,
                                            weights=weights,
                                            scheduler=scheduler,
                                            unsupervised=unsupervised)
        val_loss, val_acc = evaluate_epoch(model,
                                           val_loader,
                                           criterion,
                                           device,
                                           soft_labels=soft_labels,
                                           weights=weights,
                                           unsupervised=unsupervised)
        #save for plotting
        for name, point in zip(
            ["train_loss", "train_accuracy", "val_loss", "val_accuracy"],
            [epoch_loss, epoch_acc, val_loss, val_acc]):
            plot_log[name].append(point)
        #realtime feel
        print(f'Epoch: {epoch+1}')
        print(
            f'\tTrain Loss: {epoch_loss:.5f} | Train Acc: {epoch_acc*100:.2f}%'
        )
        print(f'\t Val. Loss: {val_loss:.5f} |  Val. Acc: {val_acc*100:.2f}%')
    sample_sentences_and_z(model, train_loader, device, dataset.vocab)
    #save plot
    results_directory = f'plots/{experiment_number}'
    os.makedirs(results_directory, exist_ok=True)
    for name, data in plot_log.items():
        save_plot(data, name, results_directory)
    #save model
    torch.save(model, os.path.join(results_directory, 'model_cnn.pth'))
    #confusion matrix and all that fun
    loss, acc, predictions, ground_truth = evaluate_epoch(
        model,
        val_loader,
        criterion,
        device,
        is_final=True,
        soft_labels=soft_labels,
        weights=weights,
        unsupervised=unsupervised)
    conf_matrix = confusion_matrix(ground_truth, predictions)
    class_report = classification_report(ground_truth, predictions)
    print('\nFinal Loss and Accuracy\n----------------\n')
    print(f'\t Val. Loss: {loss:.5f} |  Val. Acc: {acc*100:.2f}%')
    print('\nCONFUSION MATRIX\n----------------\n')
    print(conf_matrix)
    print('\nCLASSIFICATION REPORT\n----------------------\n')
    print(class_report)

    plot_confusion_matrix(ground_truth,
                          predictions,
                          classes=["Hate speech", "Offensive", "Neither"],
                          normalize=False,
                          title='Confusion matrix')
    plt.show()
Example #21
            else:
                input_ids = torch.tensor(tokenizer.encode(s),
                                         device=device).unsqueeze(
                                             0)  # Batch size 1
                results.append(
                    clf.forward(input_ids)[0].detach().cpu().numpy().flatten())
        return np.array(results).reshape(-1, 2)


print('loading models and data...')
default = 'bert-base-uncased'
mdir = '/scratch/users/vision/chandan/pacmed/glue/SST-2-3epoch'  # '/scratch/users/vision/chandan/pacmed/glue/SST-2-middle/'
device = 'cpu'

tokenizer = BertTokenizer.from_pretrained(mdir)
clf = BertForSequenceClassification.from_pretrained(mdir).eval().to(device)
masked_predictor = BertForMaskedLM.from_pretrained(default).eval().to(device)

lines = open('data/stsa.binary.test', 'r').read()
lines = [line for line in lines.split('\n') if line != '']
classes = [int(line[0]) for line in lines]
reviews = [line[2:] for line in lines]

num_reviews = 1821  # 1821
save_freq = 1
scores_iid = {}
scores_conditional = {}
scores_remove = {}
scores_lime = {}

# loop over reviews
Example #22
        return text, label


rate_train_dataset = RateDataset(train_df)
print(f"Train dataset: {len(rate_train_dataset)}")
itr_num = len(rate_train_dataset)
train_loader = DataLoader(rate_train_dataset,
                          batch_size=16,
                          shuffle=True,
                          num_workers=2)

device = torch.device("cuda:7")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
#config = BertConfig.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=5)
#model = BertForMultiLabelSequenceClassification(config)
model.to(device)

optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
s_itr = 10000
epochs = 5
total_loss = 0
total_len = 0
total_correct = 0


def save_checkpoint(model, save_pth):
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    arg_name_value_pairs = deepcopy(locals())
    prefix = time.strftime('%Y%m%d_%H%M')
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')
    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
        global PAD, PAD_t, CLS_t, SEP_t
        PAD_t = '<pad>'
        CLS_t = '<cls>'
        SEP_t = '<sep>'
        PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    logger.info(f'padding token is {PAD}')
    processed_train = preprocess(
        os.path.join(data_path, 'Train_DataSet.csv'),
        os.path.join(data_path,
                     'Train_DataSet_Label.csv'), tokenizer, max_length,
        input_pattern, clean_method, max_title_rate, content_head_rate, logger)
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate,
                                content_head_rate, logger)
    logger.info('seed everything and create model')
    seed_everything(seed)
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = [
            'transformer.mask_emb', 'transformer.word_embedding.weight'
        ]
        model_layer_names += [
            f'transformer.layer.{i}.' for i in range(model.config.n_layer)
        ]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += [
            'bert.encoder.layer.{}.'.format(i)
            for i in range(model.config.num_hidden_layers)
        ]
        model_layer_names += ['bert.pooler', 'classifier']
    optimizer = AdamW([{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and not any(nd in n for nd in no_decay)
        ],
        'lr':
        lr * (lr_decay_in_layers**i),
        'weight_decay':
        weight_decay * (wd_decay_in_layers**i)
    } for i, layer_name in enumerate(model_layer_names[::-1])] + [{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and any(nd in n for nd in no_decay)
        ],
        'lr':
        lr * (lr_decay_in_layers**i),
        'weight_decay':
        .0
    } for i, layer_name in enumerate(model_layer_names[::-1])])
    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer,
                                            warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer,
                                              warmup_steps=warmup_rate)
    else:
        raise ValueError

    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())

    test_iter = get_data_iter(processed_test,
                              batch_size * 4,
                              collect_test_func,
                              shuffle=False)
    pred = np.zeros((len(processed_test), 3))
    val_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True,
                  random_state=seed).split(processed_train)):
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer,
                                                warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmup_rate)
        else:
            raise ValueError
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4,
                                 collect_func,
                                 shuffle=False)

        best_model, best_score = training(model=model,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter,
                                          val_iter=val_iter,
                                          total_step=total_step,
                                          tokenizer=tokenizer,
                                          usegpu=usegpu,
                                          eval_every=eval_every,
                                          logger=logger,
                                          early_stop=early_stop,
                                          fold_idx=fold_idx)
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        pred += predict(model, test_iter, usegpu)
    logger.info(f'average: {np.mean(val_scores):.6f}')
    pred = pred / n_fold
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
Example #24
def main():
    #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    torch.set_num_threads(1)
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on",
                        default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    #processors = FormationProcessor

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = FormationProcessor()
    tokenizer = BertTokenizer.from_pretrained(
        '/home/ypd-19-2/SpERT/model/bertbase-20210122T060007Z-001/bertbase')

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples()
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model,
                                        num_labels=1,
                                        finetuning_task=args.task_name)
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          from_tf=False,
                                                          config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    #label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_mask,
                             labels=label_ids)[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        #label_map = {i : label for i, label in enumerate(label_list,1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": 1
        }
        json.dump(
            model_config,
            open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        loss_test = nn.L1Loss()
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples()
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples()
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.float)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids=input_ids,
                               token_type_ids=segment_ids,
                               attention_mask=input_mask)[0]

            #logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
            input_mask = input_mask.to('cpu').numpy()
            # align shapes: with num_labels == 1 the logits are (batch, 1) while label_ids are (batch,)
            batch_loss = loss_test(logits.view(-1), label_ids.view(-1))
            eval_loss += batch_loss.item()
            y_true.append(label_ids)
            y_pred.append(logits)

        print('eval_loss')
        print(eval_loss / len(eval_dataloader))
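        # y_true / y_pred are collected above but never reported; a minimal sketch
        # (an addition, assuming an aggregate mean absolute error is wanted):
        y_true_all = torch.cat(y_true).detach().cpu().view(-1)
        y_pred_all = torch.cat(y_pred).detach().cpu().view(-1)
        print('mean absolute error:',
              torch.mean(torch.abs(y_pred_all - y_true_all)).item())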
Example #25
0
import torch
from pytorch_transformers import BertForSequenceClassification, BertTokenizer
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def tokenization_step(input_seq, tok=tokenizer, pad=True):
    # Map the text to WordPiece ids and add the special [CLS] (101) and [SEP] (102)
    # ids; 510 = 512 - 2 leaves room for those two tokens (no truncation is done,
    # so the input must fit in 510 WordPiece tokens).
    tokenized_mapped = tok.convert_tokens_to_ids(tok.tokenize(input_seq))
    essay_size = len(tokenized_mapped)
    if pad:
        return (torch.LongTensor(np.array([101] + tokenized_mapped + [102] + [0] * (510 - essay_size)).reshape(1, -1)),
                torch.LongTensor(np.array([1] * (essay_size + 2) + [0] * (510 - essay_size)).reshape(1, -1)))
    else:
        # reshape(1, -1) so the unpadded ids share the (1, seq_len) layout of the mask
        return (torch.LongTensor(np.array([101] + tokenized_mapped + [102]).reshape(1, -1)),
                torch.LongTensor(np.array([1] * (essay_size + 2)).reshape(1, -1)))


x, mask = tokenization_step("Hello World", pad=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
y = model(x, attention_mask=mask)
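
# The pytorch_transformers forward call returns a tuple; a minimal sketch of
# inspecting the (still untrained) classification head's output:
logits = y[0]                          # shape (1, num_labels), 2 by default
probs = torch.softmax(logits, dim=-1)  # convert the raw scores to probabilities
print(probs)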
Example #26
0
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))

    if compute_acc:
        print("correct:", correct, "total:", total)
        acc = correct / total
        return predictions, acc
    return predictions

# 4. Train the downstream-task model
# Define the model
device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3).to(device)
# The classification head is randomly initialized; run predictions before any fine-tuning
print("*"*50)
print(model.config)
print("*"*50)
_, acc = get_predictions(model, train_loader, compute_acc=True)
print("classification acc:", acc)

# Inspect parameter counts and collect the parameters that require gradient updates
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)
print("Number of parameters in the full classification model:", sum(p.numel() for p in model_params))
print("Number of parameters in the linear classifier head:", sum(p.numel() for p in clf_params))
Example #27
0
        elif use_postag:
            idx_matches = svsm.match_senses(idx_vec, None, postags[idx], topn=None)

        else:
            idx_matches = svsm.match_senses(idx_vec, None, None, topn=1)

        matches.append(idx_matches)
         

    return matches, word_ind, tokens

BERT_BASE_DIR = 'bert_torch_model/'    
vec_path = 'lmms_1024.bert-large-cased.npz'

model = BertForSequenceClassification.from_pretrained(BERT_BASE_DIR, output_hidden_states=True)
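# output_hidden_states=True makes the forward pass return the hidden states of every
# layer as well, which the sense-matching code above presumably uses to build the
# contextual token vectors looked up in SensesVSM.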
tokenizer = BertTokenizer.from_pretrained(BERT_BASE_DIR, do_lower_case=True)

# To load bert model in GPU

#model = model.cuda()

model.eval()

senses_vsm = SensesVSM(vec_path, normalize=True)

@app.route("/synset_processing", methods=['POST'])

def predict_synset():

Example #28
0
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )

    parser.add_argument(
        "--train_data",
        default=None,
        type=str,
        required=True,
        help="The input training data file name."
        " Should be the .tsv file (or other data file) for the task.")

    parser.add_argument(
        "--val_data",
        default=None,
        type=str,
        required=True,
        help="The input validation data file name."
        " Should be the .tsv file (or other data file) for the task.")

    parser.add_argument(
        "--test_data",
        default=None,
        type=str,
        required=True,
        help="The input test data file name."
        " Should be the .tsv file (or other data file) for the task.")

    parser.add_argument("--log_path",
                        default=None,
                        type=str,
                        required=True,
                        help="The log file path.")

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    parser.add_argument("--save_model",
                        default=False,
                        action='store_true',
                        help="Whether to save the model.")

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    parser.add_argument(
        "--embed_mode",
        default=None,
        type=str,
        required=True,
        help="The embedding type selected in the list: all, note, chunk, no.")

    parser.add_argument("--c",
                        type=float,
                        required=True,
                        help="The parameter c for scaled adjusted mean method")

    parser.add_argument("--task_name",
                        default="BERT_mortality_am",
                        type=str,
                        required=True,
                        help="The name of the task.")

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--max_chunk_num",
        default=64,
        type=int,
        help=
        "The maximum total input chunk numbers after WordPiece tokenization.")
    parser.add_argument("--train_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.0,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.save_model:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    LOG_PATH = args.log_path
    MAX_LEN = args.max_seq_length

    config = DotMap()
    config.hidden_dropout_prob = 0.1
    config.layer_norm_eps = 1e-12
    config.initializer_range = 0.02
    config.max_note_position_embedding = 1000
    config.max_chunk_position_embedding = 1000
    config.embed_mode = args.embed_mode
    config.hidden_size = 768

    config.task_name = args.task_name

    write_log(
        ("New Job Start! \n"
         "Data directory: {}, Directory Code: {}, Save Model: {}\n"
         "Output_dir: {}, Task Name: {}, embed_mode: {}\n"
         "max_seq_length: {},  max_chunk_num: {}\n"
         "train_batch_size: {}, eval_batch_size: {}\n"
         "learning_rate: {}, warmup_proportion: {}\n"
         "num_train_epochs: {}, seed: {}, gradient_accumulation_steps: {}"
         ).format(args.data_dir,
                  args.data_dir.split('_')[-1], args.save_model,
                  args.output_dir, config.task_name, config.embed_mode,
                  args.max_seq_length, args.max_chunk_num,
                  args.train_batch_size, args.eval_batch_size,
                  args.learning_rate, args.warmup_proportion,
                  args.num_train_epochs, args.seed,
                  args.gradient_accumulation_steps), LOG_PATH)

    content = "config setting: \n"
    for k, v in config.items():
        content += "{}: {} \n".format(k, v)
    write_log(content, LOG_PATH)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    write_log("Number of GPU is {}".format(n_gpu), LOG_PATH)
    for i in range(n_gpu):
        write_log(("Device Name: {},"
                   "Device Capability: {}").format(
                       torch.cuda.get_device_name(i),
                       torch.cuda.get_device_capability(i)), LOG_PATH)

    train_file_path = os.path.join(args.data_dir, args.train_data)
    val_file_path = os.path.join(args.data_dir, args.val_data)
    test_file_path = os.path.join(args.data_dir, args.test_data)
    train_df = pd.read_csv(train_file_path)
    val_df = pd.read_csv(val_file_path)
    test_df = pd.read_csv(test_file_path)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=True)

    write_log("Tokenize Start!", LOG_PATH)
    train_labels, train_inputs, train_masks, train_note_ids = Tokenize_with_note_id(
        train_df, MAX_LEN, tokenizer)
    validation_labels, validation_inputs, validation_masks, validation_note_ids = Tokenize_with_note_id(
        val_df, MAX_LEN, tokenizer)
    test_labels, test_inputs, test_masks, test_note_ids = Tokenize_with_note_id(
        test_df, MAX_LEN, tokenizer)
    write_log("Tokenize Finished!", LOG_PATH)
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    test_inputs = torch.tensor(test_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    test_labels = torch.tensor(test_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)
    test_masks = torch.tensor(test_masks)
    write_log(("train dataset size is %d,\n"
               "validation dataset size is %d,\n"
               "test dataset size is %d") %
              (len(train_inputs), len(validation_inputs), len(test_inputs)),
              LOG_PATH)

    (train_labels, train_inputs, train_masks, train_ids, train_note_ids,
     train_chunk_ids) = concat_by_id_list_with_note_chunk_id(
         train_df, train_labels, train_inputs, train_masks, train_note_ids,
         MAX_LEN)
    (validation_labels, validation_inputs, validation_masks, validation_ids,
     validation_note_ids,
     validation_chunk_ids) = concat_by_id_list_with_note_chunk_id(
         val_df, validation_labels, validation_inputs, validation_masks,
         validation_note_ids, MAX_LEN)
    (test_labels, test_inputs, test_masks, test_ids, test_note_ids,
     test_chunk_ids) = concat_by_id_list_with_note_chunk_id(
         test_df, test_labels, test_inputs, test_masks, test_note_ids, MAX_LEN)

    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          num_labels=2)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
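    # Biases and LayerNorm parameters ('gamma'/'beta' in older BERT ports) are kept
    # out of weight decay above, following the usual BERT fine-tuning recipe.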
    num_train_steps = int(
        len(train_df) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
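    # BertAdam (from the pytorch_pretrained_bert lineage) applies the warmup and
    # linear-decay schedule internally via `warmup` and `t_total`, so the training
    # loop below does not call a separate scheduler.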

    m = torch.nn.Softmax(dim=1)

    start = time.time()
    # Store our loss and accuracy for plotting
    train_loss_set = []

    # Number of training epochs (authors recommend between 2 and 4)
    epochs = args.num_train_epochs

    train_batch_generator = mask_batch_generator(args.max_chunk_num,
                                                 train_inputs, train_labels,
                                                 train_masks)
    validation_batch_generator = mask_batch_generator(args.max_chunk_num,
                                                      validation_inputs,
                                                      validation_labels,
                                                      validation_masks)

    write_log("Training start!", LOG_PATH)
    # trange is a tqdm wrapper around the normal python range
    with torch.autograd.set_detect_anomaly(True):
        for epoch in trange(epochs, desc="Epoch"):
            # Training

            # Set our model to training mode (as opposed to evaluation mode)
            model.train()

            # Tracking variables
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            # Train the data for one epoch
            tr_ids_num = len(train_ids)
            tr_batch_loss = []
            for step in range(tr_ids_num):
                b_input_ids, b_labels, b_input_mask = next(
                    train_batch_generator)
                b_input_ids = b_input_ids.to(device)
                b_input_mask = b_input_mask.to(device)
                b_labels = b_labels.repeat(b_input_ids.shape[0]).to(device)
                # Forward pass
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
                loss, logits = outputs[:2]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                tr_batch_loss.append(loss.item())  # averaged into train_loss_set at each optimizer step below
                # Backward pass
                loss.backward()
                # Update parameters and take a step using the computed gradient
                if (step + 1) % args.train_batch_size == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    train_loss_set.append(np.mean(tr_batch_loss))
                    tr_batch_loss = []

                # Update tracking variables
                tr_loss += loss.item()
                nb_tr_examples += b_input_ids.size(0)
                nb_tr_steps += 1

            write_log("Train loss: {}".format(tr_loss / nb_tr_steps), LOG_PATH)

            # Validation

            # Put model in evaluation mode to evaluate loss on the validation set
            model.eval()

            # Tracking variables
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            # Evaluate data for one epoch
            ev_ids_num = len(validation_ids)
            for step in range(ev_ids_num):
                with torch.no_grad():
                    b_input_ids, b_labels, b_input_mask = next(
                        validation_batch_generator)
                    b_input_ids = b_input_ids.to(device)
                    b_input_mask = b_input_mask.to(device)
                    b_labels = b_labels.repeat(b_input_ids.shape[0])
                    outputs = model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask)
                    # Move logits and labels to CPU
                    logits = outputs[-1]
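                    # keep the softmax probability of the positive class
                    # (column 1) for every chunk of this admission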
                    logits = m(logits).detach().cpu().numpy()[:, 1]
                    label_ids = b_labels.numpy()

                    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

                    eval_accuracy += tmp_eval_accuracy
                    nb_eval_steps += 1

            write_log(
                "Validation Accuracy: {}".format(eval_accuracy /
                                                 nb_eval_steps), LOG_PATH)
            output_checkpoints_path = os.path.join(
                args.output_dir,
                "bert_fine_tuned_with_note_checkpoint_%d.pt" % epoch)
            if args.save_model:
                if n_gpu > 1:
                    torch.save(
                        {
                            'epoch': epoch,
                            'model_state_dict': model.module.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss,
                        }, output_checkpoints_path)

                else:
                    torch.save(
                        {
                            'epoch': epoch,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss,
                        }, output_checkpoints_path)
    end = time.time()

    write_log("total training time is: {}s".format(end - start), LOG_PATH)

    fig1 = plt.figure(figsize=(15, 8))
    plt.title("Training loss")
    plt.xlabel("Chunk Batch")
    plt.ylabel("Loss")
    plt.plot(train_loss_set)
    if args.save_model:
        output_fig_path = os.path.join(
            args.output_dir, "bert_fine_tuned_with_note_training_loss.png")
        plt.savefig(output_fig_path, dpi=fig1.dpi)
        output_model_state_dict_path = os.path.join(
            args.output_dir, "bert_fine_tuned_with_note_state_dict.pt")
        if n_gpu > 1:
            torch.save(model.module.state_dict(), output_model_state_dict_path)
        else:
            torch.save(model.state_dict(), output_model_state_dict_path)
        write_log("Model saved!", LOG_PATH)
    else:
        output_fig_path = os.path.join(
            args.output_dir,
            "bert_fine_tuned_with_note_training_loss_{}_{}.png".format(
                args.seed,
                args.data_dir.split('_')[-1]))
        plt.savefig(output_fig_path, dpi=fig1.dpi)
        write_log("Model not saved as required", LOG_PATH)

    # Prediction on test set

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels, test_adm_ids = [], [], []

    # Predict
    te_ids_num = len(test_ids)
    for step in range(te_ids_num):
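        # use at most the last max_chunk_num chunks of this admission's notes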
        b_input_ids = test_inputs[step][-args.max_chunk_num:, :].to(device)
        b_input_mask = test_masks[step][-args.max_chunk_num:, :].to(device)
        b_labels = test_labels[step].repeat(b_input_ids.shape[0])
        # Telling the model not to compute or store gradients, saving memory and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Move logits and labels to CPU
        logits = outputs[-1]
        logits = m(logits).detach().cpu().numpy()[:, 1]
        label_ids = b_labels.numpy()
        adm_ids = test_ids[step].repeat(b_input_ids.shape[0])

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)
        test_adm_ids.append(adm_ids)

    try:
        flat_logits = [item for sublist in predictions for item in sublist]
    except TypeError:
        flat_logits = [
            item for sublist in predictions for item in test_func(sublist)
        ]
    flat_predictions = (np.array(flat_logits) >= 0.5).astype(int)
    try:
        flat_true_labels = [
            item for sublist in true_labels for item in sublist
        ]
    except TypeError:
        flat_true_labels = [
            item for sublist in true_labels for item in test_func(sublist)
        ]
    try:
        flat_adm_ids = [item for sublist in test_adm_ids for item in sublist]
    except TypeError:
        flat_adm_ids = [
            item for sublist in test_adm_ids for item in test_func(sublist)
        ]

    output_chunk_df = pd.DataFrame({
        'logits': flat_logits,
        'pred_label': flat_predictions,
        'label': flat_true_labels,
        'Adm_ID': flat_adm_ids
    })

    if args.save_model:
        output_chunk_df.to_csv(os.path.join(args.output_dir,
                                            'test_chunk_predictions.csv'),
                               index=False)
    else:
        output_chunk_df.to_csv(os.path.join(
            args.output_dir, 'test_chunk_predictions_{}_{}.csv'.format(
                args.seed,
                args.data_dir.split('_')[-1])),
                               index=False)

    output_df = get_patient_score(output_chunk_df, args.c)
    if args.save_model:
        output_df.to_csv(os.path.join(args.output_dir, 'test_predictions.csv'),
                         index=False)
    else:
        output_df.to_csv(os.path.join(
            args.output_dir,
            'test_predictions_{}_{}.csv'.format(args.seed,
                                                args.data_dir.split('_')[-1])),
                         index=False)
    write_performance(output_df['label'].values,
                      output_df['pred_label'].values,
                      output_df['logits'].values, config, args)
Example #29
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .jsonl files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        '--num_choices',
        type=int,
        default=4,
        help=
        "Number of answer choices (will pad if less, throw exception if more)")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")

    ###########################
    # ### KYLE'S NEW SETTINGS #
    ###########################

    parser.add_argument('--dev_name2',
                        default='',
                        help="the name of the dev experiment")

    parser.add_argument('--dev_name',
                        default='',
                        help="the name of the dev experiment")

    parser.add_argument("--remove_model",
                        default=False,
                        action='store_true',
                        help="Remove the pytorch model after done training")

    parser.add_argument("--override",
                        default=False,
                        action='store_true',
                        help="Override the existing directory")

    parser.add_argument("--no_save_checkpoints",
                        default=False,
                        action='store_true',
                        help="Don't save the model after each checkpoint ")

    parser.add_argument(
        "--run_existing",
        default='',
        help=
        "Run in eval model with an existing model, points to output_model_file"
    )

    parser.add_argument("--bert_config",
                        default='',
                        help="Location of the existing BERT configuration")

    parser.add_argument(
        "--inoculate",
        default='',
        help=
        "Inoculate/continue training the model with challenge dataset (should contain pointer to existing fine-tuned model)"
    )

    parser.add_argument(
        "--intermediate_model",
        default='',
        help=
        "Use the BERT weights of an intermediate BERT model (trained on some other task)"
    )

    parser.add_argument(
        "--exclude_dataset",
        default='',
        type=str,
        help=
        "Datasets to exclude (in case of Aristo dataset with multiple datasets built in)"
    )

    parser.add_argument("--train_name",
                        default='',
                        type=str,
                        help="the number of multiple choice options")

    parser.add_argument(
        "--limit_train",
        default='',
        type=str,
        help="(for multi-dataset datasets) the datasets to use for training")

    parser.add_argument(
        "--limit_test",
        default='',
        type=str,
        help="(for multi-dataset datasets) the datasets to use for testing")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.override:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    elif os.path.exists(args.output_dir) and args.override:
        shutil.rmtree(args.output_dir)
        os.makedirs(args.output_dir)
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    ## create a backup of the run script
    with open(os.path.join(args.output_dir, "run.sh"), 'w') as runner:
        print("python -m mcqa_datasets.arc_mc %s" % ' '.join(sys.argv[1:]),
              file=runner)

    # Setup logging
    log_file = os.path.join(args.output_dir, "logger.log")
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename=log_file)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=args.num_choices)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    logger.info('loaded a pre-trained model..')

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    reader = ARCExampleReader()
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                reader,
                                                tokenizer,
                                                evaluate=False)

        ## use an existing model or inoculate

        ## continue to train a model
        if args.inoculate:
            logger.info('Trying to load a pre-trained model..')
            model = model_class.from_pretrained(args.inoculate)
            logger.info('Finished loading..')
            #tokenizer = model_class.from_pretrained(args.inoculate)
            model.to(args.device)

        ## use an existing model, similar to the STILT idea; currently only works for BertForSequence
        elif args.intermediate_model:
            intermediate_model = BertForSequenceClassification.from_pretrained(
                args.intermediate_model)
            #intermediate_tokenizer = tokenizer_class.from_pretrained(args.intermediate_model)
            intermediate_model.to(args.device)
            ## just switch the bert weights..?
            model.bert = intermediate_model.bert
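            # only the shared BERT encoder is copied over; the task-specific head of
            # `model` keeps its freshly initialized weights (a STILT-style transfer)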
            #intermediate_model = model_class.from_trained()

        global_step, tr_loss = train(args,
                                     train_dataset=train_dataset,
                                     model=model,
                                     reader=reader,
                                     tokenizer=tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    ###################################
    # ## EVALUATION (KYLE's VERSION)  #
    ###################################

    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        ## run an existing model
        if args.run_existing:
            model = model_class.from_pretrained(args.run_existing)
            tokenizer = tokenizer_class.from_pretrained(args.run_existing)
            logger.info('Evaluating using an existing model: %s' %
                        args.run_existing)
        else:
            model = model_class.from_pretrained(args.output_dir)
            tokenizer = tokenizer_class.from_pretrained(args.output_dir)
            logger.info('Evaluating using the trained model: %s' %
                        args.output_dir)

        try:
            model.to(args.device)
        except Exception:
            raise ValueError(
                'No model found, did you not train or link up with pre-trained model?'
            )
            # tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
            # model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
            # model.to(args.device)

        ## the actual evaluation
        global_step = ""
        result = evaluate(args, model, reader, tokenizer, prefix=global_step)
        result = dict(
            (k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

        ##
        if args.dev_name2:
            global_step = ""
            results = {}
            logger.info('Now running on second development/held-out set...')
            ## update link
            args.dev_name = args.dev_name2
            result = evaluate(args,
                              model,
                              reader,
                              tokenizer,
                              prefix=global_step,
                              next_fname="next")
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    # Evaluation
    # results = {}
    # if args.do_eval and args.local_rank in [-1, 0]:
    #     checkpoints = [args.output_dir]
    #     if args.eval_all_checkpoints:
    #         checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
    #         logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    #     logger.info("Evaluate the following checkpoints: %s", checkpoints)
    #     for checkpoint in checkpoints:
    #         global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
    #         model = model_class.from_pretrained(checkpoint)
    #         model.to(args.device)
    #         result = evaluate(args, model, reader, tokenizer, prefix=global_step)
    #         result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
    #         results.update(result)

    if args.remove_model:
        logger.info('REMOVING THE MODEL!')
        try:
            os.remove(os.path.join(args.output_dir, "pytorch_model.bin"))
            os.remove(os.path.join(args.output_dir, "vocab.txt"))
        except Exception as e:
            logger.error(e, exc_info=True)

    return results
Example #30
0
def main():
    # parse the arguments
    parser = argparse.ArgumentParser(description='Process some integers.')
    # required parameters
    parser.add_argument("func",
                        default='help',
                        type=str,
                        help="train/test/help")
    parser.add_argument("--data_dir", default="data", type=str, required=False)
    parser.add_argument("--task_name", default=None, type=str, required=False)
    parser.add_argument("--tag", default=None, type=str, required=False)
    parser.add_argument("--input_dir", default=None, type=str, required=False)
    parser.add_argument("--output_dir", default=None, type=str, required=False)
    parser.add_argument("--model_name",
                        default="bert-base-uncased",
                        type=str,
                        required=False)

    args = parser.parse_args()

    # do the func
    if args.func == "help":
        print("train to generate model, test to evaluate model")
    else:
        # gather parameters
        tag = args.tag
        if tag is None:
            tag = args.tag = str(uuid.uuid1())
        print("params: {}\ntag: {}".format(str(args), tag))
        # the parser above defines no --no_cuda flag, so follow CUDA availability only
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = args.n_gpu = torch.cuda.device_count()
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)
        logger.warning("device: %s, n_gpu: %s", device, n_gpu)
        set_seed(args)
        args.task_name = args.task_name.lower()
        # TODO task specific settings
        num_labels = None

        if args.func == "train":
            pass  # train on the task
            # gather parameters
            config = BertConfig.from_pretrained()

            output_dir = args.output_dir = args.output_dir if args.output_dir else "model"
            if os.path.exists(output_dir) and os.list(output_dir):
                raise ValueError("Output dir exists")
            config = BertConfig.from_pretrained(args.model_name,
                                                num_labels=num_labels,
                                                finetuning_task=args.task_name)
            tokenizer = BertTokenizer.from_pretrained(args.model_name,
                                                      do_lower_case="uncased"
                                                      in args.model_name)
            model = BertForSequenceClassification.from_pretrained(
                args.model_name, from_tf=False, config=config)

        elif args.func == "test":
            pass  # test on the task
        else:
            raise NotImplementedError