type=str,
    help=
    "The input data dir. Should contain the .tsv files (or other data files) for the task."
)
parser.add_argument("--no_cuda",
                    action='store_true',
                    help="Avoid using CUDA when available")
args = parser.parse_args()

dir = args.dir

mc_model_path = args.mc_model_path

mc_model = RobertaForMultipleChoice.from_pretrained(mc_model_path)

mc_tokenizer = RobertaTokenizer.from_pretrained(mc_model_path)

mc_model.eval()

device = torch.device(
    "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

sent_encoder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens',
                                   device=device)
mc_model.to(device)

tagger = spacy.load("en_core_web_lg")
word_vector = gensim.models.KeyedVectors.load_word2vec_format(
    '/net/nfs.websail/yyv959/counter-fitted-vectors.txt', binary=False)
stop_words = stopwords.words('english')
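For reference, a minimal sketch (not part of the script above; the prompt and choices are made up) of how RobertaForMultipleChoice expects its inputs to be shaped, i.e. (batch_size, num_choices, seq_len):

prompt = "The weather turned cold, so"
choices = ["she put on a coat.", "she went for a swim."]
enc = mc_tokenizer([prompt] * len(choices), choices,
                   return_tensors="pt", padding=True)
inputs = {k: v.unsqueeze(0).to(device) for k, v in enc.items()}
with torch.no_grad():
    logits = mc_model(**inputs).logits  # shape: (1, num_choices)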
Example 2
 def __init__(self, model_name='microsoft/codebert-base'):
     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
     self.model = RobertaModel.from_pretrained(model_name)
     self.vector_length = self.urls[model_name]
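A hypothetical usage sketch for an embedder like the one above (the mean-pooling strategy and the standalone layout are assumptions, not taken from the original class):

import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
model = RobertaModel.from_pretrained('microsoft/codebert-base')
enc = tokenizer("def add(a, b): return a + b",
                return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    hidden = model(**enc).last_hidden_state   # (1, seq_len, 768)
vector = hidden.mean(dim=1).squeeze(0)        # 768-dim code embedding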
Example 3
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_pattern", default=None, type=str, required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    parser.add_argument("--vocab_file", default=None, type=str, required=True)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--max_seq_length", default=512, type=int)
    parser.add_argument("--doc_stride", default=128, type=int)
    parser.add_argument("--max_query_length", default=64, type=int)
    parser.add_argument("--include_unknowns", default=0.03, type=float)
    parser.add_argument("--max_position", default=50, type=int)
    parser.add_argument("--num_threads", default=16, type=int)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--start_num', type=int, default=-1)
    parser.add_argument('--end_num', type=int, default=-1)
    parser.add_argument('--generate_count', type=int, default=100)
    parser.add_argument('--hard_mode', type=bool, default=False)
    parser.add_argument('--DataName', type=str, default="SST")
    

    args = parser.parse_args()

    #tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    print("Vocab SIze!",tokenizer.vocab_size)
    

    prefix = "cached_{0}_{1}_{2}_{3}".format(str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length),args.DataName)

    prefix = os.path.join(args.output_dir, prefix)
    os.makedirs(prefix, exist_ok=True)

    
    for input_path in glob(args.input_pattern):

        if args.start_num >= 0 and args.end_num >= 0:
            continue
        cached_path = os.path.join(prefix, os.path.split(input_path)[1] + ".pkl")
        if os.path.exists(cached_path):
            logging.info("{} already exists.".format(cached_path))
            continue
        is_training = "train" in input_path
        logging.info("train:{}".format(is_training))

        examples = []
        train = pd.read_csv(input_path, sep='\t', header=0)
        for i in range(len(train)):
            examples.append(NqExample(train['sentence'][i].split(' '),train['label'][i]))
#        for dirname in os.listdir(args.input_pattern):
#            label = dirname.split(".")[0]
#            dirname = os.path.join(args.input_pattern,dirname)
#            for filename in os.listdir(dirname):
#                filepath = os.path.join(dirname,filename)
#                with open(filepath,'r',encoding='utf-8') as f:
#                    doc_tokens = f.read().split(' ')
#                    examples.append(
#                            NqExample(
#                              doc_tokens=doc_tokens,
#                              label=label))
        

        run_convert_examples_to_features(args=args,
                                         examples=examples,
                                         tokenizer=tokenizer,
                                         is_training=is_training,
                                         cached_path=cached_path)
Example 4
    ###### Set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    ###### INSTANTIATE MODEL
    tokenizer = None
    config = None
    model = None

    successful_download = False
    retries = 0

    while (retries < 5 and not successful_download):
        try:
            tokenizer = RobertaTokenizer.from_pretrained(
                PRE_TRAINED_MODEL_NAME)

            config = RobertaConfig.from_pretrained(PRE_TRAINED_MODEL_NAME,
                                                   num_labels=len(CLASS_NAMES),
                                                   id2label={
                                                       0: -1,
                                                       1: 0,
                                                       2: 1,
                                                   },
                                                   label2id={
                                                       -1: 0,
                                                       0: 1,
                                                       1: 2,
                                                   })
            config.output_attentions = True
            model = RobertaForSequenceClassification.from_pretrained(
Example 5
 def __init__(self, model_path, base_model='roberta'):
     self.model = ReRanker(base_model=base_model)
     self.model.load_state_dict(torch.load(model_path))
     self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                       max_len=512)
     print("load reranker model from ", model_path)
Example 6
from argparse import ArgumentParser
import h5py
import numpy as np
import json

argp = ArgumentParser()
argp.add_argument('--input_path')
argp.add_argument('--output_path')
argp.add_argument('--bert_model', help='code_bert or graph_code_bert')
args = argp.parse_args()
print(args)

# Load pre-trained model tokenizer (vocabulary)
# Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
if args.bert_model == 'code_bert':
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
    model = RobertaModel.from_pretrained('microsoft/codebert-base',
                                         output_hidden_states=True)
    LAYER_COUNT = 12
    FEATURE_COUNT = 768
elif args.bert_model == 'graph_code_bert':
    tokenizer = RobertaTokenizer.from_pretrained(
        'microsoft/graphcodebert-base')
    model = RobertaModel.from_pretrained('microsoft/graphcodebert-base',
                                         output_hidden_states=True)
    LAYER_COUNT = 12
    FEATURE_COUNT = 768
else:
    raise ValueError("BERT model must be base or large")

code_list = []  # list of all the CodeSearchNet data
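As a rough sketch only (the single hard-coded example, dataset key, and stacking layout are assumptions), the model loaded above could populate an HDF5 file of per-layer representations like this:

import torch
code = "def add(a, b): return a + b"
ids = tokenizer.encode(code, return_tensors="pt")        # (1, seq_len)
with torch.no_grad():
    hidden_states = model(ids).hidden_states             # embeddings + 12 layers
feats = torch.stack(hidden_states).squeeze(1).numpy()    # (13, seq_len, 768)
with h5py.File(args.output_path, "w") as fout:
    fout.create_dataset("0", data=feats)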
Example 7
 def __init__(self, pretrain_path, max_length, cat_entity_rep=False):
     nn.Module.__init__(self)
     self.roberta = RobertaModel.from_pretrained(pretrain_path)
     self.max_length = max_length
     self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
     self.cat_entity_rep = cat_entity_rep
        x = torch.mean(x, 0)
        x = self.dropout(x)
        if task_id == 0:
            ret = self.classifier(x)
        elif task_id == 1:
            ret = self.topic_classifier(x)
        return ret


# In[8]:

# Save only on the first run.
# The order of the token ids could not be fixed with seed_everything, so it changes on every run.
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base', additional_special_tokens=sorted(topic_tokens))
# tokenizer.save_pretrained('../models/topic_tokenizer/')
tokenizer = RobertaTokenizer.from_pretrained('../models/topic_tokenizer/')

# ### Adding topic tokens

# In[9]:

X_val = '[' + val_df.topic_id.map(
    str).values + '] </s> ' + val_df.description.values
X_val2 = '[' + val2_df.topic_id.map(
    str).values + '] </s> ' + val2_df.description.values
test_X = '[' + test_df.topic_id.map(
    str).values + '] </s> ' + test_df.description.values

X_val = np.array(X_val)
X_val2 = np.array(X_val2)
test_X = np.array(test_X)
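For illustration (the topic_id and description values are made up), each entry built above is a single string of the form "[42] </s> some description text", i.e. the bracketed topic token followed by the RoBERTa separator and the description.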
Example 9
 def __init__(self):
     self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
     self.model = RobertaForSequenceClassification.from_pretrained(
         "ghanashyamvtatti/roberta-fake-news")
     self.softmax_fn = torch.nn.Softmax(dim=1)
     self.client = language_v1.LanguageServiceClient()
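A hypothetical prediction helper for the wrapper above (the method name and return format are assumptions, not part of the original class):

def predict_fake_news(self, text):
    enc = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = self.model(**enc).logits
    return self.softmax_fn(logits).squeeze(0).tolist()  # class probabilities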
Example 10
def predict():
    """Determine which are yes-ands are not from a given dialogue data set with a finetuned BERT yes-and classifier"""
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        default="bert-base-uncased",
        help=
        "Provide pretrained model type that is consisten with BERT model that was fine-tuned."
    )
    parser.add_argument(
        "--model_checkpoint",
        default="runs/yesand_cornell_bert_base_iter1",
        help="Provide a directory for a pretrained BERT model.")
    parser.add_argument(
        "--data_path",
        default="data/reformatted_cornell.json",
        help="Provide a datapath for which predictions will be made.")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--predictions_folder",
        default="data/opus_predictions/",
        help="Provide a folderpath for which predictions will be saved to.")
    parser.add_argument("--test",
                        default=False,
                        dest='test',
                        action='store_true',
                        help='runs validation after 1 training step')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger.info("Arguments: {}".format(pformat(args)))

    logger.info("Loading model and tokenizer.")
    if 'roberta' in args.model:
        model = RobertaForSequenceClassification.from_pretrained(
            args.model_checkpoint)
        tokenizer = RobertaTokenizer.from_pretrained(args.model_checkpoint)
        args.max_len = ROBERTA_MAX_LEN
    elif 'bert' in args.model:
        model = BertForSequenceClassification.from_pretrained(
            args.model_checkpoint)
        tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint)
        args.max_len = BERT_MAX_LEN
    else:
        error = f"Invalid model type given for args.model: {args.model}. Must either contain 'bert' or 'roberta"
        logger.info(error)
        return

    logger.info("Loading data to predict: {}".format(args.data_path))

    if 'opus' in args.data_path:
        data_to_predict = get_opus_data(args.data_path)
    else:
        data_to_predict = get_list_data(args.data_path)

    logger.info("Building data loader...")
    prediction_dataloader = get_data_loader(args, data_to_predict, tokenizer)

    logger.info("Making predictions...")
    predictions = predict_label(args, model, prediction_dataloader,
                                data_to_predict)
    logger.info("Predictions complete for {} dialogue pairs. ".format(
        len(predictions)))

    logger.info("Saving predictions...")

    if not Path(args.predictions_folder).is_dir():
        Path(args.predictions_folder).mkdir(parents=True, exist_ok=False)
    identifier = Path(args.data_path).name
    checkpoint = Path(args.model_checkpoint).name
    predictions_fp = f"{args.predictions_folder}pred_{checkpoint}_{identifier}"
    with open(predictions_fp, 'w') as f:
        json.dump(predictions, f, indent=4)
    logger.info("Predictions saved to {}.".format(predictions_fp))
Example 11
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None,
                        type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Choose Task")
    ###############

    args = parser.parse_args()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("`do_train` must be set to run this script.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    train_examples, aspect_list, sentiment_list = processor.get_train_examples(
        args.data_dir)

    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()

    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=args.num_labels_task, output_hidden_states=False, output_attentions=False, return_dict=True)
    model = RobertaForMaskedLMDomainTask.from_pretrained(
        args.pretrain_model,
        num_labels=args.num_labels_task,
        output_hidden_states=False,
        output_attentions=False,
        return_dict=True)

    # Prepare optimizer
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    #no_decay = ['bias', 'LayerNorm.weight']
    no_grad = [
        'bert.encoder.layer.11.output.dense_ent',
        'bert.encoder.layer.11.output.LayerNorm_ent'
    ]
    param_optimizer = [(n, p) for n, p in param_optimizer
                       if not any(nd in n for nd in no_grad)]
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(t_total *
                                                                     0.1),
                                                num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      aspect_list,
                                                      sentiment_list,
                                                      args.max_seq_length,
                                                      tokenizer, args.task)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_attention_mask = torch.tensor(
            [f.attention_mask for f in train_features], dtype=torch.long)
        if args.task == 1:
            print("Excuting the task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        if args.task == 1:
            train_data = TensorDataset(all_input_ids, all_attention_mask,
                                       all_label_ids)
        elif args.task == 2:
            train_data = TensorDataset(all_input_ids, all_attention_mask,
                                       all_segment_ids, all_label_ids)
        else:
            print("Wrong here1")
        '''
        print("========")
        print(train_data)
        print(type(train_data))
        exit()
        '''

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        output_loss_file = os.path.join(args.output_dir, "loss")
        loss_fout = open(output_loss_file, 'w')
        model.train()

        ########## Pre-Process ##########
        ###############################

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
                batch = tuple(t.to(device) for t in batch)

                if args.task == 1:
                    input_ids, attention_mask, label_ids = batch
                elif args.task == 2:
                    input_ids, attention_mask, segment_ids, label_ids = batch
                else:
                    print("Wrong here3")

                if args.task == 1:
                    #loss, logits, hidden_states, attentions
                    #output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                    #loss = output.loss
                    loss, logit = model(input_ids_org=input_ids,
                                        token_type_ids=None,
                                        attention_mask=attention_mask,
                                        sentence_label=label_ids,
                                        func="task_class")
                elif args.task == 2:
                    #loss, logits, hidden_states, attentions
                    #output = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, labels=label_ids)
                    #output = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, labels=label_ids)
                    #output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                    #loss = output.loss
                    loss, logit = model(input_ids_org=input_ids,
                                        token_type_ids=None,
                                        attention_mask=attention_mask,
                                        sentence_label=label_ids,
                                        func="task_class")
                else:
                    print("Wrong!!")

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    ###
                    #optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    ###
                else:
                    loss.backward()

                loss_fout.write("{}\n".format(loss.item()))
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    ###
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1
                    ###
            if epoch < 2:
                continue
            else:
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_{}".format(global_step))
                output_model_file = os.path.join(
                    args.output_dir, "pytorch_model.bin_{}".format(epoch))
                torch.save(model_to_save.state_dict(), output_model_file)

        # Save a trained model
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)
Example 12
def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path,
                            pytorch_dump_folder_path, model_size):
    # Load configuration defined in the metadata file
    with open(metadata_path) as metadata_file:
        metadata = json.load(metadata_file)
    config = LukeConfig(use_entity_aware_attention=True,
                        **metadata["model_config"])

    # Load in the weights from the checkpoint_path
    state_dict = torch.load(checkpoint_path, map_location="cpu")

    # Load the entity vocab file
    entity_vocab = load_entity_vocab(entity_vocab_path)

    tokenizer = RobertaTokenizer.from_pretrained(
        metadata["model_config"]["bert_model_name"])

    # Add special tokens to the token vocabulary for downstream tasks
    entity_token_1 = AddedToken("<ent>", lstrip=False, rstrip=False)
    entity_token_2 = AddedToken("<ent2>", lstrip=False, rstrip=False)
    tokenizer.add_special_tokens(
        dict(additional_special_tokens=[entity_token_1, entity_token_2]))
    config.vocab_size += 2

    print(f"Saving tokenizer to {pytorch_dump_folder_path}")
    tokenizer.save_pretrained(pytorch_dump_folder_path)
    with open(
            os.path.join(pytorch_dump_folder_path,
                         LukeTokenizer.vocab_files_names["entity_vocab_file"]),
            "w") as f:
        json.dump(entity_vocab, f)

    tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path)

    # Initialize the embeddings of the special tokens
    word_emb = state_dict["embeddings.word_embeddings.weight"]
    ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0)
    ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0)
    state_dict["embeddings.word_embeddings.weight"] = torch.cat(
        [word_emb, ent_emb, ent2_emb])

    # Initialize the query layers of the entity-aware self-attention mechanism
    for layer_index in range(config.num_hidden_layers):
        for matrix_name in ["query.weight", "query.bias"]:
            prefix = f"encoder.layer.{layer_index}.attention.self."
            state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix +
                                                                   matrix_name]
            state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix +
                                                                   matrix_name]
            state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix +
                                                                   matrix_name]

    # Initialize the embedding of the [MASK2] entity using that of the [MASK] entity for downstream tasks
    entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"]
    entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]]

    model = LukeModel(config=config).eval()

    missing_keys, unexpected_keys = model.load_state_dict(state_dict,
                                                          strict=False)
    if not (len(missing_keys) == 1
            and missing_keys[0] == "embeddings.position_ids"):
        raise ValueError(
            f"Missing keys {', '.join(missing_keys)}. Expected only missing embeddings.position_ids"
        )
    if not (all(
            key.startswith("entity_predictions") or key.startswith("lm_head")
            for key in unexpected_keys)):
        raise ValueError(
            "Unexpected keys"
            f" {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}"
        )

    # Check outputs
    tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path,
                                              task="entity_classification")

    text = (
        "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the"
        " new world number one avoid a humiliating second- round exit at Wimbledon ."
    )
    span = (39, 42)
    encoding = tokenizer(text,
                         entity_spans=[span],
                         add_prefix_space=True,
                         return_tensors="pt")

    outputs = model(**encoding)

    # Verify word hidden states
    if model_size == "large":
        expected_shape = torch.Size((1, 42, 1024))
        expected_slice = torch.tensor([[0.0133, 0.0865, 0.0095],
                                       [0.3093, -0.2576, -0.7418],
                                       [-0.1720, -0.2117, -0.2869]])
    else:  # base
        expected_shape = torch.Size((1, 42, 768))
        expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091],
                                       [0.1099, 0.3329, -0.1095],
                                       [0.0765, 0.5335, 0.1179]])

    if not (outputs.last_hidden_state.shape == expected_shape):
        raise ValueError(
            f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}"
        )
    if not torch.allclose(
            outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
        raise ValueError

    # Verify entity hidden states
    if model_size == "large":
        expected_shape = torch.Size((1, 1, 1024))
        expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]])
    else:  # base
        expected_shape = torch.Size((1, 1, 768))
        expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]])

    if not (outputs.entity_last_hidden_state.shape == expected_shape):
        raise ValueError(
            f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is"
            f" {expected_shape}")
    if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3],
                          expected_slice,
                          atol=1e-4):
        raise ValueError

    # Finally, save our PyTorch model and tokenizer
    print("Saving PyTorch model to {}".format(pytorch_dump_folder_path))
    model.save_pretrained(pytorch_dump_folder_path)
Example 13
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained(
    'albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained(
    'distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
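A minimal sketch (the sentence is made up) of how decode() could be used with the roberta-large pair loaded above to fill a masked slot:

import torch
text = "The capital of France is <mask>."
enc = roberta_tokenizer(text, return_tensors="pt")
mask_pos = (enc["input_ids"][0] == roberta_tokenizer.mask_token_id).nonzero().item()
with torch.no_grad():
    logits = roberta_model(**enc).logits
pred_idx = torch.topk(logits[0, mask_pos], top_k).indices
print(decode(roberta_tokenizer, pred_idx, top_clean=top_k))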

Example 14
def test(config, model_name="ddi_e-5-0.9229.pkl"):

    label_error = {1: 0, 2: 0, 3: 0, 4: 0}

    # vocab = torch.load(os.path.join(config.ROOT_DIR, 'vocab.pt'))
    #+++
    # logging.info('Load pretrained vectors: {}*{}'.format(vocab.word_num, vocab.word_dim))
    # logging.info('Number of classes: {}'.format(vocab.class_num))

    if config.BERT_MODE == 2:
        logging.info('Model: {}'.format(config.pretrained_model_name))
        tokenizer = RobertaTokenizer.from_pretrained(
            config.pretrained_model_name, do_lower_case=config.do_lower_case)
        tokenizer.add_special_tokens(
            {"additional_special_tokens": ["<e1>", "</e1>", "<e2>", "</e2>"]})
        bert_config = RobertaConfig.from_pretrained(
            config.pretrained_model_name,
            num_labels=num_labels,
            finetuning_task=config.task)
        model = MyRoberta(bert_config, config)
        model.resize_token_embeddings(len(tokenizer))

    if config.BERT_MODE == 3:
        logging.info('Model: {}'.format(config.pretrained_model_name))
        tokenizer = BertTokenizer.from_pretrained(
            config.pretrained_model_name, do_lower_case=config.do_lower_case)
        tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>",
                "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>",
                "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>",
                "</e23>"
            ]
        })
        bert_config = BertConfig.from_pretrained(config.pretrained_model_name,
                                                 num_labels=num_labels,
                                                 finetuning_task=config.task)
        model = Mybert_without_entity_information(bert_config, config)
        model.resize_token_embeddings(len(tokenizer))

    if config.BERT_MODE == 1:
        logging.info('Model: {}'.format(config.pretrained_model_name))
        tokenizer = BertTokenizer.from_pretrained(
            config.pretrained_model_name, do_lower_case=config.do_lower_case)
        tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>",
                "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>",
                "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>",
                "</e23>", "drug1", "drug2"
            ]
        })
        bert_config = BertConfig.from_pretrained(config.pretrained_model_name,
                                                 num_labels=num_labels,
                                                 finetuning_task=config.task)
        model = Mybert(bert_config, config)
        model.resize_token_embeddings(len(tokenizer))

    if config.BERT_MODE == 5:
        logging.info('Model: {}'.format(config.pretrained_model_name))
        tokenizer = BertTokenizer.from_pretrained(
            config.pretrained_model_name, do_lower_case=config.do_lower_case)
        tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>",
                "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>",
                "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>",
                "</e23>"
            ]
        })
        bert_config = BertConfig.from_pretrained(config.pretrained_model_name,
                                                 num_labels=num_labels,
                                                 finetuning_task=config.task)
        model = Mybert_without_attention(bert_config, config)
        model.resize_token_embeddings(len(tokenizer))

    if config.BERT_MODE == 6:
        logging.info('Model: {}'.format(config.pretrained_model_name))
        tokenizer = BertTokenizer.from_pretrained(
            config.pretrained_model_name, do_lower_case=config.do_lower_case)
        tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>",
                "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>",
                "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>",
                "</e23>"
            ]
        })
        bert_config = BertConfig.from_pretrained(config.pretrained_model_name,
                                                 num_labels=num_labels,
                                                 finetuning_task=config.task)
        model = Mybert_without_packedBiGRU(bert_config, config)
        model.resize_token_embeddings(len(tokenizer))
    if config.BERT_MODE == 7:
        logging.info('Model: {}'.format(config.pretrained_model_name))
        logging.info('Model: {}'.format("Mybert_startent"))
        tokenizer = BertTokenizer.from_pretrained(
            config.pretrained_model_name, do_lower_case=config.do_lower_case)
        tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<e1>", "</e1>", "<e2>", "</e2>", "<e11>", "</e11>", "<e12>",
                "</e12>", "<e10>", "</e10>", "<e13>", "</e13>", "<e20>",
                "</e20>", "<e23>", "</e23>", "<e21>", "</e21>", "<e22>",
                "</e22>"
            ]
        })
        bert_config = BertConfig.from_pretrained(config.pretrained_model_name,
                                                 num_labels=num_labels,
                                                 finetuning_task=config.task)
        model = Mybert_startent(bert_config, config)
        model.resize_token_embeddings(len(tokenizer))

    test_dataset = torch.load(os.path.join(config.ROOT_DIR, 'test_c.pt'))
    test_loader = DataLoader(test_dataset, config.BATCH_SIZE, shuffle=True)

    logging.info('Number of test pair: {}'.format(len(test_dataset)))

    # num_params = sum(np.prod(p.size()) for p in model.parameters())
    # num_embedding_params = np.prod(model.word_emb.weight.size()) + np.prod(model.tag_emb.weight.size())
    # print('# of parameters: {}'.format(num_params))
    # print('# of word embedding parameters: {}'.format(num_embedding_params))
    # print('# of parameters (excluding embeddings): {}'.format(num_params - num_embedding_params))

    if model_name is None:
        model_path = utils.best_model_path(config.SAVE_DIR,
                                           config.DATA_SET,
                                           i=0)

        logging.info(
            'Loading the best model on validation set: {}'.format(model_path))
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
    else:
        model_path = os.path.join(config.SAVE_DIR, config.DATA_SET, model_name)
        model_path = r"checkpoint/BioBert\drugmask\addClassifieddata0.25effect0.125Int0.5\lossweight\lossweight-0.8411.pkl"
        # model_path = os.path.join('checkpoint/BioBert/biobert_gru2_drop00_ddi_e-5', model_name)
        logging.info('Loading the model: {}'.format(model_path))
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()
    model.to(DEVICE)
    # model.display()

    torch.set_grad_enabled(False)

    def run_iter(batch):
        sent = batch[0].to(DEVICE)
        mask = batch[1].to(DEVICE)
        segment = batch[2].to(DEVICE)
        label = batch[3].to(DEVICE)
        e1_mask = batch[4].to(DEVICE)
        e2_mask = batch[5].to(DEVICE)
        length = batch[6].to(DEVICE)
        logits = model(input_ids=sent,
                       attention_mask=mask,
                       token_type_ids=segment,
                       labels=label,
                       e1_mask=e1_mask,
                       e2_mask=e2_mask,
                       length=length)

        label_pred = logits.max(1)[1]

        return label_pred.cpu()

    test_labels = []
    test_preds = []

    for test_batch in test_loader:
        test_pred = run_iter(batch=test_batch)

        test_labels.extend(test_batch[3])
        test_preds.extend(test_pred)

    test_p, test_r, test_f1, _ = metrics.precision_recall_fscore_support(
        test_labels, test_preds, labels=[1, 2, 3, 4], average='micro')
    test_p_n, test_r_n, test_f1_n, _ = metrics.precision_recall_fscore_support(
        test_labels, test_preds, labels=[0], average='micro')
    test_p_a, test_r_a, test_f1_a, _ = metrics.precision_recall_fscore_support(
        test_labels, test_preds, labels=[1], average='micro')
    test_p_e, test_r_e, test_f1_e, _ = metrics.precision_recall_fscore_support(
        test_labels, test_preds, labels=[2], average='micro')
    test_p_m, test_r_m, test_f1_m, _ = metrics.precision_recall_fscore_support(
        test_labels, test_preds, labels=[3], average='micro')
    test_p_i, test_r_i, test_f1_i, _ = metrics.precision_recall_fscore_support(
        test_labels, test_preds, labels=[4], average='micro')
    # plt.figure("ROC Curve")
    # plt.title("ROC Curve")
    # plt.xlabel('Recall')
    # plt.ylabel('Precision')
    # precision, recall, _ = metrics.roc_curve(test_labels, test_preds)
    # plt.plot(recall,precision)
    # plt.show()
    # for i, l in enumerate(test_labels):
    #     if l!=test_preds[i] and int(l)!=0:
    #         label_error[int(l)]+=1

    logging.info(
        'precision =  {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format(
            test_p, test_r, test_f1))
    logging.info(
        'negative: precision =  {:.4f}: recall = {:.4f}, fscore = {:.4f}'.
        format(test_p_n, test_r_n, test_f1_n))
    logging.info(
        'advise: precision =  {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format(
            test_p_a, test_r_a, test_f1_a))
    logging.info(
        'effect: precision =  {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format(
            test_p_e, test_r_e, test_f1_e))
    logging.info(
        'mechanism: precision =  {:.4f}: recall = {:.4f}, fscore = {:.4f}'.
        format(test_p_m, test_r_m, test_f1_m))
    logging.info(
        'int: precision =  {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format(
            test_p_i, test_r_i, test_f1_i))
Example 15
    num_labels = 3 if task == 'c' else 2

    # Set tokenizer for different models
    if model_name == 'bert':
        if task == 'all':
            model = MTL_Transformer_LSTM(model_name, model_size, args=args)
        else:
            model = BERT(model_size, args=args, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
    elif model_name == 'roberta':
        if task == 'all':
            model = MTL_Transformer_LSTM(model_name, model_size, args=args)
        else:
            model = RoBERTa(model_size, args=args, num_labels=num_labels)
        tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
    elif model_name == 'bert-gate' and task == 'all':
        model_name = model_name.replace('-gate', '')
        model = GatedModel(model_name, model_size, args=args)
        tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
    elif model_name == 'roberta-gate' and task == 'all':
        model_name = model_name.replace('-gate', '')
        model = GatedModel(model_name, model_size, args=args)
        tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')

    # Move model to correct device
    model = model.to(device=device)

    if args['ckpt'] != '':
        model.load_state_dict(load(args['ckpt']))
Example 16
def run(args, config, train_data, valid_data, test_data=None):
    ############################ PARAMETER SETTING ##########################
    num_workers = config['dataloader']['n_jobs']
    batch_size = config['dataloader']['batch_size']
    # learning_rate = config['optimizer']['learning_rate']
    # warmup_proportion = config['optimizer']['warmup_proportion']
    # save_ckpt_dir = os.path.join(args.save_path, 'checkpoints')

    audio_length = 3000
    epochs = args.epochs

    tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_path)
    ############################## PREPARE DATASET ##########################
    train_dataset = DownstreamDataset(train_data, tokenizer, audio_length)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        collate_fn=lambda x: collate(x, tokenizer, config['upstream'][
            'acoustic']),
        shuffle=True,
        num_workers=num_workers)
    valid_dataset = DownstreamDataset(valid_data, tokenizer, audio_length)
    valid_loader = torch.utils.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        collate_fn=lambda x: collate(x, tokenizer, config['upstream'][
            'acoustic']),
        shuffle=False,
        num_workers=num_workers)

    if test_data is None:
        test_data = valid_data
    test_dataset = DownstreamDataset(test_data, tokenizer, audio_length)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        collate_fn=lambda x: collate(x, tokenizer, config['upstream'][
            'acoustic']),
        shuffle=False,
        num_workers=num_workers)
    ########################### CREATE MODEL #################################
    model = MultiModalEncoderDecoder(
        ckpt_path=args.ckpt_path,
        num_classes=config['downstream']['label_num'])
    model.cuda()

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=epochs)

    ########################### TRAINING #####################################
    count, best_metric, save_metric, best_epoch = 0, -np.inf, None, 0

    for epoch in range(epochs):
        epoch_train_loss = []
        model.train()
        start_time = time.time()

        time.sleep(2)  # avoid the deadlock during the switch between the different dataloaders
        progress = tqdm(train_loader, desc='Epoch {:0>3d}'.format(epoch))
        for acoustic_inputs, semantic_inputs, label_inputs, _ in progress:
            speech_inputs = acoustic_inputs[0].cuda()
            speech_attention_mask = acoustic_inputs[1].cuda()
            text_inputs = semantic_inputs[0].cuda()
            text_attention_mask = semantic_inputs[1].cuda()

            label_inputs = label_inputs.cuda()

            model.zero_grad()
            logits, _ = model(
                text_encoder_inputs=speech_inputs,
                text_encoder_attention_mask=speech_attention_mask,
                text_decoder_inputs=text_inputs,
                text_decoder_attention_mask=text_attention_mask,
                speech_encoder_inputs=text_inputs,
                speech_encoder_attention_mask=text_attention_mask,
                speech_decoder_inputs=speech_inputs,
                speech_decoder_attention_mask=speech_attention_mask,
            )

            loss = loss_fn(logits,
                           label_inputs,
                           num_classes=config['downstream']['label_num'])

            epoch_train_loss.append(loss)

            loss.backward()
            optimizer.step()
            scheduler.step()

            count += 1

            acc_train_loss = torch.mean(
                torch.tensor(epoch_train_loss)).cpu().detach().numpy()
            progress.set_description("Epoch {:0>3d} - Loss {:.4f}".format(
                epoch, acc_train_loss))

        model.eval()
        pred_y, true_y = [], []
        with torch.no_grad():
            time.sleep(2)  # avoid the deadlock during the switch between the different dataloaders
            for acoustic_inputs, semantic_inputs, label_inputs, _ in valid_loader:
                speech_inputs = acoustic_inputs[0].cuda()
                speech_attention_mask = acoustic_inputs[1].cuda()
                text_inputs = semantic_inputs[0].cuda()
                text_attention_mask = semantic_inputs[1].cuda()

                true_y.extend(list(label_inputs.numpy()))

                logits, hiddens = model(
                    text_encoder_inputs=speech_inputs,
                    text_encoder_attention_mask=speech_attention_mask,
                    text_decoder_inputs=text_inputs,
                    text_decoder_attention_mask=text_attention_mask,
                    speech_encoder_inputs=text_inputs,
                    speech_encoder_attention_mask=text_attention_mask,
                    speech_decoder_inputs=speech_inputs,
                    speech_decoder_attention_mask=speech_attention_mask,
                )

                if config['downstream']['label_num'] == 1:
                    prediction = logits.view(-1)
                    label_outputs = prediction.cpu().detach().numpy().astype(
                        float)
                else:
                    if args.task_name == "verification":
                        # for speaker verification we take the hidden before the classifier as the output
                        label_outputs = hiddens.cpu().detach().numpy().astype(
                            float)
                    else:
                        prediction = torch.argmax(logits, axis=1)
                        label_outputs = prediction.cpu().detach().numpy(
                        ).astype(int)

                pred_y.extend(list(label_outputs))

        # think about the metric calculation
        key_metric, report_metric = downstream_metrics(pred_y, true_y,
                                                       args.task_name)

        epoch_train_loss = torch.mean(
            torch.tensor(epoch_train_loss)).cpu().detach().numpy()

        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
              time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
        print('Valid Metric: {} - Train Loss: {:.3f}'.format(
            ' - '.join([
                '{}: {:.3f}'.format(key, value)
                for key, value in report_metric.items()
            ]), epoch_train_loss))

        if key_metric > best_metric:
            best_metric, best_epoch = key_metric, epoch
            print('Better Metric found on dev, calculate performance on Test')
            pred_y, true_y = [], []
            with torch.no_grad():
                time.sleep(2)  # avoid the deadlock during the switch between the different dataloaders
                for acoustic_inputs, semantic_inputs, label_inputs, _ in test_loader:
                    speech_inputs = acoustic_inputs[0].cuda()
                    speech_attention_mask = acoustic_inputs[1].cuda()
                    text_inputs = semantic_inputs[0].cuda()
                    text_attention_mask = semantic_inputs[1].cuda()

                    true_y.extend(list(label_inputs.numpy()))

                    logits, hiddens = model(
                        text_encoder_inputs=speech_inputs,
                        text_encoder_attention_mask=speech_attention_mask,
                        text_decoder_inputs=text_inputs,
                        text_decoder_attention_mask=text_attention_mask,
                        speech_encoder_inputs=text_inputs,
                        speech_encoder_attention_mask=text_attention_mask,
                        speech_decoder_inputs=speech_inputs,
                        speech_decoder_attention_mask=speech_attention_mask,
                    )

                    if config['downstream']['label_num'] == 1:
                        prediction = logits.view(-1)
                        label_outputs = prediction.cpu().detach().numpy(
                        ).astype(float)
                    else:
                        if args.task_name == "verification":
                            label_outputs = hiddens.cpu().detach().numpy(
                            ).astype(float)
                        else:
                            prediction = torch.argmax(logits, axis=1)
                            label_outputs = prediction.cpu().detach().numpy(
                            ).astype(int)

                    pred_y.extend(list(label_outputs))

            _, save_metric = downstream_metrics(pred_y, true_y, args.task_name)
            print("Test Metric: {}".format(' - '.join([
                '{}: {:.3f}'.format(key, value)
                for key, value in save_metric.items()
            ])))

    print("End. Best epoch {:03d}: {}".format(
        best_epoch, ' - '.join([
            '{}: {:.3f}'.format(key, value)
            for key, value in save_metric.items()
        ])))
    return save_metric
    def _define_tokenizer(self):
        return RobertaTokenizer.from_pretrained(self.params["model_name"],
                                                do_lower_case=True)
def train(train_Xy, val_Xy, n_epochs=4, batch_size=4):
    tokenizer = RobertaTokenizer.from_pretrained("allenai/biomed_roberta_base")
    model = RobertaForSequenceClassification.from_pretrained(
        "allenai/biomed_roberta_base").to(device=device)

    #from transformers import Adam, AdamW
    from transformers import AdamW
    #optimizer = AdamW(model.parameters())
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    best_val = np.inf
    train_epoch_loss = 0
    for epoch in range(n_epochs):
        model.train()
        print("on epoch ", epoch)
        train_epoch_loss = 0

        batch_X, batch_y = [], []
        cur_batch_size = 0

        for i, article in enumerate(train_Xy):
            if (i % 100) == 0:
                print("on article", i)

            # sample instances from current article
            cur_X, cur_y = instances_from_article(article,
                                                  max_instances=batch_size -
                                                  cur_batch_size)

            batch_X.extend(cur_X)
            batch_y.extend(cur_y)

            cur_batch_size += len(cur_X)

            if cur_batch_size >= batch_size:
                optimizer.zero_grad()

                batch_X_tensor = tokenizer.batch_encode_plus(
                    batch_X[:batch_size],
                    max_length=512,
                    add_special_tokens=True,
                    pad_to_max_length=True)
                batch_y_tensor = torch.tensor(batch_y[:batch_size])

                loss, logits = model(
                    torch.tensor(
                        batch_X_tensor['input_ids']).to(device=device),
                    attention_mask=torch.tensor(
                        batch_X_tensor['attention_mask']).to(device=device),
                    labels=batch_y_tensor.to(device=device))
                train_epoch_loss += loss.cpu().detach().numpy()

                #import pdb; pdb.set_trace()
                #print("batch loss: {}".format(loss))
                loss.backward()
                optimizer.step()

                # empty out current batch
                cur_batch_size = 0
                batch_X, batch_y = [], []

        print("total epoch train loss {}".format(train_epoch_loss))

        ####
        # eval on val set
        ###
        print("evaluating on val...")
        model.eval()
        total_correct, total_preds = 0, 0
        val_loss = 0
        for j, article in enumerate(val_Xy):
            val_X, val_y = instances_from_article(article,
                                                  max_instances=batch_size)
            val_X_tensor = tokenizer.batch_encode_plus(val_X[:batch_size],
                                                       max_length=512,
                                                       add_special_tokens=True,
                                                       pad_to_max_length=True)
            val_y_tensor = torch.tensor(val_y[:batch_size])

            loss, logits = model(
                torch.tensor(val_X_tensor['input_ids']).to(device=device),
                attention_mask=torch.tensor(
                    val_X_tensor['attention_mask']).to(device=device),
                labels=val_y_tensor.to(device=device))
            val_loss += loss.cpu().detach().numpy()

            class_preds = torch.argmax(logits, dim=1).detach().cpu()
            total_correct += (class_preds == val_y_tensor).sum()
            total_preds += len(val_X)
        #import pdb; pdb.set_trace()
        val_acc = total_correct / float(
            total_preds)  # note that the baseline depends on neg samples
        print("val loss, acc after epoch {} is: {}, {}".format(
            epoch, val_loss, val_acc))
        if val_loss < best_val:
            print("new best loss: {}".format(val_loss))
            best_val = val_loss
            torch.save(model.state_dict(), "inference.model")
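

# --- Hypothetical usage sketch (not part of the original snippet) ---
# The checkpoint written above can be reloaded for inference. This assumes
# `device` is defined as in train() and that the weights in "inference.model"
# match the "allenai/biomed_roberta_base" architecture fine-tuned above.
def load_for_inference(weights_path="inference.model"):
    model = RobertaForSequenceClassification.from_pretrained(
        "allenai/biomed_roberta_base")
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()
    return model.to(device=device)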
Esempio n. 19
def evaluate(args):
    """
    Evaluate a masked language model using CrowS-Pairs dataset.
    """

    print("Evaluating:")
    print("Input:", args.input_file)
    print("Model:", args.lm_model)
    print("=" * 100)

    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data(args.input_file)

    # supported masked language models
    if args.lm_model == "scibert-bert":
        tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
        model = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased')
        uncased = True
    elif args.lm_model == "biobert-bert":
        tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
        model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1')
        uncased = True
    elif args.lm_model == "scibert-roberta":
        tokenizer = RobertaTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
        model = RobertaForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased')
        uncased = True
    elif args.lm_model == "biobert-roberta":
        tokenizer = RobertaTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
        model = RobertaForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1')
        uncased = True
    elif args.lm_model == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMaskedLM.from_pretrained('roberta-large')
        uncased = False
    elif args.lm_model == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        uncased = True

    model.eval()
    model.to('cuda')

    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open(args.lm_model + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": uncased
          }

    # score each sentence.
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less',
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'])

    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                        }, ignore_index=True)

    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:', round(antistereo_score / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()
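

# --- Hypothetical sketch (not part of the original code) ---
# mask_unigram() is defined elsewhere in this file. Its score is presumably a
# masked pseudo-log-likelihood along these lines: mask one position at a time
# and sum the log-probability the model assigns to the true token there. The
# real metric is more careful (e.g. it scores only the tokens the two
# sentences share); this simplified helper only illustrates the idea.
def pseudo_log_likelihood(sentence, lm):
    tokenizer, model = lm["tokenizer"], lm["model"]
    log_softmax, mask_token = lm["log_softmax"], lm["mask_token"]
    if lm["uncased"]:
        sentence = sentence.lower()
    token_ids = tokenizer.encode(sentence, return_tensors="pt").to("cuda")
    mask_id = tokenizer.convert_tokens_to_ids(mask_token)
    total = 0.0
    for i in range(1, token_ids.size(1) - 1):  # skip the special tokens
        masked = token_ids.clone()
        masked[0, i] = mask_id
        with torch.no_grad():
            logits = model(masked)[0]  # (1, seq_len, vocab_size)
        total += log_softmax(logits[0, i])[token_ids[0, i]].item()
    return total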
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None,
                        type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Choose Task")
    ###############

    args = parser.parse_args()
    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_" in x]
    print(filenames)

    file_mark = []
    model_performace = dict()
    for x in filenames:
        #file_mark.append([x, True])
        file_mark.append([x, False])

    ####
    ####
    test_examples, aspect_list, sentiment_list = processor.get_test_examples(
        args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()
    test = convert_examples_to_features(test_examples, aspect_list,
                                        sentiment_list, args.max_seq_length,
                                        tokenizer, args.task)
    eval_examples = test_examples
    ###

    for x, mark in file_mark:
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(
            args.pretrain_model,
            output_hidden_states=False,
            output_attentions=False,
            return_dict=True,
            num_labels=args.num_labels_task)
        model.load_state_dict(torch.load(output_model_file), strict=False)
        #strict False: ignore non-matching keys

        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        model.to(device)
        if mark:
            eval_features = dev
        else:
            eval_features = test

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_attention_mask = torch.tensor(
            [f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Excuting the task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask,
                                      all_label_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask,
                                      all_segment_ids, all_label_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        if mark:
            output_eval_file = os.path.join(
                args.output_dir,
                "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(
                args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_gold = os.path.join(
                args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(
                args.output_dir,
                "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(
                args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_gold = os.path.join(
                args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_gold, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for i, t in enumerate(batch))

            if args.task == 1:
                input_ids, attention_mask, label_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids = batch
            else:
                print("Wrong here3")

            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids,
                                              sentence_label=label_ids,
                                              attention_mask=attention_mask,
                                              func="task_class")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                tmp_eval_loss, logits = model(input_ids_org=input_ids,
                                              sentence_label=label_ids,
                                              attention_mask=attention_mask,
                                              func="task_class")
                #exit()
                #logits = output.logits
                #tmp_eval_loss = output.loss
            else:
                print("Wrong!!")

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy, pred = accuracy(logits, label_ids)
            for a, b in zip(pred, label_ids):
                fgold.write("{}\n".format(b))
                fpred.write("{}\n".format(a))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        model_performace[x] = eval_accuracy

    #################
    #################
    '''
Esempio n. 21
    def __init__(self, pretrain_path, max_length):
        nn.Module.__init__(self)
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            pretrain_path, num_labels=2)
        self.max_length = max_length
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
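
    # Hypothetical companion method (not in the original snippet), showing how
    # self.tokenizer and self.max_length would typically be used together.
    def tokenize(self, text):
        enc = self.tokenizer(text,
                             max_length=self.max_length,
                             padding="max_length",
                             truncation=True,
                             return_tensors="pt")
        return enc["input_ids"], enc["attention_mask"]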
Esempio n. 22
                        help='output model path and name')
    parser.add_argument('--benchmark',
                        action='store_true',
                        default=False,
                        help='Get benchmark performance of quantized model.')
    parser.add_argument('--benchmark_nums',
                        type=int,
                        default=1000,
                        help="Benchmark numbers of samples")
    parser.add_argument('--mode',
                        type=str,
                        default='performance',
                        choices=['performance', 'accuracy'],
                        help="Mode of benchmark")
    args = parser.parse_args()
    tokenizer = RobertaTokenizer.from_pretrained(args.input_dir,
                                                 do_lower_case=True)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           evaluate=True)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, \
        batch_size=args.eval_batch_size)

    def eval_func(model):
        return evaluate_onnxrt(args, model, tokenizer, eval_dataloader)
Esempio n. 23
    'cnn_filters': 300,
    'cnn_kernel_size': 5,
    'init_lr': 1e-4,
    'max_lr': 8e-4
}

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                               lowercase=True,
                                               add_special_tokens=True)

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2',
                                                   lowercase=True,
                                                   add_special_tokens=True)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                     lowercase=True,
                                                     add_special_tokens=True)

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                                 lowercase=True,
                                                 add_special_tokens=True)
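
# Hypothetical sanity check (not in the original script): tokenizing the same
# utterance with each tokenizer shows how their subword vocabularies differ.
_sample = 'i want to fly from boston to denver'
for _name, _tok in [('bert', bert_tokenizer), ('albert', albert_tokenizer),
                    ('roberta', roberta_tokenizer), ('xlnet', xlnet_tokenizer)]:
    print(_name, _tok.tokenize(_sample))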


def data_generator(f_path, params):
    with open(f_path) as f:
        for line in f:
            line = line.rstrip()
            text, slot_intent = line.split('\t')
            words = text.split()[1:-1]
            slot_intent = slot_intent.split()
            slots, intent = slot_intent[1:-1], slot_intent[-1]
    def setup_python_tokenizer(self):
        self.base_tokenizer = RobertaTokenizer.from_pretrained(
            'roberta-base', do_lower_case=False, cache_dir=self.test_dir)
# In[2]:

#################################################################
### Step 1
#################################################################

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler

import json

from transformers import RobertaTokenizer

# Load the RoBERTa tokenizer.
print('Loading RoBERTa tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large',
                                             do_lower_case=True)

from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig

# Load RobertaForSequenceClassification, the pretrained RoBERTa model with a
# single linear classification layer on top.
model = RobertaForSequenceClassification.from_pretrained(
    './step_1_casual_sentence_classifier_model',  # use my stored model
    num_labels=2,  # The number of output labels--2 for binary classification.
    # You can increase this for multi-class tasks.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()
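
# Hypothetical quick check (not part of the original notebook): push one
# sentence through the classifier loaded above. Which of the two labels means
# "causal" is an assumption of this sketch, so only the raw argmax is printed.
import torch

enc = tokenizer.encode_plus('Smoking causes lung cancer.', return_tensors='pt')
with torch.no_grad():
    logits = model(enc['input_ids'].cuda(),
                   attention_mask=enc['attention_mask'].cuda())[0]
print('Predicted class:', logits.argmax(dim=-1).item())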
        type=str,
        help=
        'directory that contains corpus files to be encoded, in jsonl format.',
        required=True)
    parser.add_argument('--index',
                        type=str,
                        help='directory to store brute force index of corpus',
                        required=True)
    parser.add_argument('--batch', type=int, help='batch size', default=8)
    parser.add_argument('--device',
                        type=str,
                        help='device cpu or cuda [cuda:0, cuda:1...]',
                        default='cuda:0')
    args = parser.parse_args()

    tokenizer = RobertaTokenizer.from_pretrained(args.encoder)
    model = AnceEncoder.from_pretrained(args.encoder)
    model.to(args.device)

    index = faiss.IndexFlatIP(args.dimension)

    if not os.path.exists(args.index):
        os.mkdir(args.index)

    texts = []
    with open(os.path.join(args.index, 'docid'), 'w') as id_file:
        for file in sorted(os.listdir(args.corpus)):
            file = os.path.join(args.corpus, file)
            if file.endswith('json') or file.endswith('jsonl'):
                print(f'Loading {file}')
                with open(file, 'r') as corpus:
Esempio n. 27
            dev_set.append(line)

    print('building dataloaders ...')
    if args.model == 'visualbert':
        with open(f'{args.images_features}/images_features_dict.pkl',
                  'rb') as f:
            images_features_dict = pickle.load(f)
    else:
        images_features_dict = None

    if args.model == 'visualbert':
        config = BertConfig.from_pretrained('bert-base-uncased')
        tkz = BertTokenizer.from_pretrained('bert-base-uncased')
    else:
        config = RobertaConfig.from_pretrained('roberta-base')
        tkz = RobertaTokenizer.from_pretrained('roberta-base')

    print("train set")
    train_dataloader = create(data=train_set,
                              datatype='train',
                              batch_size=args.train_batch_size,
                              images_features_dict=images_features_dict,
                              tkz=tkz,
                              config=config)
    print("dev set")
    dev_dataloader = create(data=dev_set,
                            datatype='dev',
                            batch_size=args.dev_batch_size,
                            images_features_dict=images_features_dict,
                            tkz=tkz,
                            config=config)
Esempio n. 28
        cluster_flag = False
        pass

    if cluster_flag:
        uncompress_object(args.pretrained, ".")
        train_df = pd.read_csv(args.traindata)
        test_df = pd.read_csv(args.testdata)
    else:
        print("local file reading")
        train_df = pd.read_csv('notebooks/files/unlabel_train1.csv')
        test_df = pd.read_csv('notebooks/files/unlabel_test1.csv')

    Num_label = len(train_df.label_id.value_counts())

    device = torch.device(args.device)
    tokenizer = RobertaTokenizer.from_pretrained("./pretrained",
                                                 do_lower_case=False)
    model = TransferRobertaNet(path="./pretrained",
                               embedding_dim=768,
                               num_class=Num_label,
                               num_class1=args.classes)

    criterion = FocalLoss(alpha=0.97, reduce=True)
    model.to(device)
    criterion.to(device)

    optimizer = Adam(model.parameters(), lr=0.00008)
    if args.scheduler == "cosine":
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                   T_max=10,
                                                   eta_min=0)
    else:
    parser.add_argument("--t_model",
                        default="MIXED",
                        help="Type of trained model ('MRPC', 'MIXED')")
    # parser.add_argument("--f_model", default="MRPC_model.pkl", help="Trained model file")
    # parser.add_argument("--t_model", default="MRPC", help="Type of trained model ('MRPC', 'MIXED')")

    parser.add_argument("--max_seq_length",
                        default=128,
                        help="Max sequence length")
    args = parser.parse_args()

    # Set device
    device = set_device(args.device)

    # Define Tokenizer
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    # Define dataset and data loader
    raw_train_data = load_dataset('glue', 'mrpc', split='train')
    raw_val_data = load_dataset('glue', 'mrpc', split='validation')
    raw_test_data = load_dataset('glue', 'mrpc', split='test')

    # Define model manager
    manager = ModelManager(args.models_dir)

    # Define model
    if args.t_model == 'MRPC':
        model = ROBERTAOnMRPC()
    elif args.t_model == 'MIXED':
        model = ROBERTA_FT_MRPC(ROBERTAOnSTS())
    else:
        raise ("Expected 'MRPC' or 'MIXED', got '{}'".format(args.t_model))
Esempio n. 30
from transformers import RobertaTokenizer, RobertaModel
import os
import torch
import xml.etree.ElementTree as ET
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
from os import listdir
from os.path import isfile, join
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', unk_token='<unk>')
import spacy
nlp = spacy.load("en_core_web_sm")
#model = RobertaModel.from_pretrained('roberta-base')
space = ' '
#dir_name = "/shared/why16gzl/logic_driven/Quizlet/Quizlet_2/LDC2020E20_KAIROS_Quizlet_2_TA2_Source_Data_V1.0/data/ltf/ltf/"
#file_name = "K0C03N4LR.ltf.xml"    # Use ltf_reader
#dir_name = "/home1/w/why16gzl/KAIROS/hievents_v2/processed/"
#file_name = "article-10901.tsvx"   # Use tsvx_reader

# ============================
#         PoS Tagging
# ============================
pos_tags = [
    "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"
]
identity_matrix = np.identity(len(pos_tags))
postag_to_OneHot = {}
postag_to_OneHot["None"] = np.zeros(len(pos_tags))
for (index, item) in enumerate(pos_tags):