Example #1
    def __init__(self,
                 max_seq_len=MAX_LEN,
                 batch_size=BATCH_SIZE,
                 n_epochs=N_EPOCHS,
                 val_size=0.1,
                 learning_rate=LEARNING_RATE,
                 load_local_pretrained=False):

        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.val_size = val_size
        self.learning_rate = learning_rate

        # Load dataset, tokenizer, model from pretrained model/vocabulary
        self.tokenizer = (DistilBertTokenizerFast.from_pretrained(
            BERTMODEL, do_lower_case=False))

        if load_local_pretrained:
            self.model = (TFDistilBertForSequenceClassification.
                          from_pretrained(MODEL_PATH))

        else:
            config = DistilBertConfig.from_pretrained(BERTMODEL, num_labels=2)
            self.model = (
                TFDistilBertForSequenceClassification.from_pretrained(
                    BERTMODEL, config=config))
            # Freeze distilbert layer
            self.model.distilbert.trainable = False
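The constants referenced above (MAX_LEN, BATCH_SIZE, N_EPOCHS, LEARNING_RATE, BERTMODEL, MODEL_PATH) are defined elsewhere in the original module; the values below are assumptions. A minimal sketch of how a classifier built with this __init__ might be compiled and trained with Keras while the DistilBERT body stays frozen:

import tensorflow as tf

# Assumed module-level constants (illustrative values only, not the original ones)
MAX_LEN = 128
BATCH_SIZE = 32
N_EPOCHS = 3
LEARNING_RATE = 5e-5
BERTMODEL = "distilbert-base-cased"
MODEL_PATH = "./saved_distilbert_clf"

def train_classifier(clf, texts, labels):
    # `clf` is a hypothetical instance of the class whose __init__ is shown above
    enc = clf.tokenizer(texts, truncation=True, padding=True,
                        max_length=clf.max_seq_len, return_tensors="tf")
    clf.model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=clf.learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"])
    # Only the classification head updates, since distilbert.trainable is False
    clf.model.fit(dict(enc), tf.constant(labels),
                  batch_size=clf.batch_size, epochs=clf.n_epochs)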
Example #2
	def __load(self):
		dbertConf = DistilBertConfig.from_pretrained(self.path + '/config.json')
		self.model = TFDistilBertForSequenceClassification.from_pretrained(
			self.path + '/tf_model.h5',
			config=dbertConf,
		)
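__load() expects self.path to already contain the config.json and tf_model.h5 that save_pretrained writes for TF models. A short sketch of producing such a directory (the path is an assumption):

from transformers import TFDistilBertForSequenceClassification

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.save_pretrained('./my_distilbert')  # writes ./my_distilbert/config.json and tf_model.h5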
Example #3
 def load_model(self, model_name: str = "bert_ner_test"):
     # TODO model loaded from mlflow
     # Load model and tokenizer.
     config = DistilBertConfig.from_pretrained(model_name)
     model = DistilBertForTokenClassification.from_pretrained(
         model_name, config=config)
     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
     return model, config, tokenizer
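A hedged sketch of token-classification inference with the objects returned above. `loader` stands for whatever object owns load_model, the input sentence is arbitrary, and the label names depend on how bert_ner_test was trained:

import torch

model, config, tokenizer = loader.load_model("bert_ner_test")  # `loader` is hypothetical
model.eval()
inputs = tokenizer("Angela Merkel visited Paris", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]                 # (batch, seq_len, num_labels)
pred_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, label_id in zip(tokens, pred_ids):
    print(token, config.id2label.get(label_id, label_id))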
Example #4
 def model_load(self, path: str):
     config = DistilBertConfig.from_pretrained(path + "/config.json")
     tokenizer = DistilBertTokenizer.from_pretrained(
         path, do_lower_case=self.do_lower_case)
     model = DistilBertForQuestionAnswering.from_pretrained(path,
                                                            from_tf=False,
                                                            config=config)
     return model, tokenizer
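For reference, a minimal extractive-QA inference sketch using the pair returned above. `qa`, the path, and the question/context strings are placeholders, and a reasonably recent transformers release is assumed:

import torch

model, tokenizer = qa.model_load("./distilbert-squad")  # `qa` and the path are assumptions
model.eval()
question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    start_logits, end_logits = model(**inputs)[:2]
start = int(start_logits.argmax())
end = int(end_logits.argmax())
print(tokenizer.decode(inputs["input_ids"][0][start:end + 1]))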
Example #5
    def model_load(self, path):

        s3_model_url = 'https://distilbert-finetuned-model.s3.eu-west-2.amazonaws.com/pytorch_model.bin'
        path_to_model = download_model(s3_model_url, model_name="pytorch_model.bin")

        config = DistilBertConfig.from_pretrained(path + "/config.json")
        tokenizer = DistilBertTokenizer.from_pretrained(path, do_lower_case=self.do_lower_case)
        model = DistilBertForQuestionAnswering.from_pretrained(path_to_model, from_tf=False, config=config)

        return model, tokenizer
Example #6
def download_distilbert_base():
    file = '../input/distilbert-base-uncased'

    config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
    config.save_pretrained(file)
    
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model.save_pretrained(file)

    tkn = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tkn.save_pretrained(file)
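Once the snapshot is saved, the same folder can be loaded back without network access (a common pattern for offline Kaggle kernels). A short sketch, reusing the path from the function above:

from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizer

local_dir = '../input/distilbert-base-uncased'
config = DistilBertConfig.from_pretrained(local_dir)
tokenizer = DistilBertTokenizer.from_pretrained(local_dir)
model = DistilBertModel.from_pretrained(local_dir, config=config)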
Example #7
def build_model(args):
    if args.clf_model.lower() == "cnn":
        # tokenizer only used for text tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)

    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)

        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the transformer weights
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)

        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the weight for transformers
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False

    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)

    model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
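build_model() receives a single args object; the field names below are inferred from the function body and the values are assumptions. Note that expand_class_head() is a project-specific method, not part of the transformers API:

from argparse import Namespace
import torch

# A hypothetical argument namespace that build_model() appears to expect
args = Namespace(
    clf_model="distilbert",
    model_name_or_path="distilbert-base-uncased",
    do_lower_case=True,
    num_labels=2,
    task_name="sst-2",
    freeze=False,
    multi_head=1,   # consumed by the project-specific expand_class_head()
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
# tokenizer, model = build_model(args)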
Example #8
 def __init__(self, path=None, model_name=None):
     if path:
         self.model = DistilBertForSequenceClassification.from_pretrained(
             path)
         tokenizer_path = os.path.join(path, "model/")
         if os.path.exists(tokenizer_path):
             self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                 tokenizer_path)
         else:
             self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                 "distilbert-base-uncased")
     elif model_name:
         config = DistilBertConfig.from_pretrained(model_name,
                                                   return_dict=True,
                                                   num_labels=2)
         self.model = DistilBertForSequenceClassification.from_pretrained(
             model_name, config=config)
         self.tokenizer = DistilBertTokenizerFast.from_pretrained(
             model_name)
Example #9
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
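BERT_CONFIG_FILE is defined elsewhere in the original code. A plausible (assumed) shape for it is a plain dict mapping each supported bert_model_type to either a hub ID or a local config path, since from_pretrained accepts both:

# Assumed shape of the lookup table used above (values are illustrative, not the project's own)
BERT_CONFIG_FILE = {
    'bert-base-uncased': 'bert-base-uncased',
    'roberta-base': 'roberta-base',
    'xlnet-base-cased': 'xlnet-base-cased',
    'distilbert-base-uncased': '/path/to/distilbert-base-uncased/config.json',
    # ... one entry per supported bert_model_type
}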
Example #10
  def __init__(self, DatasetClass, weights_path = './.trained_models/frankie_encoder/ec.ckpt'):

    # transformer/model parameters are hardcoded due to usage of pre-trained weights
    self.max_seq_length = 64
    self.model_name = 'distilbert-base-uncased'

    self.tokenizer = DistilBertTokenizer.from_pretrained(
        self.model_name,
        do_lower_case=True,
        add_special_tokens=True,
        max_length=self.max_seq_length,
        pad_to_max_length=True
    )
    self.dataset = DatasetClass(self.tokenizer, self.max_seq_length)

    self.model_config = DistilBertConfig.from_pretrained(self.model_name)
    self.model_config.output_hidden_states = False
    self.model = self._create_sentence_transformer(input_shape=(self.max_seq_length,))
    self.model.load_weights(weights_path)
    print("Initialized Encoder Model")
Example #11
def returnRelevant(researchPaper, query, numSnippets=15):
    # Make sure these are downloaded before using
    config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                            config=config)
    relevantSnippets = []

    with open(researchPaper,
              encoding='utf8') as researchPaperCSV, torch.no_grad():
        researchPaperReader = csv.reader(researchPaperCSV)
        score_max_heap = []
        input_ids = torch.tensor(
            [tokenizer.encode(query, add_special_tokens=True, max_length=512)])
        output_tuple = model(input_ids)
        last_hidden_states = output_tuple[0]
        queryObj = last_hidden_states.mean(1)
        for snippet in researchPaperReader:
            if ('<EOS>' not in snippet):
                snippetStr = ' '.join([str(elem) for elem in snippet])
                # This implementation will reject snippets of longer than 512 tokens
                input_ids = torch.tensor([
                    tokenizer.encode(snippetStr,
                                     add_special_tokens=True,
                                     max_length=512)
                ])
                output_tuple = model(input_ids)
                last_hidden_states = output_tuple[0]
                snippetObj = last_hidden_states.mean(1)
                qs = QuerySnippet(query, snippet,
                                  similarity(queryObj, snippetObj))
                if len(score_max_heap
                       ) < numSnippets or qs.similarity > score_max_heap[
                           0].similarity:
                    if len(score_max_heap) == numSnippets:
                        heapq.heappop(score_max_heap)
                    heapq.heappush(score_max_heap, qs)
        for qs in score_max_heap:
            relevantSnippets.append(qs.snippet)
    return relevantSnippets
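The `similarity` helper and the `QuerySnippet` class come from elsewhere in the original module. A minimal cosine-similarity stand-in for the former, assuming the mean-pooled (1, hidden_size) vectors produced above:

import torch.nn.functional as F

def similarity(query_vec, snippet_vec):
    # cosine similarity between two (1, hidden_size) mean-pooled embeddings
    return F.cosine_similarity(query_vec, snippet_vec).item()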
Example #12
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH,
                                    f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH,
                                   f'pytorch_model_{bert_type}.bin')

    config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                              output_hidden_states=True)
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased', do_lower_case=do_lower_case)
    #     distil_model_bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
    #     if no_pretraining:
    #         pass
    #     else:
    #         distil_model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    #         print("Load pre-trained parameters.")
    model.to(device)

    return model, tokenizer, config
Example #13
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,  #64, 256
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on",
                        default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = DistilBertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = DistilBertConfig.from_pretrained(args.bert_model,
                                              num_labels=num_labels,
                                              finetuning_task=args.task_name)
    # print(config)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(
            model_config,
            open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               valid_ids=valid_ids,
                               attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        accuracy = accuracy_score(y_true, y_pred)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", accuracy)
            writer.write(str(accuracy))
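A hedged sketch of reloading what the --do_train branch saves into args.output_dir. `Ner` is the project-specific model class used in the script above, and the output path is an assumption:

import json
import os

from transformers import DistilBertTokenizer

output_dir = "./out_ner"   # assumption: the value passed as --output_dir
model = Ner.from_pretrained(output_dir)
tokenizer = DistilBertTokenizer.from_pretrained(output_dir, do_lower_case=True)
with open(os.path.join(output_dir, "model_config.json")) as f:
    model_config = json.load(f)
# json stores the int keys of label_map as strings, so convert them back
label_map = {int(k): v for k, v in model_config["label_map"].items()}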
Example #14
    class_loss = nn.CrossEntropyLoss()(class_preds, class_labels)
    return start_loss + end_loss + class_loss

#['LONG', 'NO', 'SHORT', 'UNKNOWN', 'YES']

def loss_fn_classifier(preds, labels):
    _, _, class_preds = preds
    _, _, class_labels = labels

    class_weights = [1.0, 1.0, 1.0, 0.6, 1.0]
    class_weights = torch.FloatTensor(class_weights).cuda()
    class_loss = nn.CrossEntropyLoss(class_weights)(class_preds, class_labels)

    return class_loss
# RekhaDist
config = DistilBertConfig.from_pretrained('distilbert-base-uncased-distilled-squad')
config.num_labels = 5
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', config=config)

model = model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
num_train_optimization_steps = int(n_epochs * train_size / batch_size / accumulation_steps)
print('num_train_optimization_steps=', num_train_optimization_steps)
num_warmup_steps = int(num_train_optimization_steps * warmup)
print('num_warmup_steps', num_warmup_steps)
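A minimal sketch of what the step counts above usually feed into, matching the grouped-parameter list already built; the learning rate value is an assumption:

import torch
from transformers import get_linear_schedule_with_warmup

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_optimization_steps)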
Example #15
    batch_size = 32
    learning_rate = 1e-06
    max_epochs = 100
    alpha = 0.1  # smoothing parameter for the true label
    # /PARAMETERS

    # create log file
    data_folder = '../../data/from-figure-eight/balanced-test-data/tobert/'
    res_path = '../../res/'
    res_path += logfile_name
    with open(res_path, 'w') as f:
        c = 'epoch, iter, loss_train, loss_val, pre_val, rec_val, f01_val, f1_val, f10_val, ece_val'
        f.write(c + '\n')

    # configure DistilBERT model
    config = DistilBertConfig.from_pretrained('distilbert-base-cased')
    config.num_labels = num_labels
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    model = DistilBertForSequenceClassification(config)
    # load model to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()

    # load datasets
    train_dataset = pd.read_csv(data_folder + train_file)
    val_dataset = pd.read_csv(data_folder + val_file)
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("VAL Dataset: {}".format(val_dataset.shape))
    training_set = DataLoaderSmoothing(train_dataset, alpha)
    validating_set = DataLoaderHard(val_dataset)
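Note that DistilBertForSequenceClassification(config) above builds a model with randomly initialized weights. If the intent is to fine-tune from the pretrained checkpoint instead, the usual pattern (a hedged sketch) would be:

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', config=config)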
Example #16
                    '-m',
                    type=int,
                    default=512,
                    help='maximum length handled by the model')

args = parser.parse_args()

usecfg = False
if usecfg:
    from transformers import (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    )
    config = DistilBertConfig.from_pretrained(args.model_name,
                                              finetuning_task='sentiment3',
                                              num_labels=3)
    model = DistilBertForSequenceClassification.from_pretrained(
        args.model_name, config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
else:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

model.to("cpu")
model.eval()

classes = ["0", "1", "2"]
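A short inference sketch with the objects prepared above; the example sentence is arbitrary and args.m comes from the '-m' argument defined earlier:

import torch

inputs = tokenizer("the service was surprisingly good", return_tensors="pt",
                   truncation=True, max_length=args.m)
with torch.no_grad():
    logits = model(**inputs)[0]
print(classes[int(logits.argmax(dim=-1))])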
Example #17
def main():

    ntasks = len(tasks)
    
    data_args = list()
    configuration = list()
    sub_models = list()
    train_iter = list()
    dev_iter = list()
    test_iter = list()
    sub_optimizer = list()
    metrics = list()
    tokenizer = DistilBertTokenizer.from_pretrained(bert_path, cache_dir=cache_dir)
    
    for i in range(ntasks):    
        logger.info("Tasks:" + tasks[i])
        data_args.append(GlueDataArgs(task_name=tasks[i]))
        configuration.append(DistilBertConfig.from_pretrained(bert_path, num_labels=glue_tasks_num_labels[data_args[i].task_name], 
                                finetuning_task=data_args[i].task_name, cache_dir = cache_dir))
        if use_gpu:
            sub_models.append(SequenceClassification(configuration[i]).cuda())
        else: 
            sub_models.append(SequenceClassification(configuration[i]))
            
        train_iter.append(DataIterator(data_args[i], tokenizer=tokenizer, mode="train", cache_dir=cache_dir, batch_size=batch_size[i]))
        dev_iter.append(DataIterator(data_args[i], tokenizer=tokenizer, mode="dev", cache_dir=cache_dir, batch_size=batch_size_val[i]))
        
        sub_optimizer.append(torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate))
        
        metrics.append(ComputeMetrics(data_args[i]))
        
        logger.info("*** DataSet Ready ***")
    
    if use_gpu:
        Bert_model = DistilBertModel.from_pretrained(bert_path, return_dict=True).cuda()
    else:
        Bert_model = DistilBertModel.from_pretrained(bert_path, return_dict=True)
    
    bert_optimizer = torch.optim.AdamW(Bert_model.parameters(), lr=learning_rate)
    
    
    # balanced dataset
    train_num = list()    
    for i in range(ntasks):
        train_num.append(len(train_iter[i]))
    #train_nummax = 
    #train_num = [x/train_nummax for x in train_num]
    #print(train_num)
    iterations = (epochs * max(train_num) // bs) + 1
    #print(iterations)
    
    sub_scheduler = list()
    for i in range(ntasks):
        sub_scheduler.append(torch.optim.lr_scheduler.LambdaLR(sub_optimizer[i], lambda step: (1.0-step/iterations)))    
    Bert_scheduler = torch.optim.lr_scheduler.LambdaLR(bert_optimizer, lambda step: (1.0-step/iterations))
    
    
    for i in range(1, iterations+1):
        
        
        if i > frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
            
        else:
            for p in Bert_model.parameters():
                p.requires_grad = False
            Bert_model.eval()
        
        losses=list()
        for j in range(ntasks):
            sub_models[j].train()
            data = train_iter[j].next()
            
            if use_gpu:
                input_ids=data['input_ids'].cuda()
                attention_mask=data['attention_mask'].cuda()
                #token_type_ids=data['token_type_ids'].cuda()
                label=data['labels'].cuda()
            else:
                input_ids=data['input_ids']
                attention_mask=data['attention_mask']
                #token_type_ids=data['token_type_ids']
                label=data['labels']
                
            output_inter = Bert_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True) # token_type_ids=token_type_ids,
            losses.append(sub_models[j](input=output_inter, labels=label)[0])
   
        
        loss = 0
        printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(i, iterations, Bert_scheduler.get_lr())
        for j in range(ntasks):
            loss += losses[j] * batch_size[j]
            printInfo += ', loss{}-{:.6f}'.format(j,losses[j])
            sub_optimizer[j].zero_grad()
            
        logging.info(printInfo) 
        
        if i > frozen:
            bert_optimizer.zero_grad()
        loss.backward()
        
        if i > frozen:
            bert_optimizer.step()
            
        for j in range(ntasks):
            sub_optimizer[j].step()
            sub_scheduler[j].step()
        
        if i > frozen:
            Bert_scheduler.step()
        
        if (i % eval_interval == 0):
            for j in range(ntasks):
                evaluate(Bert_model, sub_models[j], dev_iter[j], batch_size_val[j], metrics[j])
                sub_models[j].save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format(tasks[j], i)))
            Bert_model.save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format("main", i)))
    
    
    for i in range(ntasks):
        evaluate(Bert_model, sub_models[i], dev_iter[i], batch_size_val[i], metrics[i])
        sub_models[i].save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format(tasks[i], iterations)))
            
    Bert_model.save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format("main", iterations)))    
Example #18
def main():
    """
    main function for conducting Subtask C. Parameters are parsed with argparse.
    Language model should be suitable for German e.g.:
        'bert-base-multilingual-uncased', 
        'bert-base-multilingual-cased',              
        'bert-base-german-cased', 
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    ############################ variable settings #################################
    parser = argparse.ArgumentParser(
        description=
        'Run Subtask C of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model',
                        type=str,
                        default='bert-base-german-dbmdz-uncased',
                        help='The pre-trained language model.')
    parser.add_argument('--epochs',
                        type=int,
                        default=4,
                        help='Number of epochs for training.')
    parser.add_argument('--lr',
                        type=float,
                        default=5e-5,
                        help='The learning rate.')
    parser.add_argument('--max_len',
                        type=int,
                        default=256,
                        help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Your train set batch size.')
    parser.add_argument('--df_path',
                        type=str,
                        default='./data/',
                        help='The data directory.')
    parser.add_argument('--train_data',
                        type=str,
                        default='train_df_cat.tsv',
                        help='The filename of the input train data.')
    parser.add_argument('--dev_data',
                        type=str,
                        default='dev_df_cat.tsv',
                        help='The filename of the input development data.')
    parser.add_argument(
        '--test_data1',
        type=str,
        default='test_syn_df_cat.tsv',
        help='The filename of the first input test data (synchronic).')
    parser.add_argument(
        '--test_data2',
        type=str,
        default='test_dia_df_cat.tsv',
        help='The filename of the second input test data (diachronic).')
    parser.add_argument(
        '--output_path',
        type=str,
        default='./output/subtaskC/',
        help='The output directory of the model and predictions.')
    parser.add_argument("--train",
                        default=True,
                        action="store_true",
                        help="Flag for training.")
    parser.add_argument("--save_prediction",
                        default=False,
                        action="store_true",
                        help="Flag for saving predictions.")
    parser.add_argument("--save_cr",
                        default=False,
                        action="store_true",
                        help="Flag for saving confusion matrix.")
    parser.add_argument("--exclude_general",
                        default=False,
                        action="store_true",
                        help="Flag for excluding category Allgemein.")
    parser.add_argument("--exclude_neutral",
                        default=False,
                        action="store_true",
                        help="Flag for excluding neutral polarity.")
    parser.add_argument("--exclude_general_neutral",
                        default=False,
                        action="store_true",
                        help="Flag for excluding category Allgemein:neutral.")
    args = parser.parse_args()
    ################################################################################
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t')
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t')

    # Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True

    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model,
                                                  do_lower_case=lower_case,
                                                  max_length=args.max_len)

    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.lang_model, do_lower_case=lower_case, max_length=args.max_len)

    # get training features
    cats = train_df.columns[5:]
    end = "full"
    # exclude categories if required
    if (args.exclude_general):
        cats = [i for i in list(cats) if "Allgemein" not in i]
        end = "excl_gen"
    if (args.exclude_neutral):
        cats = [i for i in list(cats) if "neutral" not in i]
        end = "excl_neu"
    if (args.exclude_general_neutral):
        cats = [i for i in list(cats) if "Allgemein:neutral" not in i]
        end = "excl_genneu"

    num_labels = len(list(cats))

    # create one hot labels
    train_df['one_hot_labels'] = list(train_df[list(cats)].values)
    dev_df['one_hot_labels'] = list(dev_df[list(cats)].values)
    test_syn_df['one_hot_labels'] = list(test_syn_df[list(cats)].values)
    test_dia_df['one_hot_labels'] = list(test_dia_df[list(cats)].values)

    # retrieve sentences and labels
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    labels = list(df.one_hot_labels.values)

    sentences_syn = test_syn_df.text.values
    labels_syn = list(test_syn_df.one_hot_labels.values)

    sentences_dia = test_dia_df.text.values
    labels_dia = list(test_dia_df.one_hot_labels.values)

    print("number of categories:", len(list(cats)))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [
        tokenizer.encode(sent,
                         add_special_tokens=True,
                         truncation=True,
                         max_length=args.max_len) for sent in sentences
    ]
    input_ids = pad_sequences(input_ids,
                              maxlen=args.max_len,
                              dtype="long",
                              value=0.0,
                              truncating="post",
                              padding="post")
    # Create attention masks
    attention_masks = [[int(token_id > 0) for token_id in sent]
                       for sent in input_ids]

    # synchronic test data
    input_ids_syn = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True)
        for sent in sentences_syn
    ]
    input_ids_syn = pad_sequences(input_ids_syn,
                                  maxlen=args.max_len,
                                  dtype="long",
                                  value=0.0,
                                  truncating="post",
                                  padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent]
                           for sent in input_ids_syn]

    # diachronic test data
    input_ids_dia = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True)
        for sent in sentences_dia
    ]
    input_ids_dia = pad_sequences(input_ids_dia,
                                  maxlen=args.max_len,
                                  dtype="long",
                                  value=0.0,
                                  truncating="post",
                                  padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent]
                           for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensor
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)

    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)

    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)

    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)
    test_syn_labels = torch.tensor(labels_syn)

    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)
    test_dia_labels = torch.tensor(labels_dia)

    # Create the DataLoader
    train_dataloader = create_dataloader(train_inputs,
                                         train_masks,
                                         train_labels,
                                         args.batch_size,
                                         train=True)

    dev_dataloader = create_dataloader(dev_inputs,
                                       dev_masks,
                                       dev_labels,
                                       args.batch_size,
                                       train=False)

    test_syn_dataloader = create_dataloader(test_syn_inputs,
                                            test_syn_masks,
                                            test_syn_labels,
                                            args.batch_size,
                                            train=False)

    test_dia_dataloader = create_dataloader(test_dia_inputs,
                                            test_dia_masks,
                                            test_dia_labels,
                                            args.batch_size,
                                            train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model,
                                                num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)

        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model,
                                                      num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)
        model.cuda()

        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)
        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        # train model
        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:",
              args.lr)
        print()

        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())

            model, optimizer, scheduler, tr_loss = train_multilabel(
                train_dataloader=train_dataloader,
                model=model,
                device=device,
                optimizer=optimizer,
                scheduler=scheduler,
                num_labels=num_labels)
            # EVALUATION: TRAIN SET
            pred_bools_train, true_bools_train, f1_train = eval_multilabel(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.3f" % (f1_train))

            # EVALUATION: DEV SET
            pred_bools_dev, true_bools_dev, f1_dev = eval_multilabel(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.3f" % (f1_dev))

        print("  Training and validation took in total: {:}".format(
            format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        pred_bools_syn, true_bools_syn, f1_test_syn = eval_multilabel(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % (f1_test_syn))

        # classification report
        clf_report_syn = classification_report(true_bools_syn,
                                               pred_bools_syn,
                                               target_names=cats,
                                               digits=3)
        print(clf_report_syn)

        # EVALUATION: TEST DIA SET
        pred_bools_dia, true_bools_dia, f1_test_dia = eval_multilabel(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % (f1_test_dia))

        # classification report
        clf_report_dia = classification_report(true_bools_dia,
                                               pred_bools_dia,
                                               target_names=cats,
                                               digits=3)
        print(clf_report_dia)

        if args.save_cr:
            pickle.dump(
                clf_report_syn,
                open(
                    args.output_path + 'clf_report_' + args.lang_model +
                    '_test_syn_' + str(num_labels) + end + '.txt', 'wb'))
            pickle.dump(
                clf_report_dia,
                open(
                    args.output_path + 'clf_report_' + args.lang_model +
                    '_test_dia_' + str(num_labels) + end + '.txt', 'wb'))

        if args.save_prediction:
            test_syn_df["category_pred"] = pred_bools_syn
            test_dia_df["category_pred"] = pred_bools_dia
            test_syn_df.category_pred.to_csv(args.output_path +
                                             args.lang_model + '_test_syn_' +
                                             str(num_labels) + end + ".tsv",
                                             sep="\t",
                                             index=False,
                                             header=True,
                                             encoding="utf-8-sig")
            test_dia_df.category_pred.to_csv(args.output_path +
                                             args.lang_model + '_test_dia_' +
                                             str(num_labels) + end + ".tsv",
                                             sep="\t",
                                             index=False,
                                             header=True,
                                             encoding="utf-8-sig")
Example #19
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
granola_ids = tokenizer.encode('granola bars')

# Print the IDs
print('granola_ids', granola_ids)
print('type of granola_ids', type(granola_ids))
print('granola_tokens', tokenizer.convert_ids_to_tokens(granola_ids))

# Convert the list of IDs to a tensor of IDs
granola_ids = torch.LongTensor(granola_ids)
# Print the IDs
print('granola_ids', granola_ids)
print('type of granola_ids', type(granola_ids))

config = DistilBertConfig.from_pretrained(model_name,
                                          output_hidden_states=True)
model = DistilBertModel.from_pretrained(model_name, config=config)
# Set the device to GPU (cuda) if available, otherwise stick with CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = model.to(device)
granola_ids = granola_ids.to(device)

model.eval()

print(granola_ids.size())
# unsqueeze IDs to get batch size of 1 as added dimension
granola_ids = granola_ids.unsqueeze(0)
print(granola_ids.size())

print(type(granola_ids))
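
The snippet stops before the forward pass; a minimal continuation, assuming the model and granola_ids prepared above, showing what output_hidden_states=True yields:

# Forward pass without gradient tracking; with output_hidden_states=True the outputs
# also contain one hidden-state tensor per layer in addition to the final layer output.
with torch.no_grad():
    outputs = model(input_ids=granola_ids)

hidden_states = outputs[-1]   # tuple of 7 tensors for DistilBERT: embedding output + 6 transformer layers
print('number of hidden-state tensors:', len(hidden_states))
print('shape of each tensor:', hidden_states[0].size())   # (batch_size, sequence_length, 768)

# Stack them into a single (layers, batch, tokens, hidden_dim) tensor for convenience
stacked = torch.stack(hidden_states, dim=0)
print('stacked size:', stacked.size())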
Ejemplo n.º 20
0
def main():

    ntasks = len(tasks)

    data_args = list()
    configuration = list()
    sub_models = list()
    datasets = list()
    # train_iter = list()
    # dev_iter = list()
    # test_iter = list()
    sub_optimizer = list()
    metrics = list()
    tokenizer = DistilBertTokenizer.from_pretrained(bert_path,
                                                    cache_dir=cache_dir)

    for i in range(ntasks):
        logger.info("Tasks:" + tasks[i])
        data_args.append(GlueDataArgs(task_name=tasks[i]))
        configuration.append(
            DistilBertConfig.from_pretrained(
                bert_path,
                num_labels=glue_tasks_num_labels[tasks[i].lower()],
                finetuning_task=data_args[i].task_name,
                cache_dir=cache_dir))
        if use_gpu:
            sub_models.append(SequenceClassification(configuration[i]).cuda())
        else:
            sub_models.append(SequenceClassification(configuration[i]))

        datasets.append(
            GlueDataSets(data_args[i],
                         tokenizer=tokenizer,
                         cache_dir=cache_dir))
        sub_optimizer.append(
            torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate_0))
        metrics.append(ComputeMetrics(data_args[i]))
        logger.info("*** DataSet Ready ***")

    if use_gpu:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True).cuda()
    else:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True)

    bert_optimizer = torch.optim.AdamW(Bert_model.parameters(),
                                       lr=learning_rate_0)

    # balanced dataset
    train_num = list()
    for i in range(ntasks):
        train_num.append(datasets[i].length("train"))
    #train_nummax =
    #train_num = [x/train_nummax for x in train_num]
    print(train_num)
    iterations = (epochs * max(train_num) // bs) + 1
    #print(iterations)

    sub_scheduler = list()
    for i in range(ntasks):
        sub_scheduler.append(
            torch.optim.lr_scheduler.LambdaLR(
                sub_optimizer[i], lambda step: (1.0 - step / iterations))
        )  #if step <= frozen else learning_rate_1)
    Bert_scheduler = torch.optim.lr_scheduler.LambdaLR(
        bert_optimizer, lambda step:
        (1.0 - step / iterations))  # if step <= frozen else learning_rate_1

    # datasets[i].dataloader("train", batch_size_train[i])
    train_iter = list()
    for i in range(ntasks):
        train_iter.append(
            GlueIterator(datasets[i].dataloader("train", batch_size_train[i])))

    for i in range(1, iterations + 1):

        if i > frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
        elif i == frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
            logging.info("#####################################")
            logging.info("Release the Traing of the Main Model.")
            logging.info("#####################################")
        else:
            for p in Bert_model.parameters():
                p.requires_grad = False
            Bert_model.eval()

        losses = list()
        loss_rates = list()

        for j in range(ntasks):
            sub_models[j].train()
            data = train_iter[j].next()

            if use_gpu:
                input_ids = data['input_ids'].cuda()
                attention_mask = data['attention_mask'].cuda()
                #token_type_ids=data['token_type_ids'].cuda()
                label = data['labels'].cuda()
            else:
                input_ids = data['input_ids']
                attention_mask = data['attention_mask']
                #token_type_ids=data['token_type_ids']
                label = data['labels']

            output_inter = Bert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=True)  # token_type_ids=token_type_ids,
            losses.append(sub_models[j](input=output_inter, labels=label)[0])

        losssum = sum(losses).item()
        for j in range(ntasks):
            loss_rates.append(losses[j].item() / losssum)

        loss = 0
        printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(i, iterations,
                                                      Bert_scheduler.get_lr())
        for j in range(ntasks):
            loss += losses[j] * batch_size_train[j]  # * loss_rates[j]
            printInfo += ', loss{}-{:.6f}'.format(j, losses[j])
            sub_optimizer[j].zero_grad()

        logging.info(printInfo)

        if i > frozen:
            bert_optimizer.zero_grad()
        loss.backward()

        if i > frozen:
            bert_optimizer.step()

        for j in range(ntasks):
            sub_optimizer[j].step()
            # sub_scheduler[j].step()

        # Bert_scheduler.step()

        if (i % eval_interval == 0):
            evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics,
                     ntasks)
            save_models(Bert_model, sub_models, ntasks, i)

    evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics, ntasks)
    save_models(Bert_model, sub_models, ntasks, iterations)
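
GlueIterator is not defined in this snippet; a minimal sketch of what such a wrapper typically looks like, given how .next() is called once per task per iteration above (class name and behaviour are assumptions): it cycles over a finite DataLoader indefinitely so each task can contribute a batch at every step.

class GlueIterator:
    """Hypothetical infinite iterator over a DataLoader, restarted whenever it is exhausted."""

    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(dataloader)

    def next(self):
        try:
            return next(self.iterator)
        except StopIteration:
            # This task's epoch ended: start a fresh pass over its data
            self.iterator = iter(self.dataloader)
            return next(self.iterator)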
Ejemplo n.º 21
0
def training_model(args):
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    config = DistilBertConfig.from_pretrained(TRANSFORMER_MODEL, num_labels=97 + 1)
    tokenizer = DistilBertTokenizer.from_pretrained(TRANSFORMER_MODEL)
    model = DistilBertForSequenceClassification.from_pretrained(TRANSFORMER_MODEL, config=config)
    model.to(args.device)
    etl = ETL(env.DB_FILE, env.SCHEMA_FILE)
    complaints_users = etl.load_query(SQL_QUERY_STRING)
    features = convert_examples_to_features(
        complaints_users[[COMPLAINT_TEXT, LABEL]].to_dict(orient='records'),
        max_length=128,
        tokenizer=tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    # DistilBERT has no token type embeddings; this tensor is never fed to the model below and is
    # kept only so that the labels stay at index 3 of the TensorDataset.
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    train_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = len(train_dataloader) * args.num_train_epochs  # total optimization steps = batches per epoch * number of epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss.backward()

            tr_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            logs = {}
            if args.evaluate_during_training:
                results = evaluate(args, model, tokenizer)
                for key, value in results.items():
                    eval_key = 'eval_{}'.format(key)
                    logs[eval_key] = value

            loss_scalar = (tr_loss - logging_loss) / args.logging_steps
            learning_rate_scalar = scheduler.get_lr()[0]
            logs['learning_rate'] = learning_rate_scalar
            logs['loss'] = loss_scalar
            logging_loss = tr_loss

    model.eval()

    # Creating the trace
    dummy_all_input_ids = torch.tensor([f.input_ids for f in features[0:1]], dtype=torch.long).to(args.device)
    dummy_all_attention_mask = torch.tensor([f.attention_mask for f in features[0:1]], dtype=torch.long).to(args.device)

    traced_model = torch.jit.trace(model, [dummy_all_input_ids, dummy_all_attention_mask])
    torch.jit.save(traced_model, "traced_bert.pt")
    tokenizer.save_pretrained('tokenizer')
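
A minimal sketch, assuming the trace above succeeded and produced 'traced_bert.pt' plus the 'tokenizer' directory, of how the TorchScript model could be loaded back for inference (the complaint text and a transformers version with a callable tokenizer are assumptions):

import torch
from transformers import DistilBertTokenizer

traced_model = torch.jit.load("traced_bert.pt")
tokenizer = DistilBertTokenizer.from_pretrained("tokenizer")

# Hypothetical complaint text, padded to the fixed length 128 used when tracing
encoded = tokenizer("My card was charged twice for the same purchase.",
                    max_length=128, padding="max_length",
                    truncation=True, return_tensors="pt")

with torch.no_grad():
    logits = traced_model(encoded["input_ids"], encoded["attention_mask"])[0]
print(int(logits.argmax(dim=-1)))   # predicted label index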
Ejemplo n.º 22
0
def main():
    """
    main function for conducting Subtask A. Parameters are parsed with argparse.
    Language model should be suitable for German e.g.:
        'bert-base-multilingual-uncased', 
        'bert-base-multilingual-cased',              
        'bert-base-german-cased', 
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    ############################ variable settings #################################
    parser = argparse.ArgumentParser(description='Run Subtask A or B of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--task', type=str, default='A', help="The task you want to conduct ('A' or 'B').")
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')    
    parser.add_argument('--train_data', type=str, default='train_df.tsv', help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df.tsv', help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df.tsv', help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df.tsv', help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskA/', help='The output directory of the model and predictions.')
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--save_prediction", default=True, action="store_true", help="Flag for saving predictions.")
    args = parser.parse_args()

    ################################################################################
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter = '\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter = '\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter = '\t')
    test_syn_df = test_syn_df.dropna(subset = ["text"])    
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter = '\t')
    
    # Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True

    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    
    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    
    # get training features
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    sentences_syn = test_syn_df.text.values    
    sentences_dia = test_dia_df.text.values
    
    if args.task == 'A':
        class_list = [False, True]
        df['relevance_label'] = df.apply(lambda x:  class_list.index(x['relevance']), axis = 1)
        labels = df.relevance_label.values
        test_syn_df['relevance_label'] = test_syn_df.apply(lambda x:  class_list.index(x['relevance']), axis = 1)
        labels_syn = test_syn_df.relevance_label.values
        test_dia_df['relevance_label'] = test_dia_df.apply(lambda x:  class_list.index(x['relevance']), axis = 1)
        labels_dia = test_dia_df.relevance_label.values

    if args.task == 'B':
        class_list = ["negative", "neutral", "positive"]
        df['sentiment_label'] = df.apply(lambda x:  class_list.index(x['sentiment']), axis = 1)
        labels = df.sentiment_label.values
        test_syn_df['sentiment_label'] = test_syn_df.apply(lambda x:  class_list.index(x['sentiment']), axis = 1)
        labels_syn = test_syn_df.sentiment_label.values
        test_dia_df['sentiment_label'] = test_dia_df.apply(lambda x:  class_list.index(x['sentiment']), axis = 1)
        labels_dia = test_dia_df.sentiment_label.values
    
    num_labels = len(set(labels))
    
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, 
                                  max_length=args.max_len) for sent in sentences]
    input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long", 
                          value=0.0, truncating="post", padding="post")
    # Create attention masks
    attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]

    # synchronic test data
    input_ids_syn = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_syn]
    input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long", 
                          value=0.0, truncating="post", padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]
    
    # diachronic test data
    input_ids_dia = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_dia]
    input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long", 
                          value=0.0, truncating="post", padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensor
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)

    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)

    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)

    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_labels = torch.tensor(labels_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)

    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_labels = torch.tensor(labels_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)

    # Create the DataLoader
    train_dataloader = create_dataloader(train_inputs, train_masks, 
                                     train_labels, args.batch_size, train=True)

    dev_dataloader = create_dataloader(dev_inputs, dev_masks, 
                                   dev_labels, args.batch_size, train=False)

    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, 
                                        test_syn_labels, args.batch_size, 
                                        train=False)

    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, 
                                        test_dia_labels, args.batch_size, 
                                        train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels)   
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels = num_labels,
                output_attentions = False,
                output_hidden_states = False
            )

        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            # Note: DistilBertConfig exposes dropout/seq_classif_dropout rather than hidden_dropout_prob,
            # and this config object is not passed to from_pretrained below, so the assignment has no effect.
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels = num_labels,
                output_attentions = False,
                output_hidden_states = False
            )
        model.cuda()


        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=1e-8
        )

        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
    
        # train model
        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr)
        print()

        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i"%epoch, dt.datetime.now())

            model, optimizer, scheduler, tr_loss = train(
                train_dataloader, 
                model=model, 
                device=device, 
                optimizer=optimizer, 
                scheduler=scheduler
            )
            # EVALUATION: TRAIN SET
            true_bools_train, pred_bools_train, f1_train = eval(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.4f"%(f1_train)) # here: same as accuracy
            print(confusion_matrix(true_bools_train,pred_bools_train))
            
            # EVALUATION: DEV SET
            true_bools_dev, pred_bools_dev, f1_dev = eval(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.4f"%(f1_dev))
            print(confusion_matrix(true_bools_dev,pred_bools_dev))
        

        print("  Training and validation took in total: {:}".format(format_time(time.time()-track_time)))

        # EVALUATION: TEST SYN SET
        true_bools_syn, pred_bools_syn, f1_test_syn = eval(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f"%(f1_test_syn))
        print(confusion_matrix(true_bools_syn,pred_bools_syn))

        # EVALUATION: TEST DIA SET
        true_bools_dia, pred_bools_dia, f1_test_dia = eval(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f"%(f1_test_dia))
        print(confusion_matrix(true_bools_dia, pred_bools_dia))

        if args.save_prediction:
            if args.task == 'A':
                test_syn_df["relevance_pred"] = pred_bools_syn
                test_dia_df["relevance_pred"] = pred_bools_dia
            if args.task == 'B':                
                test_syn_df["sentiment_pred"] = pred_bools_syn
                test_dia_df["sentiment_pred"] = pred_bools_dia
            
            test_syn_df.to_csv(args.output_path+args.lang_model+"_eval_test_syn.tsv", sep="\t", index = False, 
                header = True, encoding = "utf-8-sig")
            test_dia_df.to_csv(args.output_path+args.lang_model+"_eval_test_dia.tsv", sep="\t", index = False, 
                header = True, encoding = "utf-8-sig")
Ejemplo n.º 23
0
fig, ax = plt.subplots(1,2, figsize=(8,4))
ax = ax.flatten()
_ = plot_pr(y_test, y_pred, ax=ax[0],label="Naive Bayes")
_ = plot_roc(y_test, y_pred, ax=ax[1],label="Naive Bayes")


# #  Model 9 - BERT

# In[14]:


from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
torch.backends.cudnn.benchmark = True

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")


# First of all let's check possible max length

# In[ ]:


def len_tokens(s):
  return len(s.split())

df = df_reviews_train.copy()
df["len"] = np.vectorize(len_tokens)(df["review_norm"])
print("max len is: {}".format(max(list(map(len, tokenizer.batch_encode_plus(df.sort_values(by="len", ascending=False)[:1]["review_norm"].to_list())["input_ids"])))))
sns.displot(df["len"])
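
With the length distribution in hand, reviews can then be encoded with an explicit cap; a minimal sketch, assuming a transformers version with the padding/truncation keyword arguments and a cutoff of 256 tokens chosen from the plot above (DistilBERT accepts at most 512):

MAX_LENGTH = 256   # assumed cutoff based on the length distribution

encoded = tokenizer.batch_encode_plus(
    df["review_norm"].to_list()[:8],   # small sample for illustration
    max_length=MAX_LENGTH,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
print(encoded["input_ids"].shape)        # torch.Size([8, 256])
print(encoded["attention_mask"].shape)   # torch.Size([8, 256])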
Ejemplo n.º 24
0
def train(argv=None):
    """
    A function that re-trains BERT for sentiment analysis.
    """
    _set_config()

    num_labels = len(glue_processors[FLAGS.task]().get_labels())
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

    # Load dataset via TensorFlow Datasets
    data, info = tensorflow_datasets.load(f'glue/{_get_tfds_task(FLAGS.task)}', with_info=True)
    train_examples = info.splits['train'].num_examples

    # MNLI expects either validation_matched or validation_mismatched
    valid_examples = info.splits['validation'].num_examples

    # Prepare dataset for GLUE as a tf.data.Dataset instance
    train_dataset = glue_convert_examples_to_features(data['train'],
                                                      tokenizer,
                                                      FLAGS.max_length,
                                                      FLAGS.task)

    # MNLI expects either validation_matched or validation_mismatched
    valid_dataset = glue_convert_examples_to_features(data['validation'],
                                                      tokenizer,
                                                      FLAGS.max_length,
                                                      FLAGS.task)
    train_dataset = train_dataset.shuffle(FLAGS.buffer_size).batch(FLAGS.batch_size).repeat(-1)
    valid_dataset = valid_dataset.batch(FLAGS.batch_size * 2)

    # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
    opt = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate, epsilon=FLAGS.epsilon)
    if FLAGS.use_amp:
        # loss scaling is currently required when using mixed precision
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

    if num_labels == 1:
        loss = tf.keras.losses.MeanSquaredError()
    else:
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model_path = f'./{_get_tfds_task(FLAGS.task)}/'

    model.compile(optimizer=opt, loss=loss, metrics=[metric])

    if FLAGS.evaluate:
        print('Model summary:')
        print(model.summary())
        print('Evaluating on the training dataset...')
        model.evaluate(train_dataset, verbose=2, steps=int(_get_train_length(FLAGS.task) / FLAGS.batch_size))
        print('Evaluating on the validation dataset...')
        model.evaluate(valid_dataset, verbose=2)
        return

    if os.path.exists(model_path + 'tf_model.h5') and not FLAGS.force_train:
        print(f'Model in {model_path} already exists. Skipping training. ' + \
              'If you would like to force a re-train, set the force_train flag.')
        local_vars = locals()
        for variable in local_vars:
            if not variable.startswith('_'):  # skip underscore-prefixed names; identifiers can never start with '-'
                print(f'{variable}:\t{local_vars[variable]}')
        return

    # Train and evaluate using tf.keras.Model.fit()
    train_steps = train_examples // FLAGS.batch_size
    valid_steps = valid_examples // (FLAGS.batch_size * 2)

    _ = model.fit(train_dataset, epochs=FLAGS.epochs, steps_per_epoch=train_steps,
                  validation_data=valid_dataset, validation_steps=valid_steps)

    # Save TF2 model

    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
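
A minimal sketch of reloading the classifier saved by save_pretrained above and querying it; the model directory, the example sentence, and a transformers version with a callable tokenizer are assumptions:

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

model_path = './sst2/'   # assumed; matches the f'./{task}/' directory written above
model = TFDistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

inputs = tokenizer('a thoroughly enjoyable film', return_tensors='tf')
logits = model(inputs)[0]
print(int(tf.argmax(logits, axis=-1)[0]))   # predicted class index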
Ejemplo n.º 25
0
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

CLASSES = [1, 2, 3, 4, 5]

config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(CLASSES),
    id2label={
        0: 1,
        1: 2,
        2: 3,
        3: 4,
        4: 5
    },
    label2id={
        1: 0,
        2: 1,
        3: 2,
        4: 3,
        5: 4
    },
)


def list_arg(raw_value):
    """argparse type for a list of strings"""
    return str(raw_value).split(",")
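
The id2label mapping above is what turns a raw argmax index back into a 1-5 star rating; a minimal sketch of that conversion (the logits array is made up for illustration):

import numpy as np

logits = np.array([0.1, 0.3, 0.2, 1.4, 0.6])   # pretend scores for one review over the 5 classes
predicted_index = int(np.argmax(logits))        # -> 3
predicted_star_rating = config.id2label[predicted_index]   # -> 4, via the mapping defined above
print(predicted_index, predicted_star_rating)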

Ejemplo n.º 26
0
 def __init__(self):
     super(DistilBertModelTest, self).__init__()
     config = DistilBertConfig.from_pretrained('models/config.json')
     self.distilbert = DistilBertForSequenceClassification(
         config)  # /bert_pretrain/
     self.device = torch.device("cuda")
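
Note that DistilBertForSequenceClassification(config) only builds the architecture with randomly initialised weights; a minimal sketch, assuming a fine-tuned checkpoint such as 'models/pytorch_model.bin' exists next to the config (the path is an assumption), of how the trained weights would then be loaded:

import torch
from transformers import DistilBertConfig, DistilBertForSequenceClassification

config = DistilBertConfig.from_pretrained('models/config.json')
model = DistilBertForSequenceClassification(config)   # random weights at this point

state_dict = torch.load('models/pytorch_model.bin', map_location='cpu')   # hypothetical checkpoint
model.load_state_dict(state_dict)
model.eval()

Equivalently, DistilBertForSequenceClassification.from_pretrained('models/') loads the config and the weights in a single call.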
Ejemplo n.º 27
0
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

CLASSES = [1, 0, -1]

config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(CLASSES),
    id2label={
        0: 1,
        1: 0,
        2: -1
    },
    label2id={
        1: 0,
        0: 1,
        -1: 2
    },
)


def to_sentiment(star_rating):
    if star_rating in {1, 2}:  # negative
        return -1
    if star_rating == 3:  # neutral
        return 0
    if star_rating in {4, 5}:  # positive
        return 1
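
to_sentiment maps star ratings to the human-readable classes 1/0/-1, while label2id above maps those classes onto the 0-based indices the model trains on; a minimal sketch chaining the two (the star ratings are made up):

star_ratings = [5, 3, 1, 4]                                # illustrative input
sentiments = [to_sentiment(r) for r in star_ratings]       # -> [1, 0, -1, 1]
label_ids = [config.label2id[s] for s in sentiments]       # -> [0, 1, 2, 0]
print(sentiments, label_ids)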
Ejemplo n.º 28
0
def main():
    """
    main function for conducting Subtask D. Parameters are parsed with argparse.
    Language model should be suitable for German, e.g.:
        'bert-base-multilingual-uncased', 
        'bert-base-multilingual-cased',              
        'bert-base-german-cased', 
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    parser = argparse.ArgumentParser(description='Run Subtask D of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')    
    parser.add_argument('--train_data', type=str, default='train_df_opinion.tsv', help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df_opinion.tsv', help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df_opinion.tsv', help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df_opinion.tsv', help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskD/', help='The output directory of the model and predictions.')
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--use_crf", default=False, action="store_true", help="Flag for CRF usage.")
    parser.add_argument("--save_cr", default=False, action="store_true", help="Flag for saving classification report.")
    args = parser.parse_args()
    #############################################################################
    # Settings
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lm = args.lang_model
    if args.use_crf:
        lm = args.lang_model+"_crf"


    #############################################################################
    # Load and prepare data by adding BIO tags
    train_df = bio_tagging_df(pd.read_csv(args.df_path + args.train_data, delimiter = '\t'))
    dev_df = bio_tagging_df(pd.read_csv(args.df_path + args.dev_data, delimiter = '\t'))
    test_syn_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data1, delimiter = '\t'))
    test_dia_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data2, delimiter = '\t'))
    
    # 1. Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True

    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case = lower_case, max_length=args.max_len)
    
    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case = lower_case, max_length=args.max_len)

    # get training features
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    labels = df.bio_tags.values
    tokenized_texts, labels = get_sentences_biotags(tokenizer, sentences, labels, args.max_len)
    
    sentences_syn = test_syn_df.text.values
    labels_syn = test_syn_df.bio_tags
    tokenized_texts_syn, labels_syn = get_sentences_biotags(tokenizer, sentences_syn, labels_syn, args.max_len)
    
    sentences_dia = test_dia_df.text.values
    labels_dia = test_dia_df.bio_tags
    tokenized_texts_dia, labels_dia = get_sentences_biotags(tokenizer, sentences_dia, labels_dia, args.max_len)


    # get tag values and dictionary
    tag_values, tag2idx, entities = get_tags_list(args.df_path)
    
    # pad input_ids and tags
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen = args.max_len, value=0.0, padding="post",
                          dtype="long", truncating="post")
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
    
    
    input_ids_syn = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_syn],
                          maxlen = args.max_len, value=0.0, padding="post",
                          dtype="long", truncating="post")
    tags_syn = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_syn],
                     maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")    
    
    input_ids_dia = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_dia],
                          maxlen = args.max_len, value=0.0, padding="post",
                          dtype="long", truncating="post")
    tags_dia = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_dia],
                     maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
    
    # create attention masks
    attention_masks= [[int(token_id > 0) for token_id in sent] for sent in input_ids]    
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]


    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, tags)

    # transform to torch tensor
    train_inputs = torch.tensor(train_inputs, dtype = torch.long)
    dev_inputs = torch.tensor(dev_inputs, dtype = torch.long)

    train_labels = torch.tensor(train_labels, dtype = torch.long)
    dev_labels = torch.tensor(dev_labels, dtype = torch.long)

    train_masks = torch.tensor(train_masks, dtype = torch.uint8)
    dev_masks = torch.tensor(dev_masks, dtype = torch.uint8)

    test_syn_inputs = torch.tensor(input_ids_syn, dtype = torch.long)
    test_syn_labels = torch.tensor(tags_syn, dtype = torch.long)
    test_syn_masks = torch.tensor(attention_masks_syn, dtype = torch.uint8)

    test_dia_inputs = torch.tensor(input_ids_dia, dtype = torch.long)
    test_dia_labels = torch.tensor(tags_dia, dtype = torch.long)
    test_dia_masks = torch.tensor(attention_masks_dia, dtype = torch.uint8)

    # create DataLoader
    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train = True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train = False)  

    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train = False)   
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train = False)


    #############################################################################
    # Training
    if args.train:
        # Load Config
        if model_class=="BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx))
            config.hidden_dropout_prob = 0.1 # dropout probability for all fully connected layers
                                             # in the embeddings, encoder, and pooler; default = 0.1
            model = TokenBERT(
                model_name=args.lang_model, 
                num_labels=len(tag2idx),
                use_crf=args.use_crf)

        if model_class=="DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx))   
            config.hidden_dropout_prob = 0.1       
            model = TokenDistilBERT(
                model_name=args.lang_model, 
                num_labels=len(tag2idx),
                use_crf=args.use_crf)
        
        model.cuda() 

        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=1e-8
        )
        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "use CRF:", args.use_crf, ",", "learning rate:", args.lr, ",", "DROPOUT:", config.hidden_dropout_prob)
        print()

        track_time = time.time()
                
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i"%epoch, dt.datetime.now())
            
            # TRAINING
            model, optimizer, scheduler, tr_loss = training(
                train_dataloader, 
                model=model, 
                device=device, 
                optimizer=optimizer, 
                scheduler=scheduler
                )
            
            # EVALUATION: TRAIN SET
            y_true_train, y_pred_train, f1s_train, f1s_overlap_train = evaluation(
                    train_dataloader, model=model, device=device, tag_values=tag_values)
            print("TRAIN: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_train, f1s_overlap_train))
            
            # EVALUATION: DEV SET
            y_true_dev, y_pred_dev, f1s_dev, f1s_overlap_dev = evaluation(
                    dev_dataloader, model=model, device=device, tag_values=tag_values)
            print("EVAL: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_dev, f1s_overlap_dev))
        
        print("  Training and validation took in total: {:}".format(format_time(time.time()-track_time)))

        # EVALUATION: TEST SYN SET
        y_true_test_syn, y_pred_test_syn, f1s_test_syn, f1s_overlap_test_syn = evaluation(
                test_syn_dataloader, model=model, device=device, tag_values=tag_values)
        print("TEST SYN: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_test_syn, f1s_overlap_test_syn))
                
        # EVALUATION: TEST DIA SET
        y_true_test_dia, y_pred_test_dia, f1s_test_dia, f1s_overlap_test_dia = evaluation(
                test_dia_dataloader, model=model, device=device, tag_values=tag_values)
        print("TEST DIA: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_test_dia, f1s_overlap_test_dia))
        
        # Print classification report
        cr_report_syn = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits = 4)
        cr_report_dia = seq_classification_report(y_true_test_dia, y_pred_test_dia, digits = 4)
        cr_report_syn_overlap = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits = 4, overlap = True)
        cr_report_dia_overlap = seq_classification_report(y_true_test_dia, y_pred_test_dia, digits = 4, overlap = True)
        
        print("Classification report for TEST SYN (Exact):", cr_report_syn)
        print("Classification report for TEST SYN (Overlap):", cr_report_dia)
        print("Classification report for TEST DIA (Exact):", cr_report_syn_overlap)
        print("Classification report for TEST DIA (Overlap):", cr_report_dia_overlap)

        if args.save_cr:            
            pickle.dump(cr_report_syn, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_syn_exact.txt','wb'))
            pickle.dump(cr_report_dia, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_dia_exact.txt','wb'))
            pickle.dump(cr_report_syn_overlap, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_syn_overlap.txt','wb'))
            pickle.dump(cr_report_dia_overlap, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_dia_overlap.txt','wb'))
Ejemplo n.º 29
0
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # See if CUDA available
    device = torch.device("cpu")
    if args.n_gpu > 0 and torch.cuda.is_available():
        print("Training on GPU")
        device = torch.device("cuda:0")

    # model configuration
    bert_model = 'distilbert-base-uncased'
    n_epochs = args.n_epochs
    bert_config = DistilBertConfig.from_pretrained(bert_model,
                                                   num_labels=2,
                                                   output_hidden_states=True)

    # wandb initialization
    wandb.init(project="domain-adaptation-twitter-emnlp",
               name=args.run_name,
               config={
                   "epochs": n_epochs,
                   "train_split_percentage": args.train_pct,
                   "bert_model": bert_model,
                   "seed": seed,
                   "tags": ",".join(args.tags)
               })
    #wandb.watch(model)
    #Create save directory for model
    if not os.path.exists(f"{args.model_dir}/{Path(wandb.run.dir).name}"):
Ejemplo n.º 30
0
# print(all_predicate_ids[0])

print("Graph loaded")

# model init
import torch

from transformers import DistilBertTokenizer, DistilBertConfig
from MPBert_sampler_model import MessagePassingHDTBert

DEVICE = 'cuda'

# model configuration
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
config = DistilBertConfig.from_pretrained(model_name, num_labels=1)

E_BEAM = 10
P_BEAM = 100

model = MessagePassingHDTBert(config, topk_entities=E_BEAM, topk_predicates=P_BEAM)

for param in model.bert.parameters():
    param.requires_grad = False


if DEVICE == 'cuda':
    device = torch.device("cuda")
    # run model on the GPU
    model.cuda()
else: