Example #1
def train(args, train_loader, test_loader, test_json):
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    print("loading bert.")
    model = BertForTokenClassification.from_pretrained("bert-base-cased",
                                                       num_labels=3)
    model.to(device)
    optim = AdamW(model.parameters(), lr=args.lr)
    print("loaded. staring training.")

    # best_rationale_acc = 0
    for epoch in range(args.num_epoch):
        for batch in tqdm(train_loader):
            optim.zero_grad()
            model.train()
            model.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs[0]
            loss.backward()
            optim.step()

        if epoch % 3 == 2 and not args.no_logs:
            total, correct, pred_rationale = evaluate(args, model, test_loader)
            print_score(args, total, correct)
            print_human_vs_model(test_json, pred_rationale)
    return model
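The evaluate(), print_score(), and print_human_vs_model() helpers are not shown in this example. Below is a minimal sketch of a compatible evaluate(), assuming labels use -100 for positions to ignore and that pred_rationale simply collects per-batch predictions; the original helpers may differ.

def evaluate(args, model, test_loader):
    # token-level accuracy over the test set (sketch, not the original helper)
    device = next(model.parameters()).device
    model.eval()
    total, correct, pred_rationale = 0, 0, []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs[0].argmax(dim=-1)   # logits -> predicted label ids
            mask = labels != -100               # skip padding/special positions
            total += mask.sum().item()
            correct += ((preds == labels) & mask).sum().item()
            pred_rationale.extend(preds.cpu().tolist())
    return total, correct, pred_rationale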
Example #2
    def __init__(self,
                 config_name: str,
                 model_name: str = None,
                 num_tags: int = 2,
                 batch_first: bool = True) -> None:
        self.batch_first = batch_first

        if not os.path.exists(config_name):
            raise ValueError("未找到模型配置文件 '{}'".format(config_name))
        else:
            self.config_name = config_name

        if model_name is not None:
            if not os.path.exists(model_name):
                raise ValueError("未找到模型预训练参数文件 '{}'".format(model_name))
            else:
                self.model_name = model_name
        else:
            self.model_name = None

        super().__init__()
        self.bert_config = BertConfig.from_pretrained(self.config_name)
        self.bert_config.num_labels = num_tags
        self.model_kwargs = {'config': self.bert_config}

        if self.model_name is not None:
            self.bertModel = BertForTokenClassification.from_pretrained(
                self.model_name, **self.model_kwargs)
        else:
            self.bertModel = BertForTokenClassification(self.bert_config)

        self.crf_model = CRF(num_tags=num_tags, batch_first=batch_first)
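This example (like the near-identical Example #3) only shows __init__. As a rough sketch of how the two modules are typically combined, assuming CRF is torchcrf.CRF and that the original forward pass (not shown) may differ, the BERT logits serve as per-token CRF emission scores:

    def forward(self, input_ids, attention_mask, labels=None):
        # BertForTokenClassification logits act as emission scores for the CRF
        emissions = self.bertModel(input_ids, attention_mask=attention_mask)[0]
        mask = attention_mask.bool()
        if labels is not None:
            # negative log-likelihood of the gold tag sequence as the training loss
            return -self.crf_model(emissions, labels, mask=mask, reduction='mean')
        # Viterbi decoding: list of best tag sequences, one per example
        return self.crf_model.decode(emissions, mask=mask)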
Example #3
    def __init__(self,
                 config_name: str,
                 model_name: str = None,
                 num_tags: int = 2,
                 batch_first: bool = True) -> None:
        self.batch_first = batch_first
        if not os.path.exists(config_name):
            raise ValueError('{} config file not found'.format(config_name))
        else:
            self.config_name = config_name

        if model_name is not None:
            if not os.path.exists(model_name):
                raise ValueError('{} model file not found'.format(model_name))
            else:
                self.model_name = model_name
        else:
            self.model_name = None

        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')

        super().__init__()
        # BERT config file
        self.bert_config = BertConfig.from_pretrained(self.config_name)
        # BertForTokenClassification reads the label count from config.num_labels
        self.bert_config.num_labels = num_tags
        self.model_kwargs = {'config': self.bert_config}

        if self.model_name is not None:
            self.bertModel = BertForTokenClassification.from_pretrained(
                self.model_name, **self.model_kwargs)
        else:
            self.bertModel = BertForTokenClassification(self.bert_config)

        self.crfModel = CRF(num_tags=num_tags, batch_first=batch_first)
Example #4
    def build_model(self, args):
        if args.task == 'BertForTokenClassification':
            # obtain num_label from dataset before assign model
            from transformers import BertForTokenClassification, BertConfig
            config = BertConfig.from_json_file(args.config_file)
            # **YD** mention detection: num_labels defaults to 3
            assert hasattr(args, 'num_labels')
            config.num_labels = args.num_labels
            model = BertForTokenClassification(config)

            # **YD** load state_dict from a pre-trained model
            # loading could be restricted to the master process; unclear whether that works on a single GPU
            # if distributed_utils.is_master(args) and args.hetseq_state_dict is not None:
            if args.hetseq_state_dict is not None:
                state_dict = torch.load(args.hetseq_state_dict,
                                        map_location='cpu')['model']
                if args.load_state_dict_strict:
                    model.load_state_dict(state_dict, strict=True)
                else:
                    model.load_state_dict(state_dict, strict=False)

            elif args.transformers_state_dict is not None:
                state_dict = torch.load(args.transformers_state_dict,
                                        map_location='cpu')
                if args.load_state_dict_strict:
                    model.load_state_dict(state_dict, strict=True)
                else:
                    model.load_state_dict(state_dict, strict=False)
        else:
            raise ValueError('Unknown fine-tuning task!')
        return model
Example #5
class BertModel(nn.Module):
    def __init__(self,
                 pretrained_model_name_or_dir=None,
                 pretrained_num_classes=None,
                 fine_tune=False,
                 bert_config=None):
        """
        Buils a bert model for token classification
        :param pretrained_model_name_or_dir: Specify the pretrained_model_name_or_dir to load from to start from a pretrained model
        :param pretrained_num_classes: The number of classes for the pretrained model
        :param fine_tune: If fine tune is true, only the classification layer weights are tuned.
        :param bert_config: If this is not none, this config is used to create a BERT model from scratch using the configuration
        """
        super().__init__()
        if bert_config is None:
            self.model = BertForTokenClassification.from_pretrained(
                pretrained_model_name_or_dir,
                num_labels=pretrained_num_classes)
        else:
            self.model = BertForTokenClassification(bert_config)

        # Fine tune, freeze all other weights except classifier
        if fine_tune:
            self._freeze_base_weights()

    def _freeze_base_weights(self):
        for param in self.model.base_model.parameters():
            param.requires_grad = False

    def forward(self, *input):
        return self.model(*input)

    def save(self, path):
        self.model.save_pretrained(save_directory=path)
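A brief usage sketch (hypothetical values, not part of the original) illustrating fine_tune=True: the base encoder is frozen and only the classification head stays trainable.

model = BertModel(pretrained_model_name_or_dir="bert-base-cased",
                  pretrained_num_classes=9,
                  fine_tune=True)
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(trainable)  # only the classifier weight and bias remain trainable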
Example #6
def get_text_reader(reader_name, task, num_labels):
    # The AILAW corpus is a Korean dataset, so the model is restricted to
    # Korean-capable models such as multilingual BERT, KoBERT, KoELECTRA, etc.

    if reader_name == "bert":
        if task == "classification":
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "kobert":
        if task == "classification":
            model_name = "monologg/kobert"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/kobert"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "koelectra":
        if task == "classification":
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    else:
        raise KeyError(reader_name)

    return text_reader
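A quick usage sketch (hypothetical label count, not part of the original):

ner_reader = get_text_reader("kobert", "ner", num_labels=10)
print(ner_reader.config.num_labels)  # 10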
Example #7
def launch_bert(training_flag, test_flag):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if training_flag is not None:
        model = BertForTokenClassification.from_pretrained(
            'bert-base-uncased', num_labels=len(tags_vals))
        ## Optimizer -> weight regularization helps reduce overfitting in deep learning.
        """
        For comparison, Keras-style weight regularization on weight layers
        (a rate around 0.01 seems to be the best hyperparameter):
            from keras.layers import LSTM
            from keras.regularizers import l2
            model.add(LSTM(32, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
        Note: bias, gamma and beta (LayerNorm) parameters are excluded from weight decay below.
        """
        FULL_FINETUNING = True
        if FULL_FINETUNING:
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.0
            }]
        else:
            param_optimizer = list(model.classifier.named_parameters())
            optimizer_grouped_parameters = [{
                "params": [p for n, p in param_optimizer]
            }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)

        launch_training(training_path=args.training_data,
                        training_epochs=4,
                        valid_path=args.validate_data,
                        training_batch_size=1,
                        model=model,
                        model_path=args.save + '/config.json',
                        tokenizer=tokenizer,
                        optimizer=optimizer)
    if test_flag is not None:
        if (args.save is not None):
            config = BertConfig.from_json_file(args.save + '/config.json')
            model = BertForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=args.save + '/pytorch_model.bin',
                config=config)
        else:
            model = BertForTokenClassification.from_pretrained(
                'bert-base-uncased', num_labels=len(tags_vals))
        launch_test_directory(test_path=test_flag,
                              model=model,
                              tokenizer=tokenizer)
Example #8
def main():

    ####################################################################
    ## Data
    ####################################################################

    all_datasets = []
    for dataroot in args.dataroot:
        curr_dataset = BinaryDataset(root_dir=dataroot,
                                     binary_format='elf',
                                     targets='start',
                                     mode='random-chunks',
                                     chunk_length=args.sequence_len)
        all_datasets.append(curr_dataset)

    # TODO: ConcatDataset. This requires the __len__() to be implemented.
    dataset = torch.utils.data.ConcatDataset(all_datasets)
    print("Dataset len() = {0}".format(len(dataset)))

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True)

    ####################################################################
    ## Model
    ####################################################################

    config = BertConfig(
        vocab_size=256,
        hidden_size=args.hidden_size,
        num_hidden_layers=args.hidden_layers,
        num_attention_heads=args.num_attn_heads,
        intermediate_size=args.hidden_size * 4,  # BERT originally uses 4x hidden size here, so copy that.
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=args.sequence_len,  # Sequence length max 
        type_vocab_size=1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False)

    model = BertForTokenClassification(config=config).cuda()
    # model = torch.nn.DataParallel(model, dim=0)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    lossfn = torch.nn.CrossEntropyLoss()

    print("Beginning training")
    for epoch in range(args.epochs):
        train_loss, train_acc = train(model, lossfn, optimizer, dataloader,
                                      epoch)

        print(f"Train Loss: {train_loss} | Train Acc: {train_acc}")
Example #9
 def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
     config.num_labels = self.num_labels
     model = BertForTokenClassification(config=config)
     model.eval()
     loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
     result = {
         "loss": loss,
         "logits": logits,
     }
     self.parent.assertListEqual(
         list(result["logits"].size()),
         [self.batch_size, self.seq_length, self.num_labels])
     self.check_loss_output(result)
Example #10
    def __init__(self, 
                 config_name:str = 'bert-base-chinese', 
                 model_name:str = None, 
                 num_tags: int = 2, 
                 batch_first:bool = True) -> None:

        # record batch_first
        self.batch_first = batch_first

        # load the model config file
        if config_name != 'bert-base-chinese':
            if not os.path.exists(config_name):
                raise ValueError(
                    "Error! No model config file: '{}'".format(config_name)
                )
            else:
                self.config_name = config_name
        else:
            self.config_name = config_name

        # load the pretrained model weights
        if model_name is not None:
            if model_name == 'bert-base-chinese':
                self.model_name = model_name
            elif not os.path.exists(model_name):
                raise ValueError(
                    "Error! No pretrained model: '{}'".format(model_name)
                )
            else:
                self.model_name = model_name
        else:
            self.model_name = None

        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')

        super().__init__()

        self.bert_config = BertConfig.from_pretrained(self.config_name)
        self.bert_config.num_labels = num_tags

        # if no pretrained weights were given, build the model from the config alone
        if self.model_name is None:
            self.model_kwargs = {'config': self.bert_config}
            self.bertModel = BertForTokenClassification(**self.model_kwargs)
        elif self.model_name == 'bert-base-chinese':
            self.model_kwargs = {'config': self.bert_config, "from_tf": True}
            self.bertModel = BertForTokenClassification.from_pretrained(self.model_name, **self.model_kwargs)
        else:
            # local checkpoint path
            self.model_kwargs = {'config': self.bert_config}
            self.bertModel = BertForTokenClassification.from_pretrained(self.model_name, **self.model_kwargs)

        self.crf_model = CRF(num_tags=num_tags, batch_first=batch_first)
Example #11
    def __init__(self):

        self.tag2idx = {
            'B-art': 0,
            'B-eve': 1,
            'B-geo': 2,
            'B-gpe': 3,
            'B-nat': 4,
            'B-org': 5,
            'B-per': 6,
            'B-tim': 7,
            'I-art': 8,
            'I-eve': 9,
            'I-geo': 10,
            'I-gpe': 11,
            'I-nat': 12,
            'I-org': 13,
            'I-per': 14,
            'I-tim': 15,
            'O': 16
        }

        self.idx2tag = {}
        for key in list(self.tag2idx.keys()):
            self.idx2tag[self.tag2idx[key]] = key

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)
        self.model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased", num_labels=len(self.tag2idx))
        self.model.load_state_dict(
            torch.load("ner.dataset.4.pth", map_location=torch.device('cpu')))
        self.model.eval()
Example #12
def main():
    # We want named-entity recognition that classifies each token into one of the following 13 classes.
    labels = [
        'B-corporation', 'B-creative-work', 'B-group', 'B-location',
        'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group',
        'I-location', 'I-person', 'I-product', 'O'
    ]
    id2label = {i: label for i, label in enumerate(labels)}
    # label2id = {label: i for i, label in enumerate(labels)}

    # Specify the name of the pretrained BERT model to use.
    model_name = 'bert-large-cased'

    # Create the tokenizer that matches the pretrained model.
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_name, )

    # Instantiate a token-classification model from the pretrained weights.
    model = BertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=model_name,
        id2label=id2label,  # pass this so the per-token output has 13 dimensions.
    )
    # A warning says some weights were not initialized (of course: the classification
    # layer is new), which can safely be ignored.

    print('Run an arbitrary sentence through the model -> the output size is 14 tokens x 13 classes.')
    sentence = 'The Empire State Building officially opened on May 1, 1931.'
    inputs = torch.tensor([tokenizer.encode(sentence)])  # convert the ID sequence to a tensor.
    outputs = model(inputs)
    print(outputs[0].size())
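As a small follow-up sketch (not part of the original), the logits can be turned into label strings with the id2label mapping defined above; this would go at the end of main():

    pred_ids = outputs[0].argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())
    for token, pred_id in zip(tokens, pred_ids):
        print(token, id2label[pred_id])  # predicted class per token (mostly 'O' for an untuned head)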
Example #13
 def __init__(self):
     super(BertClass, self).__init__()
     self.l1 = BertForTokenClassification.from_pretrained(
         'bert-base-chinese',
         num_labels=21,
         output_attentions=False,
         output_hidden_states=False)
Example #14
def main():
    args = docopt(__doc__)

    processors = {
        'multi-sents': BertMultiSentProcessor,
        'uni-sent': BertUniSentProcessor
    }
    
    max_input_len = int(args['--max-input-len'])
    tokenizer = BertWordPieceTokenizer(str(args['--path-to-vocab']))
    processor_constructor = processors[str(args['--mode'])]
    processor = processor_constructor(max_input_len, tokenizer)
    if args['--ensemble']:
        bert_model = BertEnsemble.load_trained(str(args['--path-to-model-dir']))
    elif args['--crf']:
        bert_model = BertWithCRF.from_pretrained(str(args['--path-to-model-dir']))
    elif not args['--rule']:
        bert_model = BertForTokenClassification.from_pretrained(str(args['--path-to-model-dir']))
    device_no = int(args['--gpu'])
    device = torch.device(f'cuda:{device_no}') if device_no > -1 else torch.device('cpu')
    if args['--crf']:
        bert_extractor = BertWithCRFExtractor(bert_model, tokenizer, max_input_len, device)
    elif not args['--rule']:
        bert_extractor = BertExtractor(bert_model, tokenizer, max_input_len, device)
    else:
        bert_extractor = RuleExtractor()
    corpus = read_corpus(str(args['--path-to-corpus-dir']))
    ents_table = build_ents_table(corpus, processor, bert_extractor, batch_size=int(args['--batch-size']))
    ents_table.to_csv(str(args['--path-to-output']), index=False, sep='\t')
    return
Example #15
def _train_bert(training_data_retrieval_func):
    tokenizer = BertTokenizerFast.from_pretrained(BERT_BASE_MODEL)
    tokenizer.add_tokens(ADDITIONAL_SPECIAL_TOKENS)

    tokens, labels = training_data_retrieval_func()
    train_dataset = _get_datasets(tokens, labels, tokenizer)

    model = BertForTokenClassification.from_pretrained(
        BERT_BASE_MODEL, num_labels=len(ALL_LABEL_IDS))
    model.resize_token_embeddings(len(tokenizer))

    run_id = '{}_{}'.format(datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
                            utils.get_config('logging.filename'))
    training_args = TrainingArguments(
        output_dir=f'./bert/results/{run_id}',
        logging_dir=f'./bert/logs/{run_id}',
        logging_steps=500,
        save_steps=2000,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        learning_rate=5e-5,
        warmup_steps=0,
        weight_decay=0,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset)

    trainer.train()
    path_to_model = utils._get_cache_path('bert_for_SE_tagging')
    model.save_pretrained(path_to_model)
    tokenizer.save_pretrained(path_to_model)
Example #16
def get_predictions(filename, outputName):
    label_list = ['O', 'B-CLEntity', 'I-CLEntity', 'L-CLEntity', 'U-CLEntity']
    model = BertForTokenClassification.from_pretrained(
        "bert_ner_finetuned_iliad-with-gpu-pattern2.model")
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
    with open(filename) as book:
        book_lines = [line.strip() for line in book]
    book_lines = [line for line in book_lines if line]

    pred = []
    for line in book_lines:
        line_tokens = tokenizer.tokenize(
            tokenizer.decode(tokenizer.encode(line)))
        line_inputs = tokenizer.encode(line, return_tensors="pt")
        line_outputs = model(line_inputs).logits
        line_predictions = torch.argmax(line_outputs, dim=2)
        line_pred_labels = []
        for prediction in line_predictions[0].numpy():
            line_pred_labels.append(label_list[prediction])
        pred.append(line_pred_labels)

    with open(outputName, 'w') as f:
        f.write(json.dumps(pred))
    return
Example #17
    def load(self, dirpath):
        """ Loads a trained model from specified folder on disk.

            Parameters
            ----------
            dirpath : str
                directory from which model artifacts should be loaded

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        label_mappings = joblib.load(
            os.path.join(dirpath, "label_mappings.pkl"))
        self.label2id_ = label_mappings["label2id"]
        self.id2label_ = label_mappings["id2label"]
        self.special_tokens_ = label_mappings["special_tokens"]
        self.model_ = BertForTokenClassification.from_pretrained(
            dirpath,
            num_labels=len(self.label2id_),
            output_attentions=False,
            output_hidden_states=False)
        self.model_.to(self._device)
        self.tokenizer_ = BertTokenizer.from_pretrained(
            dirpath, do_basic_tokenize=False)

        return self
Example #18
def load_model(args, test):
    # if the model is for testing, attempt to load previous arguments
    if test:
        try:
            prev_args = torch.load(
                os.path.join(args.model_dir, "train_args.bin")
            )
            args.max_length = prev_args.max_length
            args.do_lower_case = prev_args.do_lower_case
            args.keep_accents = prev_args.keep_accents
        except FileNotFoundError:
            pass

    tokenizer = BertTokenizer.from_pretrained(
        args.model_dir,
        do_lower_case=args.do_lower_case,
        keep_accents=args.keep_accents,
    )
    model = BertForTokenClassification.from_pretrained(
        args.model_dir,
        finetuning_task="conll2002",
        num_labels=len(LABEL_LIST),
    ).to(args.device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, tokenizer
Example #19
 def __init__(self,
              num_labels=len(id2label.keys()),
              from_pretrained='bert-base-uncased'):
     super(BertForValueExtraction, self).__init__()
     print(f"Loading BertForTokenClassification as {from_pretrained}")
     self.token_classifier = BertForTokenClassification.from_pretrained(
         from_pretrained, num_labels=num_labels, return_dict=True)
Example #20
    def __init__(self, hparams, user_tokens=['<newline>', '<bullet>']):
        super(BertNerSystem, self).__init__()
        self.hparams = hparams
        self.hparams.model_type = self.hparams.model_type.lower()
        tokenizer = BertTokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else
            self.hparams.model_name_or_path,
            never_split=user_tokens,
            do_lower_case=self.hparams.do_lower_case,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )

        config = AutoConfig.from_pretrained(
            self.hparams.config_name
            if self.hparams.config_name else self.hparams.model_name_or_path,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
            output_past=not self.hparams.do_train,
            num_labels=self.hparams.num_labels,
        )
        model = BertForTokenClassification.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=config,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )
        self.config, self.tokenizer, self.model = config, tokenizer, model
        self.loss = []  # for keeping track of average loss
        self.metrics = {}

        self.vocab = {v: k for k, v in self.tokenizer.get_vocab().items()}
Example #21
def main():
    data_table = pd.read_csv("train_table.csv")

    # define the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForTokenClassification.from_pretrained(
        'bert-base-uncased', output_hidden_states=True)
    model.eval()
    print(model.config)

    data_embeddings = {}
    for index, row in data_table.iterrows():
        print(index)
        # get the sentence and its labeled span
        sentence = row[1]
        span = row[2]

        #get bert embeddings
        span_embeddings = get_bert_embedding(model, tokenizer, sentence, span)

        #non-propagandistic span embeddings
        if pd.isnull(span):
            data_embeddings[sentence] = span_embeddings
        #propagandistic span embeddings
        else:
            data_embeddings[sentence] = (span, row[3], row[4], span_embeddings)

    print("Writing to output file...")
    torch.save(data_embeddings, "data_embeddings.pt")
    print("Done.")
Example #22
 def __init__(self):
     super(BERTClass, self).__init__()
     config = BertConfig.from_pretrained("./bert-base-uncased",
                                         num_labels=len(list(
                                             tag2idx.keys())))
     self.l1 = BertForTokenClassification.from_pretrained(
         './bert-base-uncased', config=config)
Example #23
def main(num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classes = ["B", "I", "O"]
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    tag_to_idx = {t: i for i, t in enumerate(classes)}
    tag_to_idx['[PAD]'] = -100
    idx_to_tag = {i: t for t, i in tag_to_idx.items()}

    train_dataloader, dev_dataloader, dev_sentences, test_dataloader, test_sentences = parse_data(
        tokenizer, tag_to_idx, batch_size=16)

    print('data loaded and tokenized')

    model = BertForTokenClassification.from_pretrained('bert-base-cased',
                                                       num_labels=len(classes))
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('model instantiated')

    model, dev_preds = train_model(tokenizer, tag_to_idx, model, num_epochs,
                                   train_dataloader, optimizer, device,
                                   dev_dataloader, idx_to_tag)
    test_preds = evaluate(model, test_dataloader, device, idx_to_tag)
    save_preds('dev_preds.txt', dev_preds, dev_sentences)
    save_preds('test_preds.txt', test_preds, test_sentences)
Example #24
 def __init__(self, model_path, num_labels, label_map, device):
     super().__init__()
     self.model = BertForTokenClassification.from_pretrained(model_path, num_labels=num_labels).to(device)
     self.transitions = torch.nn.Parameter(torch.randn(num_labels, num_labels))
     # Ok, so we're going to add some constraints here
     self.label_map = label_map
     self.num_labels = num_labels
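The comment about adding constraints suggests restricting the learned transition matrix. A hedged sketch (not from the source) of one common approach, assuming label_map maps BIO label strings to indices and transitions[i, j] scores a move from label i to label j: forbid transitions into I-X from anything other than B-X or I-X.

 def _constrain_transitions(self):
     # push invalid BIO transitions toward -inf so they are effectively never selected
     with torch.no_grad():
         for prev_label, i in self.label_map.items():
             for next_label, j in self.label_map.items():
                 if next_label.startswith("I-"):
                     entity = next_label[2:]
                     if prev_label not in ("B-" + entity, "I-" + entity):
                         self.transitions[i, j] = -1e4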
Example #25
    def load_model(self,
                   model_filepath,
                   config_filepath,
                   pretrained_model="bert-base-cased"):
        """
        Load cybert model.

        :param model_filepath: Filepath of the model (.pth or .bin) to be
            loaded
        :type model_filepath: str
        :param config_filepath: Config file (.json) to be used
        :type config_filepath: str
        :param pretrained_model: Name of the pretrained model to load from the
            transformers repo; default is bert-base-cased
        :type pretrained_model: str

        Examples
        --------
        >>> from clx.analytics.cybert import Cybert
        >>> cyparse = Cybert()
        >>> cyparse.load_model('/path/to/model.pth', '/path/to/config.json')
        """
        with open(config_filepath) as f:
            config = json.load(f)
        self._label_map = {int(k): v for k, v in config["id2label"].items()}
        model_state_dict = torch.load(model_filepath)
        self._model = BertForTokenClassification.from_pretrained(
            pretrained_model,
            state_dict=model_state_dict,
            num_labels=len(self._label_map),
        )
        self._model.cuda()
        self._model.eval()
Example #26
 def __init__(self, model_name, num_labels, lr):
     super().__init__()
     self.save_hyperparameters()
     self.bert_tc = BertForTokenClassification.from_pretrained(
         model_name,
         num_labels=num_labels
     )
Example #27
    def __init__(self, hparams: Union[Dict, Namespace]):
        # NOTE: internal code may pass hparams as dict **kwargs
        if isinstance(hparams, Dict):
            hparams = Namespace(**hparams)

        self.label_ids_to_label = LabelTokenAligner.get_ids_to_label(
            hparams.labels)
        num_labels = len(self.label_ids_to_label)

        super().__init__()
        # Enable to access arguments via self.hparams
        self.save_hyperparameters(hparams)

        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        self.cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        if self.cache_dir is not None and not os.path.exists(
                self.hparams.cache_dir):
            os.mkdir(self.cache_dir)

        # AutoTokenizer
        # trf>=4.0.0: PreTrainedTokenizerFast by default
        # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
        self.tokenizer_name = self.hparams.model_name_or_path
        self.tokenizer = BertTokenizerFast.from_pretrained(
            self.tokenizer_name,
            cache_dir=self.cache_dir,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

        # AutoConfig
        config_name = self.hparams.model_name_or_path
        self.config: PretrainedConfig = BertConfig.from_pretrained(
            config_name,
            **({
                "num_labels": num_labels
            } if num_labels is not None else {}),
            cache_dir=self.cache_dir,
        )
        extra_model_params = (
            "encoder_layerdrop",
            "decoder_layerdrop",
            "dropout",
            "attention_dropout",
        )
        for p in extra_model_params:
            if getattr(self.hparams, p, None) and hasattr(self.config, p):
                setattr(self.config, p, getattr(self.hparams, p, None))

        # AutoModelForTokenClassification
        self.model: PreTrainedModel = BertForTokenClassification.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=self.config,
            cache_dir=self.cache_dir,
        )

        self.scheduler = None
        self.optimizer = None
Example #28
def main(args):
    current_path = os.getcwd()
    logging.info(f'current python path {current_path}...')
    logging.info('Load data...')
    
    with open(f"{args.dataset}/train_dataset.pkl", "rb") as f:
        train_dataset = pickle.load(f)
    with open(f"{args.dataset}/valid_dataset.pkl", "rb") as f:
        valid_dataset = pickle.load(f)
    with open(f"{args.dataset}/test_dataset.pkl", "rb") as f:
        test_dataset = pickle.load(f)
    
    logging.info('Making dataloader...')
    train_loader = DataLoader(
        dataset = train_dataset,
        batch_size = args.batch_size,
        shuffle = True,
        collate_fn = lambda x: Bert_dataset.collate_fn(train_dataset, x)
    )

    valid_loader = DataLoader(
        dataset = valid_dataset,
        batch_size = args.batch_size,
        collate_fn = lambda x: Bert_dataset.collate_fn(valid_dataset, x)
    )

    test_loader = DataLoader(
        dataset = test_dataset,
        batch_size = args.batch_size,
        collate_fn = lambda x: Bert_dataset.collate_fn(test_dataset, x)
    )
    
    logging.info('Load model and parameters...')
    model = BertForTokenClassification.from_pretrained("bert-base-chinese",
        num_labels = 3,
        output_attentions = False,
        output_hidden_states = False
    )
    
    trainer = Trainer(model, train_loader, valid_loader)
    
    logging.info('Test validation dataset...')
    acc, total_loss = trainer.evaluation(test=False)
    print(f"device: {trainer.device} classification acc: {acc: .4f} validation loss: {total_loss:.4f}")
    
    logging.info('Start training...')
    trainer.training_process(early_stopping = True, 
                             n_iter_no_change = 5, 
                             max_epoch = args.max_epoch, 
                             save_params = True, 
                             verbose = True, 
                             learning_rate = args.learning_rate, 
                             save_paths = args.save_paths)
    
    logging.info('Training ends!')
    logging.info('Test validation dataset...')
    acc, total_loss = trainer.evaluation(test=False)
    print(f"device: {trainer.device} classification acc: {acc: .4f} validation loss: {total_loss:.4f}")
    logging.info('Finish!')
Example #29
 def __init__(self, vocab_size, emb_size, hidden_size, num_labels):
     super(bert_chinese_ner, self).__init__()
     self.bertconfig = BertConfig.from_pretrained(
         bert_chinese_ner.model_path,
         num_labels=num_labels,
         author="lingze")
     self.model = BertForTokenClassification.from_pretrained(
         bert_chinese_ner.model_path, config=self.bertconfig)
Example #30
 def load_frozen_bert(
         bert_pretrained_model: str,
         bert_state_dict: str = None,
         bert_config: BertConfig = None) -> BertForTokenClassification:
     if bert_state_dict:
         fine_tuned_state_dict = torch.load(bert_state_dict)
         bert_token_classifier = BertForTokenClassification.from_pretrained(
             pretrained_model_name_or_path=bert_pretrained_model,
             state_dict=fine_tuned_state_dict,
             config=bert_config)
     else:
         bert_token_classifier = BertForTokenClassification.from_pretrained(
             pretrained_model_name_or_path=bert_pretrained_model,
             config=bert_config)
     for p in bert_token_classifier.bert.parameters():
         p.requires_grad = False
     return bert_token_classifier