Example #1
def train(args, train_loader, test_loader, test_json):
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    print("loading bert.")
    model = BertForTokenClassification.from_pretrained("bert-base-cased",
                                                       num_labels=3)
    model.to(device)
    optim = AdamW(model.parameters(), lr=args.lr)
    print("loaded. staring training.")

    # best_rationale_acc = 0
    for epoch in range(args.num_epoch):
        for batch in tqdm(train_loader):
            optim.zero_grad()
            model.train()
            model.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs[0]
            loss.backward()
            optim.step()

        if epoch % 3 == 2 and not args.no_logs:
            total, correct, pred_rationale = evaluate(args, model, test_loader)
            print_score(args, total, correct)
            print_human_vs_model(test_json, pred_rationale)
    return model
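A minimal, hypothetical driver for the train() helper above (not part of the original example): the argument names lr, num_epoch and no_logs mirror the fields the function reads, and the DataLoaders plus test_json are assumed to come from the project's data-preparation code.

from argparse import Namespace

# Illustrative hyperparameters; train_loader, test_loader and test_json are
# assumed to be built elsewhere in the project.
args = Namespace(lr=5e-5, num_epoch=3, no_logs=False)
model = train(args, train_loader, test_loader, test_json)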
Example #2
def get_text_reader(reader_name, task, num_labels):
    # The AILAW corpus is a Korean dataset,
    # so the model is fixed to a Korean-capable one such as multilingual BERT, KoBERT, KoELECTRA, etc.

    if reader_name == "bert":
        if task == "classification":
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "kobert":
        if task == "classification":
            model_name = "monologg/kobert"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/kobert"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "koelectra":
        if task == "classification":
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    else:
        raise KeyError(reader_name)

    return text_reader
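A short usage sketch (not in the original snippet), assuming a hypothetical 5-label NER tag set:

# Loads monologg/koelectra-base-discriminator with a 5-way token-classification head.
text_reader = get_text_reader("koelectra", task="ner", num_labels=5)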
Example #3
def launch_bert(training_flag, test_flag):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if training_flag is not None:
        model = BertForTokenClassification.from_pretrained(
            'bert-base-uncased', num_labels=len(tags_vals))
        ## --------- 12. Optimizer -> weight regularization is a way to reduce overfitting in deep learning.
        """
        Keras reference (circa 2020): rates around 0.01 seem to be the best hyperparameter
        for weight regularization of weight layers, e.g.
            from keras.layers import LSTM
            from keras.regularizers import l2
            model.add(LSTM(32, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
        Note: BERT's 'beta' and 'gamma' (LayerNorm) parameters are excluded from weight decay below.
        """
        FULL_FINETUNING = True
        if FULL_FINETUNING:
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.0
            }]
        else:
            param_optimizer = list(model.classifier.named_parameters())
            optimizer_grouped_parameters = [{
                "params": [p for n, p in param_optimizer]
            }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)

        launch_training(training_path=args.training_data,
                        training_epochs=4,
                        valid_path=args.validate_data,
                        training_batch_size=1,
                        model=model,
                        model_path=args.save + '/config.json',
                        tokenizer=tokenizer,
                        optimizer=optimizer)
    if test_flag is not None:
        if (args.save is not None):
            config = BertConfig.from_json_file(args.save + '/config.json')
            model = BertForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=args.save + '/pytorch_model.bin',
                config=config)
        else:
            model = BertForTokenClassification.from_pretrained(
                'bert-base-uncased', num_labels=len(tags_vals))
        launch_test_directory(test_path=test_flag,
                              model=model,
                              tokenizer=tokenizer)
Example #4
def main(num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classes = ["B", "I", "O"]
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    tag_to_idx = {t: i for i, t in enumerate(classes)}
    tag_to_idx['[PAD]'] = -100
    idx_to_tag = {i: t for t, i in tag_to_idx.items()}

    train_dataloader, dev_dataloader, dev_sentences, test_dataloader, test_sentences = parse_data(
        tokenizer, tag_to_idx, batch_size=16)

    print('data loaded and tokenized')

    model = BertForTokenClassification.from_pretrained('bert-base-cased',
                                                       num_labels=len(classes))
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('model instantiated')

    model, dev_preds = train_model(tokenizer, tag_to_idx, model, num_epochs,
                                   train_dataloader, optimizer, device,
                                   dev_dataloader, idx_to_tag)
    test_preds = evaluate(model, test_dataloader, device, idx_to_tag)
    save_preds('dev_preds.txt', dev_preds, dev_sentences)
    save_preds('test_preds.txt', test_preds, test_sentences)
Example #5
def main():
    # We want named entity recognition that classifies each token into one of the following 13 classes.
    labels = [
        'B-corporation', 'B-creative-work', 'B-group', 'B-location',
        'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group',
        'I-location', 'I-person', 'I-product', 'O'
    ]
    id2label = {i: label for i, label in enumerate(labels)}
    # label2id = {label: i for i, label in enumerate(labels)}

    # Specify the name of the pretrained BERT model to use.
    model_name = 'bert-large-cased'

    # Create a tokenizer that matches the pretrained model.
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_name)

    # Instantiate a token-classification model from the pretrained weights.
    model = BertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=model_name,
        id2label=id2label,  # passed so that the per-token output is 13-dimensional
    )
    # A warning says that some weights were not initialized (naturally, the newly
    # added classification head is untrained); it can safely be ignored.

    print('◆ Run an arbitrary sentence through the model → the prediction is sized 14 tokens × 13 classes.')
    sentence = 'The Empire State Building officially opened on May 1, 1931.'
    inputs = torch.tensor([tokenizer.encode(sentence)])  # tensorize the ID sequence and pass it in
    outputs = model(inputs)
    print(outputs[0].size())
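    # Hedged follow-up (not part of the original example): map the per-token
    # logits back to label strings via the id2label mapping defined above.
    pred_ids = outputs[0].argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence))
    for token, pred_id in zip(tokens, pred_ids):
        print(token, id2label[pred_id])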
Example #6
def _train_bert(training_data_retrieval_func):
    tokenizer = BertTokenizerFast.from_pretrained(BERT_BASE_MODEL)
    tokenizer.add_tokens(ADDITIONAL_SPECIAL_TOKENS)

    tokens, labels = training_data_retrieval_func()
    train_dataset = _get_datasets(tokens, labels, tokenizer)

    model = BertForTokenClassification.from_pretrained(
        BERT_BASE_MODEL, num_labels=len(ALL_LABEL_IDS))
    model.resize_token_embeddings(len(tokenizer))

    run_id = '{}_{}'.format(datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
                            utils.get_config('logging.filename'))
    training_args = TrainingArguments(
        output_dir=f'./bert/results/{run_id}',
        logging_dir=f'./bert/logs/{run_id}',
        logging_steps=500,
        save_steps=2000,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        learning_rate=5e-5,
        warmup_steps=0,
        weight_decay=0,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset)

    trainer.train()
    path_to_model = utils._get_cache_path('bert_for_SE_tagging')
    model.save_pretrained(path_to_model)
    tokenizer.save_pretrained(path_to_model)
Example #7
def main():
    args = docopt(__doc__)

    processors = {
        'multi-sents': BertMultiSentProcessor,
        'uni-sent': BertUniSentProcessor
    }
    
    max_input_len = int(args['--max-input-len'])
    tokenizer = BertWordPieceTokenizer(str(args['--path-to-vocab']))
    processor_constructor = processors[str(args['--mode'])]
    processor = processor_constructor(max_input_len, tokenizer)
    if args['--ensemble']:
        bert_model = BertEnsemble.load_trained(str(args['--path-to-model-dir']))
    elif args['--crf']:
        bert_model = BertWithCRF.from_pretrained(str(args['--path-to-model-dir']))
    elif not args['--rule']:
        bert_model = BertForTokenClassification.from_pretrained(str(args['--path-to-model-dir']))
    device_no = int(args['--gpu'])
    device = torch.device(f'cuda:{device_no}') if device_no > -1 else torch.device('cpu')
    if args['--crf']:
        bert_extractor = BertWithCRFExtractor(bert_model, tokenizer, max_input_len, device)
    elif not args['--rule']:
        bert_extractor = BertExtractor(bert_model, tokenizer, max_input_len, device)
    else:
        bert_extractor = RuleExtractor()
    corpus = read_corpus(str(args['--path-to-corpus-dir']))
    ents_table = build_ents_table(corpus, processor, bert_extractor, batch_size=int(args['--batch-size']))
    ents_table.to_csv(str(args['--path-to-output']), index=False, sep='\t')
    return
Example #8
    def __init__(self,
                 config_name: str,
                 model_name: str = None,
                 num_tags: int = 2,
                 batch_first: bool = True) -> None:
        self.batch_first = batch_first

        if not os.path.exists(config_name):
            raise ValueError("未找到模型配置文件 '{}'".format(config_name))
        else:
            self.config_name = config_name

        if model_name is not None:
            if not os.path.exists(model_name):
                raise ValueError("未找到模型预训练参数文件 '{}'".format(model_name))
            else:
                self.model_name = model_name
        else:
            self.model_name = None

        super().__init__()
        self.bert_config = BertConfig.from_pretrained(self.config_name)
        self.bert_config.num_labels = num_tags
        self.model_kwargs = {'config': self.bert_config}

        if self.model_name is not None:
            self.bertModel = BertForTokenClassification.from_pretrained(
                self.model_name, **self.model_kwargs)
        else:
            self.bertModel = BertForTokenClassification(self.bert_config)

        self.crf_model = CRF(num_tags=num_tags, batch_first=batch_first)
Example #9
 def __init__(self):
     super(BERTClass, self).__init__()
     config = BertConfig.from_pretrained("./bert-base-uncased",
                                         num_labels=len(list(
                                             tag2idx.keys())))
     self.l1 = BertForTokenClassification.from_pretrained(
         './bert-base-uncased', config=config)
Example #10
def load_model(args, test):
    # if the model is for testing, attempt to load previous arguments
    if test:
        try:
            prev_args = torch.load(
                os.path.join(args.model_dir, "train_args.bin")
            )
            args.max_length = prev_args.max_length
            args.do_lower_case = prev_args.do_lower_case
            args.keep_accents = prev_args.keep_accents
        except FileNotFoundError:
            pass

    tokenizer = BertTokenizer.from_pretrained(
        args.model_dir,
        do_lower_case=args.do_lower_case,
        keep_accents=args.keep_accents,
    )
    model = BertForTokenClassification.from_pretrained(
        args.model_dir,
        finetuning_task="conll2002",
        num_labels=len(LABEL_LIST),
    ).to(args.device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, tokenizer
Example #11
    def load_model(self,
                   model_filepath,
                   config_filepath,
                   pretrained_model="bert-base-cased"):
        """
        Load cybert model.

        :param model_filepath: Filepath of the model (.pth or .bin) to be loaded
        :type model_filepath: str
        :param config_filepath: Config file (.json) to be used
        :type config_filepath: str
        :param pretrained_model: Name of pretrained model to be loaded from the
            transformers repo, default is bert-base-cased
        :type pretrained_model: str

        Examples
        --------
        >>> from clx.analytics.cybert import Cybert
        >>> cyparse = Cybert()
        >>> cyparse.load_model('/path/to/model.pth', '/path/to/config.json')
        """
        with open(config_filepath) as f:
            config = json.load(f)
        self._label_map = {int(k): v for k, v in config["id2label"].items()}
        model_state_dict = torch.load(model_filepath)
        self._model = BertForTokenClassification.from_pretrained(
            pretrained_model,
            state_dict=model_state_dict,
            num_labels=len(self._label_map),
        )
        self._model.cuda()
        self._model.eval()
Example #12
 def __init__(self,
              num_labels=len(id2label.keys()),
              from_pretrained='bert-base-uncased'):
     super(BertForValueExtraction, self).__init__()
     print(f"Loading BertForTokenClassification as {from_pretrained}")
     self.token_classifier = BertForTokenClassification.from_pretrained(
         from_pretrained, num_labels=num_labels, return_dict=True)
Example #13
    def __init__(self, hparams: Union[Dict, Namespace]):
        # NOTE: internal code may pass hparams as dict **kwargs
        if isinstance(hparams, Dict):
            hparams = Namespace(**hparams)

        self.label_ids_to_label = LabelTokenAligner.get_ids_to_label(
            hparams.labels)
        num_labels = len(self.label_ids_to_label)

        super().__init__()
        # Enable to access arguments via self.hparams
        self.save_hyperparameters(hparams)

        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        self.cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        if self.cache_dir is not None and not os.path.exists(
                self.hparams.cache_dir):
            os.mkdir(self.cache_dir)

        # AutoTokenizer
        # trf>=4.0.0: PreTrainedTokenizerFast by default
        # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
        self.tokenizer_name = self.hparams.model_name_or_path
        self.tokenizer = BertTokenizerFast.from_pretrained(
            self.tokenizer_name,
            cache_dir=self.cache_dir,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

        # AutoConfig
        config_name = self.hparams.model_name_or_path
        self.config: PretrainedConfig = BertConfig.from_pretrained(
            config_name,
            **({
                "num_labels": num_labels
            } if num_labels is not None else {}),
            cache_dir=self.cache_dir,
        )
        extra_model_params = (
            "encoder_layerdrop",
            "decoder_layerdrop",
            "dropout",
            "attention_dropout",
        )
        for p in extra_model_params:
            if getattr(self.hparams, p, None) and hasattr(self.config, p):
                setattr(self.config, p, getattr(self.hparams, p, None))

        # AutoModelForTokenClassification
        self.model: PreTrainedModel = BertForTokenClassification.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=self.config,
            cache_dir=self.cache_dir,
        )

        self.scheduler = None
        self.optimizer = None
Example #14
 def __init__(self, model_path, num_labels, label_map, device):
     super().__init__()
     self.model = BertForTokenClassification.from_pretrained(model_path, num_labels=num_labels).to(device)
     self.transitions = torch.nn.Parameter(torch.randn(num_labels, num_labels))
      # Learnable transition scores between labels, on top of which constraints will be added
     self.label_map = label_map
     self.num_labels = num_labels
Example #15
 def __init__(self):
     super(BertClass, self).__init__()
     self.l1 = BertForTokenClassification.from_pretrained(
         'bert-base-chinese',
         num_labels=21,
         output_attentions=False,
         output_hidden_states=False)
Example #16
    def __init__(self,
                 config_name: str,
                 model_name: str = None,
                 num_tags: int = 2,
                 batch_first: bool = True) -> None:
        self.batch_first = batch_first
        if not os.path.exists(config_name):
            raise ValueError('{} config file not found'.format(config_name))
        else:
            self.config_name = config_name

        if model_name is not None:
            if not os.path.exists(model_name):
                raise ValueError(' {} model file not found'.format(model_name))
            else:
                self.model_name = model_name
        else:
            self.model_name = None

        if num_tags <= 0:
            raise ValueError(f'invalid number of tags:{num_tags}')

        super().__init__()
        # BERT config file
        self.bert_config = BertConfig.from_pretrained(self.config_name)
        # num_labels (not num_tags) is the attribute BertForTokenClassification reads.
        self.bert_config.num_labels = num_tags
        self.model_kwargs = {'config': self.bert_config}

        if self.model_name is not None:
            self.bertModel = BertForTokenClassification.from_pretrained(
                self.model_name, **self.model_kwargs)
        else:
            self.bertModel = BertForTokenClassification(self.bert_config)

        self.crfModel = CRF(num_tags=num_tags, batch_first=batch_first)
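    # Hedged sketch (not part of the original class): one possible forward pass
    # that feeds the BERT emission scores into the CRF layer above. It assumes
    # batch_first=True and a torchcrf-style CRF, where calling the module
    # returns the log-likelihood and decode() returns the best tag sequences.
    def forward(self, input_ids, attention_mask, tags=None):
        emissions = self.bertModel(input_ids, attention_mask=attention_mask)[0]
        mask = attention_mask.bool()
        if tags is not None:
            return -self.crfModel(emissions, tags, mask=mask)  # negative log-likelihood loss
        return self.crfModel.decode(emissions, mask=mask)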
Example #17
def main():
    data_table = pd.read_csv("train_table.csv")

    # define BERT tokenizer and BERT model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForTokenClassification.from_pretrained(
        'bert-base-uncased', output_hidden_states=True)
    model.eval()
    print(model.config)

    data_embeddings = {}
    for index, row in data_table.iterrows():
        print(index)
        # get sentence tokens and label tokens
        sentence = row[1]
        span = row[2]

        #get bert embeddings
        span_embeddings = get_bert_embedding(model, tokenizer, sentence, span)

        #non-propagandistic span embeddings
        if pd.isnull(span):
            data_embeddings[sentence] = span_embeddings
        #propagandistic span embeddings
        else:
            data_embeddings[sentence] = (span, row[3], row[4], span_embeddings)

    print("Writing to output file...")
    torch.save(data_embeddings, "data_embeddings.pt")
    print("Done.")
Example #18
def get_predictions(filename, outputName):
    label_list = ['O', 'B-CLEntity', 'I-CLEntity', 'L-CLEntity', 'U-CLEntity']
    model = BertForTokenClassification.from_pretrained(
        "bert_ner_finetuned_iliad-with-gpu-pattern2.model")
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
    book_lines = []
    book = open(filename)
    for line in book:
        line = line.strip()
        book_lines.append(line)
    book_lines = [line for line in book_lines if line]

    pred = []
    for line in book_lines:
        line_tokens = tokenizer.tokenize(
            tokenizer.decode(tokenizer.encode(line)))
        line_inputs = tokenizer.encode(line, return_tensors="pt")
        line_outputs = model(line_inputs).logits
        line_predictions = torch.argmax(line_outputs, dim=2)
        line_pred_labels = []
        for prediction in line_predictions[0].numpy():
            line_pred_labels.append(label_list[prediction])
        pred.append(line_pred_labels)

    with open(outputName, 'w') as f:
        f.write(json.dumps(pred))
    return
Example #19
    def load(self, dirpath):
        """ Loads a trained model from specified folder on disk.

            Parameters
            ----------
            dirpath : str
                directory from which model artifacts should be loaded

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        label_mappings = joblib.load(
            os.path.join(dirpath, "label_mappings.pkl"))
        self.label2id_ = label_mappings["label2id"]
        self.id2label_ = label_mappings["id2label"]
        self.special_tokens_ = label_mappings["special_tokens"]
        self.model_ = BertForTokenClassification.from_pretrained(
            dirpath,
            num_labels=len(self.label2id_),
            output_attentions=False,
            output_hidden_states=False)
        self.model_.to(self._device)
        self.tokenizer_ = BertTokenizer.from_pretrained(
            dirpath, do_basic_tokenize=False)

        return self
Example #20
    def __init__(self):

        self.tag2idx = {
            'B-art': 0,
            'B-eve': 1,
            'B-geo': 2,
            'B-gpe': 3,
            'B-nat': 4,
            'B-org': 5,
            'B-per': 6,
            'B-tim': 7,
            'I-art': 8,
            'I-eve': 9,
            'I-geo': 10,
            'I-gpe': 11,
            'I-nat': 12,
            'I-org': 13,
            'I-per': 14,
            'I-tim': 15,
            'O': 16
        }

        self.idx2tag = {}
        for key in list(self.tag2idx.keys()):
            self.idx2tag[self.tag2idx[key]] = key

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)
        self.model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased", num_labels=len(self.tag2idx))
        self.model.load_state_dict(
            torch.load("ner.dataset.4.pth", map_location=torch.device('cpu')))
        self.model.eval()
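    # Hedged sketch (not part of the original class): a possible predict() method
    # built on the tokenizer, model and idx2tag attributes initialized above.
    def predict(self, sentence):
        encoding = self.tokenizer(sentence, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(**encoding)[0]
        pred_ids = logits.argmax(dim=-1)[0].tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
        return [(token, self.idx2tag[i]) for token, i in zip(tokens, pred_ids)]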
Example #21
 def __init__(self, model_name, num_labels, lr):
     super().__init__()
     self.save_hyperparameters()
     self.bert_tc = BertForTokenClassification.from_pretrained(
         model_name,
         num_labels=num_labels
     )
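 # Hedged sketch (not part of the original snippet): a possible LightningModule
 # training_step built on the bert_tc attribute above; it assumes each batch is
 # a dict of input_ids, attention_mask and labels tensors.
 def training_step(self, batch, batch_idx):
     output = self.bert_tc(**batch)
     self.log('train_loss', output.loss)
     return output.loss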
Example #22
    def __init__(self, hparams, user_tokens=['<newline>', '<bullet>']):
        super(BertNerSystem, self).__init__()
        self.hparams = hparams
        self.hparams.model_type = self.hparams.model_type.lower()
        tokenizer = BertTokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else
            self.hparams.model_name_or_path,
            never_split=user_tokens,
            do_lower_case=self.hparams.do_lower_case,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )

        config = AutoConfig.from_pretrained(
            self.hparams.config_name
            if self.hparams.config_name else self.hparams.model_name_or_path,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
            output_past=not self.hparams.do_train,
            num_labels=self.hparams.num_labels,
        )
        model = BertForTokenClassification.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=config,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )
        self.config, self.tokenizer, self.model = config, tokenizer, model
        self.loss = []  # for keeping track of average loss
        self.metrics = {}

        self.vocab = {v: k for k, v in self.tokenizer.get_vocab().items()}
Example #23
def main(args):
    current_path = os.getcwd()
    logging.info(f'current python path {current_path}...')
    logging.info('Load data...')
    
    with open(f"{args.dataset}/train_dataset.pkl", "rb") as f:
        train_dataset = pickle.load(f)
    with open(f"{args.dataset}/valid_dataset.pkl", "rb") as f:
        valid_dataset = pickle.load(f)
    with open(f"{args.dataset}/test_dataset.pkl", "rb") as f:
        test_dataset = pickle.load(f)
    
    logging.info('Making dataloader...')
    train_loader = DataLoader(
        dataset = train_dataset,
        batch_size = args.batch_size,
        shuffle = True,
        collate_fn = lambda x: Bert_dataset.collate_fn(train_dataset, x)
    )

    valid_loader = DataLoader(
        dataset = valid_dataset,
        batch_size = args.batch_size,
        collate_fn = lambda x: Bert_dataset.collate_fn(valid_dataset, x)
    )

    test_loader = DataLoader(
        dataset = test_dataset,
        batch_size = args.batch_size,
        collate_fn = lambda x: Bert_dataset.collate_fn(test_dataset, x)
    )
    
    logging.info('Load model and parameters...')
    model = BertForTokenClassification.from_pretrained("bert-base-chinese",
        num_labels = 3,
        output_attentions = False,
        output_hidden_states = False
    )
    
    trainer = Trainer(model, train_loader, valid_loader)
    
    logging.info('Test validation dataset...')
    acc, total_loss = trainer.evaluation(test=False)
    print(f"device: {trainer.device} classification acc: {acc: .4f} validation loss: {total_loss:.4f}")
    
    logging.info('Start training...')
    trainer.training_process(early_stopping = True, 
                             n_iter_no_change = 5, 
                             max_epoch = args.max_epoch, 
                             save_params = True, 
                             verbose = True, 
                             learning_rate = args.learning_rate, 
                             save_paths = args.save_paths)
    
    logging.info('Training ends!')
    logging.info('Test validation dataset...')
    acc, total_loss = trainer.evaluation(test=False)
    print(f"device: {trainer.device} classification acc: {acc: .4f} validation loss: {total_loss:.4f}")
    logging.info('Finish!')
Example #24
 def __init__(self, vocab_size, emb_size, hidden_size, num_labels):
     super(bert_chinese_ner, self).__init__()
     self.bertconfig = BertConfig.from_pretrained(
         bert_chinese_ner.model_path,
         num_labels=num_labels,
         author="lingze")
     self.model = BertForTokenClassification.from_pretrained(
         bert_chinese_ner.model_path, config=self.bertconfig)
Example #25
def get_bert_tokenizer_and_model(training_data_retrieval_func):
    path_to_model = utils._get_cache_path('bert_for_SE_tagging')
    if not path_to_model.is_dir():
        _train_bert(training_data_retrieval_func)
    tokenizer = BertTokenizerFast.from_pretrained(path_to_model)
    model = BertForTokenClassification.from_pretrained(path_to_model)
    model.to('cuda')
    return tokenizer, model
Example #26
 def load_frozen_bert(
         bert_pretrained_model: str,
         bert_state_dict: str = None,
         bert_config: BertConfig = None) -> BertForTokenClassification:
     if bert_state_dict:
         fine_tuned_state_dict = torch.load(bert_state_dict)
         bert_token_classifier = BertForTokenClassification.from_pretrained(
             pretrained_model_name_or_path=bert_pretrained_model,
             state_dict=fine_tuned_state_dict,
             config=bert_config)
     else:
         bert_token_classifier = BertForTokenClassification.from_pretrained(
             pretrained_model_name_or_path=bert_pretrained_model,
             config=bert_config)
     for p in bert_token_classifier.bert.parameters():
         p.requires_grad = False
     return bert_token_classifier
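A hedged usage sketch (not in the original snippet), assuming the helper is callable directly and using an illustrative model name and label count: with the encoder frozen, only the token-classification head remains trainable, so the optimizer can be restricted to those parameters.

classifier = load_frozen_bert(
    "bert-base-cased",
    bert_config=BertConfig.from_pretrained("bert-base-cased", num_labels=5))
optimizer = torch.optim.AdamW(
    [p for p in classifier.parameters() if p.requires_grad], lr=1e-3)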
Example #27
 def build_model_layers(self):
     ''' builds the layers in the model '''
     self.bert = BertForTokenClassification.from_pretrained(
         "bert-base-cased",
         num_labels=self.num_labels,
         output_attentions=False,
         output_hidden_states=False)
     if self.use_crf:
         self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)
Example #28
 def __init__(self):
     """
     Inicializa o modelo treinado para a tarefa específicas (extração dos tipos de entidades estabelecidos para
     textos de notícias).
     Utiliza o tokenizador para língua portuguesa neuralmind/bert-base-portuguese-cased.
     """
     super().__init__(
         BertForTokenClassification.from_pretrained(
             stringify_path(config.diretorio_modelo_bert_finetuned)))
Example #29
 def __init__(self, config):
     super().__init__(config)
     self.init_weights()
     self.bert = BertForTokenClassification.from_pretrained(config.model,
                                                            config=config)
      logger.error("\n---------The __init__ of this class will always generate warnings, "\
                   "even if everything was initialized correctly. "\
                   "If you loaded a pretrained model using .from_pretrained() "\
                   "or loaded the state_dict, ignore the previous errors.\n---------")
Example #30
def load_model(output_path, device, name='model'):
    model_path = os.path.join(output_path, name)
    if not os.path.exists(model_path):
        raise ValueError('Could not find model at: {}. You first need to train a task model.'.format(model_path))

    print(' ➤ Loading fully trained model from: {}'.format(model_path))
    model = BertForTokenClassification.from_pretrained(model_path)
    model.to(device)
    return model
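A hedged usage sketch (not in the original snippet); the output directory name is an illustrative assumption and torch is assumed to be imported:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model("./output", device)
model.eval()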