Code example #1
    def __init__(self, model_weight_filename=None):
        """
        Load a BERT model instance for dimension classification.
        """
        self.num_labels = len(DimensionDataset.label2idx)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        logging.info('*** Instantiate model ***')
        if model_weight_filename:
            config = BertConfig(vocab_size_or_config_json_file=30522,
                                hidden_size=768,
                                num_hidden_layers=12,
                                num_attention_heads=12,
                                intermediate_size=3072)

            self.model = BertForTokenClassification(config, self.num_labels)

            logging.info('*** Loading model weights ***')
            self.model.load_state_dict(
                torch.load(model_weight_filename, map_location=self.device))
        else:
            # load pretrained BERT with a randomly initialized token-classification head
            self.model = BertForTokenClassification.from_pretrained(
                "bert-base-uncased", num_labels=self.num_labels)

        logging.info('*** Loading tokenizer ***')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Code example #2
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    base_model=None,
                                    base_tokenizer=None,
                                    device="cuda",
                                    chinese=False,
                                    num_labels=2):
    # Load pre-trained model (weights)
    if base_model is None:
        if chinese:
            base_model = "bert-base-chinese"
        else:
            base_model = "bert-base-uncased"
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(
            base_model, num_labels=num_labels)
        # Load pre-trained model tokenizer (vocabulary)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(base_model)
    elif model_type == "BertForTokenClassification":
        model = BertForTokenClassification.from_pretrained(
            base_model, num_labels=num_labels)
    elif model_type == "BertMSE":
        model = BertMSE()
    else:
        print("[Error]: unsupported model type")
        return None, None

    # Load pre-trained model tokenizer (vocabulary)
    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
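A minimal usage sketch of the helper above (not part of the original project; the chosen model type, device fallback, and label count are illustrative assumptions):

import torch

# pick GPU when available, otherwise fall back to CPU (assumed convenience wrapper call)
device = "cuda" if torch.cuda.is_available() else "cpu"
# hypothetical call: a 5-label token-classification setup
model, tokenizer = load_pretrained_model_tokenizer(
    model_type="BertForTokenClassification",
    device=device,
    num_labels=5)
if model is None:
    raise ValueError("unsupported model type")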
Code example #3
def predict(name, lang='eng', path='learn', model_dir='models'):
    path, model_dir = Path(path), Path(model_dir)
    print('Loading model...')
    device = 'cpu'
    state = torch.load(path / model_dir / f'{name}.pth', map_location=device)
    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')
    model.load_state_dict(state['model'], strict=True)
    print('Done')

    try:
        while True:
            # get sentence
            sent = input('Enter sentence: ')
            words = sent.split()
            x, mask = to_feature(words, bert_model)
            with torch.no_grad():
                # predict named entities
                out = model(x)
                pred = out.argmax(-1).view(-1)
                print(pred)
                active_pred = pred[mask == 1]
                print('Named Entities')
                active_pred = active_pred.tolist()
                for w, l in zip(words, active_pred[1:-1]):
                    print(f'{w} {idx2label[l]}')

    except (KeyboardInterrupt, EOFError):
        print('See ya')
Code example #4
File: main.py Project: koukaiu/dlut-nihao
def test(config):
    print('-' * 50)
    print('Loading core model......')
    load_model_name = config['test_model']
    if not os.path.exists(config['test_model']):
        print('the test model ' + config['test_model'] +
              ' does not exist, please check it in the config.txt')
    print('Core model name is : ' + load_model_name)
    model = BertForTokenClassification.from_pretrained(
        config['model_name'], num_labels=config['tagset_size'] + 1)  # load ptm
    print('-' * 50)
    print('Deploying the test data......')
    config['shuffle'] = False
    test_sents, test_data_loader = prepare_data(config)
    print('Test data loaded!')
    model.to(config['device'])
    checkpoint = torch.load(load_model_name)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    print('-' * 50)
    print('Core model loaded! Start predicting......')
    with torch.no_grad():
        results = predict(model, test_data_loader, config)
        results = restore_result(test_sents, results)
        if config['output_file']:
            write(results, config['output_file'])
        else:
            print('The output file is not specified. Please check the config.txt')
            sys.exit(1)
    print('Test process done!')
Code example #5
File: bertOri.py Project: NotturnoTJ/NER
    def __init__(self, num_labels):
        super(NER1, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=num_labels)
        for param in self.bert.parameters():
            param.requires_grad = True

        self.dp1 = nn.Dropout(0.1)
Code example #6
 def __init__(self, args, params, device):
     super(Net, self).__init__()
     self.bert = BertForTokenClassification.from_pretrained(
         args.bert_model_dir, num_labels=len(params.tag2idx))
     self.bilstm = nn.LSTM(bidirectional=True,
                           num_layers=2,
                           input_size=768,
                           hidden_size=768 // 2,
                           batch_first=True)
     self.fc = nn.Linear(768, len(params.tag2idx))
     self.num_labels = 2
     self.device = device
Code example #7
def main():
    slot2Id = getSlot2Id()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = BertForTokenClassification.from_pretrained(
        config.pretrained_model_name_or_path, num_labels=len(slot2Id))
    model.to(device)

    x, y = processData()
    train_dataloader, val_dataloader = getDataLoader(x, y)

    train(model, device, train_dataloader, val_dataloader, config.epochs,
          config.max_grad_norm)
Code example #8
    def __init__(self, dir_path, max_seq_length=30):
        self.max_seq_length = max_seq_length
        self.processor = NerProcessor.load(
            os.path.join(dir_path, PROCESSOR_NAME))
        self.tokenizer = BertTokenizer.from_pretrained(dir_path)
        self.classifier = BertForTokenClassification.from_pretrained(
            dir_path, len(self.processor.labels))
        self.classifier.eval()
        self.id2label = {
            i: label
            for i, label in enumerate(self.processor.labels)
        }

        global debug_message
        debug_message = False
Code example #9
    def __init__(self, opt):
        super(BERT_REL, self).__init__()
        self.opt = opt
        self.bertForToken = BertForTokenClassification.from_pretrained(self.opt.bert_model_dir, num_labels=self.opt.tag_nums)
        # tag classification
        self.num_labels = self.opt.tag_nums

        # relation classification
        self.rel_bert = BertModel.from_pretrained(self.opt.bert_model_dir)
        self.rel_fc = nn.Sequential(nn.Linear(768, 1024), nn.ReLU(), nn.Linear(1024, self.opt.rel_nums))

        self.id2tag = json.loads(open(opt.id2tag_dir, 'r').readline())
        self.type2types = json.loads(open(opt.type2types_dir, 'r').readline())
        self.sep1 = torch.LongTensor([1]).to("cuda")
        self.sep2 = torch.LongTensor([2]).to("cuda")
        self.init_weights()
Code example #10
def BuildModel(config, weight=None):
    # change the forward method: do not consider 'X' when computing loss
    def new_forward(self,
                    input_ids,
                    token_type_ids=None,
                    attention_mask=None,
                    add_masks=None,
                    labels=None,
                    weight=weight):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            if weight is not None:
                weight = weight.to(torch.float).to(config['device'])
            loss_fct = nn.CrossEntropyLoss(weight=weight,
                                           ignore_index=self.num_labels - 1)
            # Only keep active parts of the loss
            if attention_mask is not None or add_masks is not None:
                # fall back to the other mask when one of them is missing
                if add_masks is None:
                    add_masks = attention_mask
                if attention_mask is None:
                    attention_mask = add_masks
                active_loss = (attention_mask.view(-1)
                               == 1) * (add_masks.view(-1) == 1)
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))
            return loss
        else:
            return logits

    BertForTokenClassification.forward = new_forward
    model = BertForTokenClassification.from_pretrained(
        config['name'], num_labels=config['num_labels'])
    model.to(config['device'])

    return model
Code example #11
    def __init__(self, opt):
        super(BERT_CNN_CRF, self).__init__()
        self.opt = opt
        self.bertForToken = BertForTokenClassification.from_pretrained(self.opt.bert_model_dir, num_labels=self.opt.tag_nums)
        # tag classification
        self.num_labels = self.opt.tag_nums
        self.crf = CRF(self.opt.tag_nums, batch_first=True)

        # relation classification
        self.type_emb = nn.Embedding(3, self.opt.bert_hidden_size)
        self.rel_cnns = Encoder(enc_method='cnn', filters_num=self.opt.filter_num, filters=self.opt.filters, f_dim=self.opt.bert_hidden_size)
        self.classifier_rels = nn.Linear(len(self.opt.filters)*self.opt.filter_num, self.opt.rel_nums)

        self.id2tag = json.loads(open(opt.id2tag_dir, 'r').readline())
        self.type2types = json.loads(open(opt.type2types_dir, 'r').readline())

        self.init_weights()
Code example #12
def keywordextract(sentence,
                   model_path='./pretrained/keyword_extraction_pretrained.pt'):
    # returns a single keyword of given sentence

    device = torch.device('cpu')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    model = BertForTokenClassification.from_pretrained("bert-base-uncased",
                                                       num_labels=3)
    model.to(device)

    text = sentence
    tkns = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tkns)
    segments_ids = [0] * len(tkns)
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)
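    # NOTE: the fine-tuned model loaded below replaces the freshly initialized
    # BertForTokenClassification created above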
    model = torch.load(model_path, map_location=device)
    model.eval()
    prediction = []
    logit = model(tokens_tensor,
                  token_type_ids=None,
                  attention_mask=segments_tensors)
    #logit = model(tokens_tensor)
    logit = logit.detach().cpu().numpy()
    prediction.extend([list(p) for p in np.argmax(logit, axis=2)])

    keyword = None
    for k, j in enumerate(prediction[0]):
        if j == 1 or j == 0:
            # print(tokenizer.convert_ids_to_tokens(tokens_tensor[0].to('cpu').numpy())[k])
            keyword = tokenizer.convert_ids_to_tokens(
                tokens_tensor[0].to('cpu').numpy())[k]

    if "#" in keyword:
        keyword = keyword.replace("#", "")
        for word in sentence.split():
            if keyword in word:
                keyword = word.lower()

    return keyword
Code example #13
File: bert_srl.py Project: naxda/BERT_for_Korean_SRL
def train():
    model_path = dir_path + '/models/'
    print('your model would be saved at', model_path)

    model = BertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(bert_io.tag2idx))
    model.to(device)

    trn_data = bert_io.convert_to_bert_input(trn)
    sampler = RandomSampler(trn_data)
    trn_dataloader = DataLoader(trn_data,
                                sampler=sampler,
                                batch_size=batch_size)

    # load optimizer
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]
    optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

    # train
    epochs = 10
    max_grad_norm = 1.0
    num_of_epoch = 0
    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(trn_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_orig_tok_to_maps, b_input_args, b_input_masks = batch
            # forward pass
            loss = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_masks,
                         labels=b_input_args)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                           max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
#             break
#         break

        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        model_saved_path = model_path + 'ko-srl-epoch-' + str(
            num_of_epoch) + '.pt'
        torch.save(model, model_saved_path)
        num_of_epoch += 1
    print('...training is done')
Code example #14
def main():
    """Training pipeline"""

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()


    data = pd.read_csv("train_m.txt", sep='\t', encoding="latin1").fillna(method="ffill")

    getter = SentenceGetter(data)

    sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
    labels = [[s[1] for s in sent] for sent in getter.sentences]

    tags_vals = list(set(data["tag"].values))
    tag2idx = {t: i for i, t in enumerate(tags_vals)}

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)
    
    tr_inputs = torch.tensor(tr_inputs)
    val_inputs = torch.tensor(val_inputs)
    tr_tags = torch.tensor(tr_tags)
    val_tags = torch.tensor(val_tags)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

    model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

    model.to(device)

    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters()) 
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

    epochs = 5
    max_grad_norm = 1.0

    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # forward pass
            loss = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask, labels=b_labels)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss/nb_tr_steps))
        # VALIDATION on validation set
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            with torch.no_grad():
                tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                      attention_mask=b_input_mask, labels=b_labels)
                logits = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)
            
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            
            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss/nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
        pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
        print("Precision-Score: {}".format(precision_score(pred_tags, valid_tags)))
        print("Recall-Score: {}".format(recall_score(pred_tags, valid_tags)))
        print(classification_report(pred_tags, valid_tags))
        

    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
                
        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Validation loss: {}".format(eval_loss/nb_eval_steps))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print("Precision-Score: {}".format(precision_score(pred_tags, valid_tags)))
    print("Recall-Score: {}".format(recall_score(pred_tags, valid_tags)))
    print(classification_report(pred_tags, valid_tags))

    true_positives_I = 0
    predicted_positives_I = 0
    real_positives_I = 0
    for pred, valid in zip(pred_tags, valid_tags):
        if pred == 'I-claim' and valid == 'I-claim':
            true_positives_I += 1
        if pred == 'I-claim':
            predicted_positives_I += 1
        if valid == 'I-claim':
            real_positives_I += 1

    print("True positives I: {}".format(true_positives_I))
    print("predicted positives I: {}".format(predicted_positives_I))
    print("real positives I: {}".format(real_positives_I))
    
    true_positives_B = 0
    predicted_positives_B = 0
    real_positives_B = 0
    for pred, valid in zip(pred_tags, valid_tags):
        if pred == 'B-claim' and valid == 'B-claim':
            true_positives_B += 1
        if pred == 'B-claim':
            predicted_positives_B += 1
        if valid == 'B-claim':
            real_positives_B += 1
    
    print("True positives B: {}".format(true_positives_B))
    print("predicted positives B: {}".format(predicted_positives_B))
    print("real positives B: {}".format(real_positives_B))
    
    with open("resultados", 'w') as out:
        out.write("Predictions:\n")
        out.write("{}".format(list(zip(list(val_inputs), pred_tags, valid_tags))))
Code example #15
def run_ner(
        lang: str = 'eng',
        log_dir: str = 'logs',
        task: str = NER,
        batch_size: int = 1,
        epochs: int = 1,
        dataset: str = 'data/conll-2003/',
        loss: str = 'cross',
        max_seq_len: int = 128,
        do_lower_case: bool = False,
        warmup_proportion: float = 0.1,
        rand_seed: int = None,
        ds_size: int = None,
        data_bunch_path: str = 'data/conll-2003/db',
        tuned_learner: str = None,
        do_train: bool = False,
        do_eval: bool = False,
        save: bool = False,
        nameX: str = 'ner',
        mask: tuple = ('s', 's'),
):
    name = "_".join(
        map(str, [
            nameX, task, lang, mask[0], mask[1], loss, batch_size, max_seq_len,
            do_train, do_eval
        ]))
    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    init_logger(log_dir, name)

    if rand_seed:
        random.seed(rand_seed)
        np.random.seed(rand_seed)
        torch.manual_seed(rand_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(rand_seed)

    trainset = dataset + lang + '/train.txt'
    devset = dataset + lang + '/dev.txt'
    testset = dataset + lang + '/test.txt'

    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')
    if tuned_learner:
        print('Loading pretrained learner: ', tuned_learner)
        model.bert.load_state_dict(torch.load(tuned_learner))

    model = torch.nn.DataParallel(model)
    model_lr_group = bert_layer_list(model)
    layers = len(model_lr_group)
    kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask}

    train_dl = DataLoader(dataset=NerDataset(trainset,
                                             bert_model,
                                             train=True,
                                             **kwargs),
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=partial(pad, train=True))

    dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs),
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=pad)

    test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs),
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=pad)

    data = DataBunch(train_dl=train_dl,
                     valid_dl=dev_dl,
                     test_dl=test_dl,
                     collate_fn=pad,
                     path=Path(data_bunch_path))

    train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs
    optim = BertAdam(model.parameters(),
                     lr=0.01,
                     warmup=warmup_proportion,
                     t_total=train_opt_steps)

    loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func,
                                                             zero=True)
    metrics = [Conll_F1()]

    learn = Learner(
        data,
        model,
        BertAdam,
        loss_func=loss_fun,
        metrics=metrics,
        true_wd=False,
        layer_groups=model_lr_group,
        path='learn' + nameX,
    )

    learn.opt = OptimWrapper(optim)

    lrm = 1.6

    # select set of starting lrs
    lrs_eng = [0.01, 5e-4, 3e-4, 3e-4, 1e-5]
    lrs_deu = [0.01, 5e-4, 5e-4, 3e-4, 2e-5]

    startlr = lrs_eng if lang == 'eng' else lrs_deu
    results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']]
    if do_train:
        learn.freeze()
        learn.fit_one_cycle(1, startlr[0], moms=(0.8, 0.7))
        learn.freeze_to(-3)
        lrs = learn.lr_range(slice(startlr[1] / (1.6**15), startlr[1]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.freeze_to(-6)
        lrs = learn.lr_range(slice(startlr[2] / (1.6**15), startlr[2]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.freeze_to(-12)
        lrs = learn.lr_range(slice(startlr[3] / (1.6**15), startlr[3]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.unfreeze()
        lrs = learn.lr_range(slice(startlr[4] / (1.6**15), startlr[4]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))

    if do_eval:
        res = learn.validate(test_dl, metrics=metrics)
        met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])]
        print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}')
        results.append(['val', '-', res[1], res[0], '-', '-'])

    with open(log_dir / (name + '.csv'), 'a') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(results)
Code example #16
File: train.py Project: rollben/bert-NER
    data_loader = DataLoader(args.data_dir,
                             args.bert_model_dir,
                             params,
                             token_pad_idx=0)

    # Load training data and test data
    train_data = data_loader.load_data('train')
    val_data = data_loader.load_data('val')

    # Specify the training and validation dataset sizes
    params.train_size = train_data['size']
    params.val_size = val_data['size']

    # Prepare model
    model = BertForTokenClassification.from_pretrained(args.bert_model_dir,
                                                       num_labels=len(
                                                           params.tag2idx))
    model.to(params.device)
    if args.fp16:
        model.half()

    if params.n_gpu > 1 and args.multi_gpu:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if params.full_finetuning:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
Code example #17
 def __init__(self, opt):
     super(BertNer, self).__init__()
     self.opt = opt
     self.num_labels = self.opt.tag_nums
     self.bertForToken = BertForTokenClassification.from_pretrained(
         self.opt.bert_model_dir, num_labels=self.opt.tag_nums)
Code example #18
                    help='path to save the final model')
args = parser.parse_args()
if args.lang == "en":
    bert_model = "bert-base-uncased"
    model_path = "/workdir/pretrain-model/bert-torch"
elif args.lang == "cn":
    bert_model = "bert-base-chinese"
    model_path = "/workdir/pretrain-model/bert-torch-cn"
bert_model = "bert-base-uncased"
# model_path = "D:/Github/BERT-Keyword-Extractor/model/en_model.pt"
tag2idx = {'B': 0, 'I': 1, 'O': 2}
tags_vals = ['B', 'I', 'O']

tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
# cache_dir='/workdir/pretrain-model/bert-torch-cn')
model = BertForTokenClassification.from_pretrained(bert_model,
                                                   num_labels=len(tag2idx))


# cache_dir='/workdir/pretrain-model/bert-torch-cn')
def casting(tkns, prediction):
    # expand a hit on a word piece so that it covers the whole word
    for k, j in enumerate(prediction):
        if j == 1 or j == 0:
            # this is a '##' continuation piece inside a word
            if not tkns[k].find('##') == -1:
                prediction[k] = 1
                forwd = False
                backward = False
                for i in range(int(len(tkns) / 2)):
                    # forward
                    if k - i >= 0:
Code example #19
def train(args: Dict):
    MAX_LEN = int(args['--max-len'])
    bs = int(args['--batch-size'])
    model_root = args['--model-root'] if args['--model-root'] else './models'

    dataLoader = sentence.Sentence(args['--train-src'])

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    if args['--cuda']:
        n_gpu = torch.cuda.device_count()
        torch.cuda.get_device_name(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]

    print(dataLoader.sentences[0])
    print(tokenized_texts[0])

    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                         maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]

    """
    The BERT Model requires us to have a [SEP] token at the end of each sentence as a part of its preprocessing. 102 is the index BERT recognizes as the index of [SEP]. Hence, I am adding it to the end of the sentence after padding/truncating
    (as it might have been removed if the sequences were greater than 75 in length) to be compatible with BERT's requirement. I didn't have it in the beginning and I thought it would be the reason for the poor results but changing it didn't help and I chose to keep it anyways as it felt right. :)
    """
    for i, inp in enumerate(input_ids):
        if (102 not in inp):
            inp[-1] = 102
            tags[i][-1] = dataLoader.tag2idx.get("O")

    tts = float(args['--train-test-split'])

    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                                random_state=10, test_size=tts)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                 random_state=10, test_size=tts)

    tr_inputs = torch.tensor(tr_inputs).to(torch.int64)
    val_inputs = torch.tensor(val_inputs).to(torch.int64)
    tr_tags = torch.tensor(tr_tags).to(torch.int64)
    val_tags = torch.tensor(val_tags).to(torch.int64)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

    model = BertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(dataLoader.tag2idx))

    if args['--cuda']:
        model.cuda()

    FULL_FINETUNING = bool(args['--full-finetuning'])
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    optimizer = Adam(optimizer_grouped_parameters, lr=float(args['--lr']))

    epochs = int(args['--max-epoch'])
    max_grad_norm = 1.0
    hist_valid_scores = []

    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # forward pass
            loss = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask, labels=b_labels)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        # VALIDATION on validation set
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                      attention_mask=b_input_mask, labels=b_labels)
                logits = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)

            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        pred_tags = [dataLoader.tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [dataLoader.tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        f1 = f1_score(valid_tags, pred_tags)
        print("F1-Score: {}".format(f1))

        is_better = len(hist_valid_scores) == 0 or f1 > max(hist_valid_scores)
        hist_valid_scores.append(f1)
        if is_better:
            output_model_file = os.path.join(model_root, "model_file.bin")
            output_config_file = os.path.join(model_root, "config_file.bin")
            output_vocab_file = model_root

            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_vocab_file)

    print('reached maximum number of epochs!', file=sys.stderr)
    exit(0)
Code example #20
                                 device=device)

    training_dataset = TensorDataset(all_input_ids, all_input_mask,
                                     all_segment_ids, all_label_ids)
    training_sampler = RandomSampler(training_dataset)
    training_dataloader = DataLoader(training_dataset,
                                     sampler=training_sampler,
                                     batch_size=training_batch_size)

    num_train_steps = int(
        len(training_features) / training_batch_size /
        gradient_accumulation_steps * num_train_epochs)
    t_total = num_train_steps

    if if_bert:
        model = BertForTokenClassification.from_pretrained(
            '../../model/bert-base-chinese.tar.gz', num_labels=3).to(device)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
Code example #21
def run_ner(
        lang: str = 'eng',
        log_dir: str = 'logs',
        task: str = NER,
        batch_size: int = 1,
        lr: float = 5e-5,
        epochs: int = 1,
        dataset: str = 'data/conll-2003/',
        loss: str = 'cross',
        max_seq_len: int = 128,
        do_lower_case: bool = False,
        warmup_proportion: float = 0.1,
        grad_acc_steps: int = 1,
        rand_seed: int = None,
        fp16: bool = False,
        loss_scale: float = None,
        ds_size: int = None,
        data_bunch_path: str = 'data/conll-2003/db',
        bertAdam: bool = False,
        freez: bool = False,
        one_cycle: bool = False,
        discr: bool = False,
        lrm: float = 2.6,
        div: int = None,
        tuned_learner: str = None,
        do_train: bool = False,
        do_eval: bool = False,
        save: bool = False,
        name: str = 'ner',
        mask: tuple = ('s', 's'),
):
    name = "_".join(
        map(str, [
            name, task, lang, mask[0], mask[1], loss, batch_size, lr,
            max_seq_len, do_train, do_eval
        ]))

    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    init_logger(log_dir, name)

    if rand_seed:
        random.seed(rand_seed)
        np.random.seed(rand_seed)
        torch.manual_seed(rand_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(rand_seed)

    trainset = dataset + lang + '/train.txt'
    devset = dataset + lang + '/dev.txt'
    testset = dataset + lang + '/test.txt'

    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')

    model = torch.nn.DataParallel(model)
    model_lr_group = bert_layer_list(model)
    layers = len(model_lr_group)
    kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask}

    train_dl = DataLoader(dataset=NerDataset(trainset,
                                             bert_model,
                                             train=True,
                                             **kwargs),
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=partial(pad, train=True))

    dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs),
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=pad)

    test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs),
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=pad)

    data = DataBunch(train_dl=train_dl,
                     valid_dl=dev_dl,
                     test_dl=test_dl,
                     collate_fn=pad,
                     path=Path(data_bunch_path))

    loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func,
                                                             zero=True)
    metrics = [Conll_F1()]

    learn = Learner(
        data,
        model,
        BertAdam,
        loss_func=loss_fun,
        metrics=metrics,
        true_wd=False,
        layer_groups=None if not freez else model_lr_group,
        path='learn',
    )

    # initialise bert adam optimiser
    train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs
    optim = BertAdam(model.parameters(),
                     lr=lr,
                     warmup=warmup_proportion,
                     t_total=train_opt_steps)

    if bertAdam: learn.opt = OptimWrapper(optim)
    else: print("No Bert Adam")

    # load fine-tuned learner
    if tuned_learner:
        print('Loading pretrained learner: ', tuned_learner)
        learn.load(tuned_learner)

    # Uncomment to graph learning rate plot
    # learn.lr_find()
    # learn.recorder.plot(skip_end=15)

    # set lr (discriminative learning rates)
    if div: layers = div
    lrs = lr if not discr else learn.lr_range(slice(lr / lrm**(layers), lr))

    results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']]

    if do_train:
        for epoch in range(epochs):
            if freez:
                lay = (layers // (epochs - 1)) * epoch * -1
                if lay == 0:
                    print('Freeze')
                    learn.freeze()
                elif lay == layers:
                    print('unfreeze')
                    learn.unfreeze()
                else:
                    print('freeze2')
                    learn.freeze_to(lay)
                print('Freezing layers ', lay, ' off ', layers)

            # Fit Learner - eg train model
            if one_cycle: learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
            else: learn.fit(1, lrs)

            results.append([
                epoch,
                lrs,
                learn.recorder.metrics[0][0],
                learn.recorder.val_losses[0],
                np.array(learn.recorder.losses).mean(),
                learn.recorder.losses,
            ])

            if save:
                m_path = learn.save(f"{lang}_{epoch}_model", return_path=True)
                print(f'Saved model to {m_path}')
    if save: learn.export(f'{lang}.pkl')

    if do_eval:
        res = learn.validate(test_dl, metrics=metrics)
        met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])]
        print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}')
        results.append(['val', '-', res[1], res[0], '-', '-'])

    with open(log_dir / (name + '.csv'), 'a') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(results)
Code example #22
File: builder.py Project: phretor/TwiTi
def build_model(full_fine_tunning=True, batch_size=32, epochs=3):
    df = load_data(DATA_PATH)
    df["word"] = df["word"].str.lower()

    data = getter(df)

    processed_texts = []
    processed_tags = []
    for item in data:
        string, tags = process_terms(item)
        processed_texts.append(string)
        processed_tags.append(tags)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True)
    tokenized_sents = [
        tokenizer.tokenize("[CLS] " + sent + " [SEP]")
        for sent in processed_texts
    ]
    tokenized_tags = label_tokenize(tokenized_sents, processed_tags)

    bert_sents, bert_labels = remove_long_sent(tokenized_sents, tokenized_tags)

    # indexing
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in bert_sents]
    label_ids = [[LABELS.get(l) for l in lab] for lab in bert_labels]

    input_ids_pad = pad_sequences(
        input_ids,
        maxlen=BERT_INPUT_SEQUENCE_LENGTH,
        dtype="long",
        truncating="post",
        padding="post",
    )
    labels_ids_pad = pad_sequences(
        label_ids,
        maxlen=BERT_INPUT_SEQUENCE_LENGTH,
        value=LABELS["O"],
        dtype="long",
        truncating="post",
        padding="post",
    )

    attention_masks = []
    for seq in input_ids_pad:
        mask = [float(i > 0) for i in seq]
        attention_masks.append(mask)

    train_data = TensorDataset(
        torch.tensor(input_ids_pad),
        torch.tensor(attention_masks),
        torch.tensor(labels_ids_pad),
    )
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("GPU: {}".format(device))
    print("Number of GPUs: {}".format(n_gpu))
    if device == torch.device("cuda"):
        board = torch.cuda.get_device_name()
        print("Board: {}".format(board))

    model = BertForTokenClassification.from_pretrained("bert-base-uncased",
                                                       num_labels=len(LABELS))

    if device == torch.device("cuda"):
        model.cuda()

    if full_fine_tunning:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate":
                0.0,
            },
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]

    optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1)

    tr_loss_set = []

    for epoch in range(epochs):
        # train
        model.train()

        tr_loss = 0
        nb_tr_steps = 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_masks, b_labels = batch

            logits = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_masks)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, len(LABELS)), b_labels.view(-1))

            tr_loss_set.append(loss.item())

            loss.backward()

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                           max_norm=1.0)

            optimizer.step()
            model.zero_grad()

            tr_loss += loss.item()
            nb_tr_steps += 1

        print(f"# of EPOCH: {epoch}")
        print("Train loss: {}".format(tr_loss / nb_tr_steps))

    torch.save(model.state_dict(), str(MODEL_PATH))
    model.config.to_json_file(MODEL_CONFIG_PATH)
Code example #23
 def load_token_classifier(self, classifier_model_name, tag2idx):
     self.model = BertForTokenClassification.from_pretrained(classifier_model_name, num_labels=len(tag2idx))
Code example #24
### CONFIG ###
MAX_LEN = 50
BATCH_SIZE = 32
TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-improved-sentiwordnet-arguingfullindiv-pos.tsv?token=AD7GEDK3MI27HVJPQWOE74C6FBZHA'
DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfullindiv-pos.tsv?token=AD7GEDM3LOMZM6MP4HZS4MS6FBZHK'
EPOCHS = 3
MAX_GRAD_NORM = 1.0
##############


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(torch.cuda.get_device_name(0))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
model = BertForTokenClassification.from_pretrained('bert-base-uncased',
                                                   num_labels=2)
model.cuda()


def get_comments(filename, url=True):
    if url:
        comments = []
        with urllib.request.urlopen(filename) as f:
            for line in f:
                if line.startswith(b'#'):
                    comments.append(line.decode("utf-8"))
                else:
                    break
        return comments
    with open(filename, 'r', encoding='utf8') as f:
        commentiter = takewhile(lambda s: s.startswith('#'), f)
Code example #25
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default='/home/adzuser/user_achyuta/BERT_NER_Test/BERT-NER/NERdata/',
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default='NER',
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default='ner_output',
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run test on the test set.")
    parser.add_argument("--do_pred",
                        action='store_true',
                        help="Whether to run pred on the pred set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs",
        default=4.0,  #3.0,
        type=float,
        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--clip',
                        type=float,
                        default=0.5,
                        help="gradient clipping")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")

    parser.add_argument('--text_a', type=str, default='', help="input text_a.")
    parser.add_argument('--text_b', type=str, default='', help="input text_b.")

    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    num_labels_task = {
        "ner": 17  #6#12
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_pred:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        #train_examples = train_examples[:1000]
        print("train_examples :: ", len(list(train_examples)))
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    #imodel = BertForSequenceClassification.from_pretrained(args.bert_model,
    #          cache_dir=cache_dir,
    #          num_labels = num_labels)
    model = BertForTokenClassification.from_pretrained(args.bert_model,
                                                       cache_dir=cache_dir,
                                                       num_labels=num_labels)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
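    # Two parameter groups: most weights get 0.01 weight decay, while bias and
    # LayerNorm parameters are exempt, as is conventional for BERT fine-tuning.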
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
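        # BertAdam applies the linear warmup/decay schedule internally, warming up
        # for warmup_proportion of the t_total optimization steps.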

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # gradient clipping
                if args.clip is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.clip)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        #model = BertForSequenceClassification(config, num_labels=num_labels)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        #model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
        # Load a trained model and config that you have fine-tuned
        print('for eval only......................')
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = BertConfig(output_config_file)
        #model = BertForSequenceClassification(config, num_labels=num_labels)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        #import pdb;pdb.set_trace()
        print("dev_eaxmples :: ", len(list(eval_examples)))
        eval_features = convert_examples_to_features_pred(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        #predictions1 , true_labels1 = [], []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                      label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            # get index till '[SEP]'
            #print("label_list index SEP : ",label_list.index('[SEP]'))
            pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
            pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
            label_ids_xx = [
                i[:i.index(label_list.index('[SEP]'))]
                for i in label_ids.tolist()
            ]
            #print(label_ids_xx)
            #print(pred_xx)

            # pad the shorter of the gold / predicted sequences with the sentinel
            # index 31 so they can be compared position by position
            tmp_s = [
                max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx)
            ]
            tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i,
                      j + [31] * (k - len(j)) if len(j) != k else j)
                     for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)]
            tmp_d1 = [h[0] for h in tmp_u]
            tmp_d2 = [h[1] for h in tmp_u]

            #print([list(p) for p in np.argmax(logits, axis=2)][:5])
            #tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx)
            #tmp_eval_accuracy = flat_accc(tmp_d1, tmp_d2)
            predictions.extend(tmp_d2)
            true_labels.append(tmp_d1)
            #predictions1.extend(pred_xx)
            #true_labels1.append(label_ids_xx)

            #print("tmp accuracy : ",tmp_eval_accuracy)
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps
        loss = tr_loss / nb_tr_steps if args.do_train else None

        pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p]
                     for p in predictions]
        valid_tags = [[
            label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i
        ] for l in true_labels for l_i in l]
        print("valid_tags : ", valid_tags[:10])
        print("pred_tags : ", pred_tags[:10])
        print("Validation F1-Score: {}".format(f1_score(valid_tags,
                                                        pred_tags)))
        print("Validation accuracy_score : {}".format(
            accuracy_score(valid_tags, pred_tags)))
        print("Validation classification_report : {}".format(
            classification_report(valid_tags, pred_tags)))

        #print("X Validation F1-Score: {}".format(f1_score(true_labels1, predictions1)))
        #print("X Validation accuracy_score : {}".format(accuracy_score(true_labels1, predictions1)))
        #print("X Validation classification_report : {}".format(classification_report(true_labels1, predictions1)))

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }
        print(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(args.data_dir)
        print('test examples len : {}'.format(len(eval_examples)))
        #import pdb;pdb.set_trace()
        eval_features = convert_examples_to_features_pred(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                      label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            # get index till '[SEP]'
            #print("label_list index SEP : ",label_list.index('[SEP]'))
            pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
            pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
            label_ids_xx = [
                i[:i.index(label_list.index('[SEP]'))]
                for i in label_ids.tolist()
            ]
            #print(label_ids_xx)
            #print(pred_xx)

            # pad the shorter of the gold / predicted sequences with the sentinel
            # index 31 so they can be compared position by position
            tmp_s = [
                max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx)
            ]
            tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i,
                      j + [31] * (k - len(j)) if len(j) != k else j)
                     for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)]
            tmp_d1 = [h[0] for h in tmp_u]
            tmp_d2 = [h[1] for h in tmp_u]

            #print([list(p) for p in np.argmax(logits, axis=2)][:5])
            #tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx)
            #tmp_eval_accuracy = flat_accc(tmp_d1, tmp_d2)
            predictions.extend(tmp_d2)
            true_labels.append(tmp_d1)
            #print("tmp accuracy : ",tmp_eval_accuracy)
            test_loss += tmp_eval_loss.mean().item()
            test_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        test_loss = test_loss / nb_eval_steps
        test_accuracy = test_accuracy / nb_eval_steps
        loss = tr_loss / nb_tr_steps if args.do_train else None

        pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p]
                     for p in predictions]
        valid_tags = [[
            label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i
        ] for l in true_labels for l_i in l]
        print("valid_tags : ", valid_tags[:10])
        print("pred_tags : ", pred_tags[:10])
        print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
        print("Test accuracy_score : {}".format(
            accuracy_score(valid_tags, pred_tags)))
        print("Test classification_report : {}".format(
            classification_report(valid_tags, pred_tags)))

        #print("X Test F1-Score: {}".format(f1_score(true_labels, predictions)))
        #print("X Test accuracy_score : {}".format(accuracy_score(true_labels, predictions)))
        #print("X Test classification_report : {}".format(classification_report(true_labels, predictions)))

        result = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'global_step': global_step,
            'loss': loss
        }
        print(result)
        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_pred and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        #eval_examples = processor.get_dev_examples(args.data_dir)
        model.eval()
        while True:
            print(
                'Enter a text to get NER predictions, or press Ctrl+C to close the session.'
            )
            text_a = input('>>>')
            #"Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . ."
            eval_examples = {
                'text_a': text_a,
                'text_b':
                "The foodservice pie business does not fit our long-term growth strategy .",
                'label': '1',
                'guid': '12345'
            }

            eval_features = convert_examples_to_features_test(
                eval_examples, label_list, args.max_seq_length, tokenizer)

            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            #model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            predictions, true_labels = [], []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                          label_ids)
                    logits = model(input_ids, segment_ids, input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()

                pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
                pred_xx = [
                    i[:i.index(label_list.index('[SEP]'))] for i in pred_xx
                ]

                print(pred_xx)
                print([[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p]
                       for p in pred_xx])
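The predicted indices printed above can also be paired with the WordPiece tokens of the input for readability. A minimal sketch, assuming the tokenizer, text_a, label_list and pred_xx defined above, and that the first truncated sequence corresponds to [CLS] followed by the tokens of text_a:

# hedged sketch: zip tokens with predicted tags (lengths may differ slightly
# if the model mis-places the [SEP] prediction)
tokens = ['[CLS]'] + tokenizer.tokenize(text_a)
tags = [label_list[p_i] for p_i in pred_xx[0]]
for token, tag in zip(tokens, tags):
    print('%s\t%s' % (token, tag))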
Code example #26
def do_07_bert(data: pd.DataFrame, cv=5):
    # This has multiple issues that we couldn't fix at the moment

    # TODO: Try with cased model
    # TODO: Try with data already tokenized by BERT tokenizer
    # TODO: Fix out-of-memory error if possible
    model_name = 'bert-base-uncased'

    getter = SentenceGetter(data)

    # we need actual sentences this time as bert provides a tagger we will re-use
    sentences = [
        " ".join([s[0] for s in sentence]) for sentence in getter.sentences
    ]
    labels = [[s[2] for s in sent] for sent in getter.sentences]

    tags_vals = list(set(data["Tag"].values))
    tag2idx = {t: i for i, t in enumerate(tags_vals)}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    bs = batch_size = 32

    # use BERT's tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    MAX_LEN = max(len(s) for s in tokenized_texts)
    print("MAX_LEN: %d" % MAX_LEN)

    # NOTE: The tutorial seems to assume that BERT and our input are basically tokenized
    #       to the same units, making the labels still applicable to the input
    #       Result of the below for the original texts is:
    #           Mean: 2.68, differing: 0.73
    # for i in [4, 7, 58, 1200]:
    #     print(sentences[i])
    #     print(tokenized_texts[i])
    #     print("---")
    # differences = [len(tokenized_texts[i]) - len(labels[i]) for i in range(len(sentences))]
    # differences = [d * -1  if d < 0 else d for d in differences]
    # mean = sum(differences) / len(differences)
    # print("Mean: %.2f, differing: %.2f" % (mean, len([d for d in differences if d != 0]) / len(sentences)))

    # Pad the inputs
    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
        maxlen=MAX_LEN,
        dtype="long",
        truncating="post",
        padding="post")

    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         value=tag2idx["O"],
                         maxlen=MAX_LEN,
                         dtype="long",
                         padding="post",
                         truncating="post")

    # Prepare test and training data
    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(
        input_ids, tags, random_state=2018, test_size=0.1)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks,
                                                 input_ids,
                                                 random_state=2018,
                                                 test_size=0.1)
    tr_inputs = torch.tensor(tr_inputs)
    val_inputs = torch.tensor(val_inputs)
    tr_tags = torch.tensor(tr_tags)
    val_tags = torch.tensor(val_tags)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    # training will be shuffled
    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=bs)

    # test data will be given sequentially
    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data,
                                  sampler=valid_sampler,
                                  batch_size=bs)

    # load the model and send params to gpu if available
    model = BertForTokenClassification.from_pretrained("bert-base-uncased",
                                                       num_labels=len(tag2idx))
    if device.type == "cuda":
        model.cuda()

    # Parameters for fine-tuning
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        # "We also add some weight_decay as regularization to the main weight matrices."
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
    else:
        # "If you have limited resources, you can also try to just train the linear classifier on
        # top of Bert and keep all other weights fixed. This will still give you a good performance."
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]
    optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

    # A function for finetuning
    def flat_accuracy(preds, labels):
        pred_flat = np.argmax(preds, axis=2).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)
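
    # Note: this token-level accuracy also counts padded positions (labelled "O"),
    # so it can overestimate performance on the actual entity tokens.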

    # RUN FINE-TUNING
    epochs = 5
    max_grad_norm = 1.0

    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            # add batch to gpu
            print(batch)
            batch = tuple(t.to(device) for t in batch)
            print(batch)
            b_input_ids, b_input_mask, b_labels = batch
            # forward pass
            loss = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask,
                         labels=b_labels)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                           max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        # VALIDATION on validation set
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                tmp_eval_loss = model(b_input_ids,
                                      token_type_ids=None,
                                      attention_mask=b_input_mask,
                                      labels=b_labels)
                logits = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)

            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [
            tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i
        ]
        print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

    # EVALUATION
    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask,
                                  labels=b_labels)
            logits = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask)

        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
    valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels
                  for l_i in l]
    print("Validation loss: {}".format(eval_loss / nb_eval_steps))
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

    exit()
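A hypothetical invocation of the function above, assuming a DataFrame in the Sentence #/Word/POS/Tag layout that SentenceGetter appears to expect (the file name is only illustrative):

data = pd.read_csv("ner_dataset.csv", encoding="latin1").fillna(method="ffill")
do_07_bert(data, cv=5)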
Code example #27
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

model = BertForTokenClassification.from_pretrained("bert-base-uncased",
                                                   num_labels=len(tag2idx))
model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
Code example #28
File: main.py Project: koukaiu/dlut-nihao
def train(config):
    print('-' * 50)
    print('Loading pre-trained transfer model......')
    model = BertForTokenClassification.from_pretrained(
        config['model_name'], num_labels=config['tagset_size'] + 1)
    if config['update_model']:
        load_model_name = config['update_model']
        print('The update model to load is: ' + load_model_name)
        checkpoint = torch.load(load_model_name)
        model.load_state_dict(checkpoint['net'])
    model.to(config['device'])
    print('Load pre-trained transfer model done!')
    print('-' * 50)
    print('Deploying the training data......')
    train_data_loader = prepare_data(config)
    print('The training data has %d batches with a batch size of %d' %
          (len(train_data_loader), config['batch_size']))
    if os.path.isfile(config['valid_file']):
        print('Dev file found! Deploying the dev data......')
        config['mode'] = config['mode'].replace('train', 'valid')
        valid_sents, valid_data_loader = prepare_data(config)
        print('The validation data has been loaded!')
        config['mode'] = config['mode'].replace('valid', 'train')
    else:
        print('the valid file ' + config['valid_file'] +
              ' does not exist, please check it in config.txt')
    print('-' * 50)
    print('Train step! The model runs on ' + str(config['device']))
    loss_list = dict()
    # train set
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config['lr'],
                         schedule=None)
    loss_function = nn.NLLLoss(ignore_index=0)
    best_f1 = 0.0
    for epoch in range(config['epochs']):
        model.train()
        model.zero_grad()
        total_loss = 0
        batch_step = 0
        for batch, data in enumerate(train_data_loader):
            batch_step += 1
            inputs, labels, seq_len = data
            batch_size = inputs.size(0)
            seql = inputs.size(1) - 2
            segments_tensors = torch.zeros(inputs.size(0),
                                           inputs.size(1),
                                           dtype=torch.int64)
            logits = model(input_ids=inputs.to(config['device']),
                           token_type_ids=segments_tensors.to(
                               config['device']),
                           attention_mask=None,
                           labels=None)
            logits = logits[:, 1:-1, :]
            logits = logits.reshape(batch_size * seql, -1)
            logits = F.log_softmax(logits, 1)
            labels = labels.to(config['device'])
            loss = loss_function(logits, labels.view(batch_size * seql))
            total_loss += float(loss)
            loss.backward()
            optimizer.step()
            model.zero_grad()
            print("\rEpoch: %d ! the process is in %d of %d ! " %
                  (epoch + 1, batch + 1, len(train_data_loader)),
                  end='')
        loss_avg = total_loss / batch_step
        loss_list[epoch] = loss_avg
        print("The loss is %f ! " % (loss_avg))

        # valid process
        if os.path.isfile(config['valid_file']) and os.path.isfile(
                config['gold_file']):
            model.eval()
            with torch.no_grad():
                valid_results = predict(model, valid_data_loader, config)
                valid_results = restore_result(valid_sents, valid_results)
                tmp_filename = ''.join(
                    random.sample(string.ascii_letters + string.digits, 8))
                write(valid_results, tmp_filename + '.txt')
                # create an empty file
                ftmp = open(tmp_filename + '_dict.txt', 'w', encoding='utf8')
                ftmp.close()
                res = score_shell(tmp_filename + '_dict.txt',
                                  config['gold_file'], tmp_filename + '.txt',
                                  tmp_filename + '_score.txt')
                if res == 0:
                    get_score_cmd = 'grep \'F MEASURE\' ' + tmp_filename + '_score.txt'
                    f1 = os.popen(get_score_cmd).read().replace('\n', '')
                    print('The evaluation of epoch {} is {} !'.format(
                        str(epoch + 1), f1))
                else:
                    print(
                        'The score command failed, please check it or remove the validation step'
                    )
                os.system('rm ' + tmp_filename + '_dict.txt')
                os.system('rm ' + tmp_filename + '.txt')
                os.system('rm ' + tmp_filename + '_score.txt')
        # model save process
        if config['model_path'] and (epoch + 1) % int(
                config['save_model_epochs']) == 0:
            state = {
                'net': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch
            }
            if config['save_model_name']:
                model_name = os.path.join(
                    config['model_path'],
                    config['save_model_name'] + '_' + str(epoch + 1) + '.pkl')
            else:
                model_name = os.path.join(config['model_path'],
                                          str(epoch + 1) + '.pkl')
            torch.save(state, model_name)
            print('Epoch %d saved successfully as %s !' %
                  (epoch + 1, model_name))

    print('train done!')
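A minimal sketch of the config dictionary that train() reads, with placeholder values; the keys are taken from the code above, and prepare_data() may require additional keys not shown here:

config = {
    'model_name': 'bert-base-chinese',   # placeholder pre-trained model name
    'tagset_size': 4,                    # placeholder number of tags
    'update_model': '',                  # checkpoint to resume from, or empty
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'mode': 'train',
    'batch_size': 32,
    'lr': 2e-5,
    'epochs': 10,
    'valid_file': 'dev.txt',             # illustrative paths
    'gold_file': 'dev_gold.txt',
    'model_path': 'checkpoints',
    'save_model_epochs': 1,
    'save_model_name': 'bert_tagger',
}
train(config)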
Code example #29
File: train_ner.py Project: zatcsc/neuralnet-data
    test_y_tensor = torch.tensor(test_output_ids).to(device)
    test_mask_tensor = torch.tensor(test_attention_masks).to(device)

    train_data = TensorDataset(train_x_tensor, train_mask_tensor,
                               train_y_tensor)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=bs)

    test_data = TensorDataset(test_x_tensor, test_mask_tensor, test_y_tensor)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=bs)
    model = BertForTokenClassification.from_pretrained(
        "bert-base-cased", num_labels=len(label2id)).to(device)
    FULL_FINETUNING = False
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
Code example #30
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

model = BertForTokenClassification.from_pretrained(bert_model,
                                                   num_labels=len(tag2idx),
                                                   cache_dir=utiler.cache_dir)

model = model.cuda()
# model = torch.nn.DataParallel(model)

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {