Example #1
    def __init__(self, embedding_dir, model_name="bert-base-multilingual-cased", layer=-2):
        super(BertEncoder, self).__init__(embedding_dir)

        # Load pre-trained model (weights) and set to evaluation mode (no more training)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()

        # Load word piece tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

        # Layer from which to get the embeddings
        self.layer = layer
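
A minimal sketch, not part of the original class, of how an encode method could use these attributes; it assumes the pytorch_pretrained_bert convention that the model returns a list of per-layer hidden states when output_all_encoded_layers=True, and that torch is imported:

    def encode(self, sentence):
        # Hypothetical helper: tokenize, run BERT, return the chosen layer.
        tokens = ["[CLS]"] + self.tokenizer.tokenize(sentence) + ["[SEP]"]
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([token_ids])
        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor,
                                           output_all_encoded_layers=True)
        # encoded_layers is a list with one (1, seq_len, hidden) tensor per layer.
        return encoded_layers[self.layer].squeeze(0)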
Example #2
vocab, embd = data_helpers.build_vocab(FLAGS.dataset,
                                       FLAGS.pretrained_embeddings_path)
if len(FLAGS.pretrained_embeddings_path) > 0:
    assert (embd.shape[1] == FLAGS.embedding_dim)
    with open('{}/embd.pkl'.format(FLAGS.dataset), 'wb') as fout:
        pickle.dump(embd, fout)
with open('{}/vocab.pkl'.format(FLAGS.dataset), 'wb') as fout:
    pickle.dump(vocab, fout)
alist = data_helpers.read_alist_standalone(FLAGS.dataset, "vocab.txt",
                                           FLAGS.max_sequence_length_a,
                                           FLAGS.padding)
raw, raw_dict = data_helpers.read_raw_bert(FLAGS.dataset)
devList = data_helpers.loadTestSet(FLAGS.dataset, "valid.data")
testList = data_helpers.loadTestSet(FLAGS.dataset, "test.data")
testallList = data_helpers.loadTestSet(FLAGS.dataset, "test.data")  # testall
tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                          do_lower_case=False)

print("Load done...")
if not os.path.exists('./log/'):
    os.mkdir('./log/')
log_precision = 'log/{}.test.gan_precision.{}.log'.format(
    FLAGS.prefix, timeStamp)
log_loss = 'log/{}.test.gan_loss.{}.log'.format(FLAGS.prefix, timeStamp)


def log_time_delta(func):
    @wraps(func)
    def _deco(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        end = time.time()
        delta = end - start
        # Completion of the truncated helper: report elapsed time, return result.
        print("%s runs %.2f seconds" % (func.__name__, delta))
        return ret
    return _deco
Example #3
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read in the file
with open('train.txt', mode='r', encoding='utf-8') as file:
    sentence = list()
    raw_sentence = list()
    category = list()
    
    while True:
        data = file.readline()
        
        if data:
            category.append(data.split('\t')[0])
            sentence.append(data.split('\t')[1])
            raw_sentence.append(data.split('\t')[1])
        else:
            break

tokenized_input_text = list()
indexed_input_text = list()

tokenized_output_text = list()
indexed_output_text = list()

for (sentence1, sentence2) in zip(sentence, category):
    sentence1 = "[CLS] " + sentence1 + " [SEP]"
    sentence2 = "[CLS] " + sentence2 + " [SEP]"
    
Example #4
import numpy as np
import pandas as pd
from pprint import pprint
from typing import Tuple, Callable, List
import pickle
import json
from tqdm import tqdm
from collections import OrderedDict
import re

from pathlib import Path
from torch.utils.data import Dataset  # needed for the Dataset base class used below
from pytorch_pretrained_bert import BertTokenizer
from vocab import Vocabulary
from pad_sequence import keras_pad_fn, my_pad

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


class NamedEntityRecognitionDataset(Dataset):
    def __init__(self,
                 train_data_dir: str,
                 vocab,
                 tokenizer=bert_tokenizer,
                 maxlen=30,
                 model_dir=Path('data_in')) -> None:
        """
        :param train_data_in:
        :param transform_fn:
        """
        self.model_dir = model_dir
Example #5
model.to(device)

test_data = pd.read_table('../cnews/cnews.test.txt',
                          encoding='utf-8',
                          names=['label', 'text'])
le = LabelEncoder()
le.fit(test_data.label.tolist())
# Map labels to integer ids
test_data['label_id'] = le.transform(test_data.label.tolist())
labels_data = test_data.groupby(['label', 'label_id']).count().reset_index()
labels_map = labels_data[['label', 'label_id']]
test_data = test_data[['text', 'label_id']]

# Convert to tensors
# Word-piece tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('../chinese_wwm_ext_pytorch',
                                               do_lower_case=False)
# Instantiate the preprocessing class
processor = DataPrecessForSingleSentence(bert_tokenizer=bert_tokenizer)
test_seqs, test_seq_masks, test_seq_segments, test_labels = processor.get_input(
    dataset=test_data)
test_seqs = torch.tensor(test_seqs, dtype=torch.long)
test_seq_masks = torch.tensor(test_seq_masks, dtype=torch.long)
test_seq_segments = torch.tensor(test_seq_segments, dtype=torch.long)
test_labels = torch.tensor(test_labels, dtype=torch.long)
test_data = TensorDataset(test_seqs, test_seq_masks, test_seq_segments,
                          test_labels)
test_dataloder = DataLoader(dataset=test_data, batch_size=192)
# Lists for storing predicted and true labels
true_labels = []
pred_labels = []
model.eval()
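
The snippet stops right after model.eval(); a minimal sketch of the evaluation loop that the prepared test_dataloder and the two label lists suggest, assuming model is a pytorch_pretrained_bert-style classifier that returns logits when labels=None:

with torch.no_grad():
    for batch_seqs, batch_masks, batch_segments, batch_labels in test_dataloder:
        logits = model(batch_seqs.to(device),
                       batch_segments.to(device),
                       batch_masks.to(device),
                       labels=None)
        pred_labels.extend(logits.argmax(dim=1).cpu().tolist())
        true_labels.extend(batch_labels.tolist())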
Example #6
def preprocessor(
    data_dir,
    task_name,
    split,
    bert_model_name="bert-base-uncased",
    max_data_samples=None,
    max_sequence_length=128,
):

    sentences, labels = parse_tsv(data_dir, task_name, split, max_data_samples)

    labels = torch.from_numpy(np.array(labels))

    do_lower_case = "uncased" in bert_model_name

    tokenizer = BertTokenizer.from_pretrained(
        bert_model_name, do_lower_case=do_lower_case
    )

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    for sentence in sentences:
        if len(sentence) not in [1, 2]:
            logger.error("Sentence length doesn't match.")

        # Tokenize sentences
        tokenized_sentence = [tokenizer.tokenize(sent) for sent in sentence]
        sent1_tokens = tokenized_sentence[0]
        sent2_tokens = tokenized_sentence[1] if len(tokenized_sentence) == 2 else None

        # One sentence case
        if len(tokenized_sentence) == 1:
            # Remove tokens that exceed the max_sequence_length
            if len(sent1_tokens) > max_sequence_length - 2:
                # Account for [CLS] and [SEP] with "- 2"
                sent1_tokens = sent1_tokens[: max_sequence_length - 2]
        # Two sentences case
        else:
            # Remove tokens that exceed the max_sequence_length
            while True:
                total_length = len(sent1_tokens) + len(sent2_tokens)
                # Account for [CLS], [SEP], [SEP] with "- 3"
                if total_length <= max_sequence_length - 3:
                    break
                if len(sent1_tokens) > len(sent2_tokens):
                    sent1_tokens.pop()
                else:
                    sent2_tokens.pop()

        # Convert to BERT manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)

        if sent2_tokens:
            tokens += sent2_tokens + ["[SEP]"]
            token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    return bert_token_ids, bert_token_segments, bert_token_masks, labels
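
The lists returned above hold variable-length tensors; a hedged sketch (the helper name is hypothetical) of padding them into fixed-size batches with torch.nn.utils.rnn.pad_sequence:

from torch.nn.utils.rnn import pad_sequence

def pad_bert_inputs(token_ids, token_segments, token_masks):
    # 0 is both the BERT [PAD] id and the "ignore this position" mask value.
    padded_ids = pad_sequence(token_ids, batch_first=True, padding_value=0)
    padded_segments = pad_sequence(token_segments, batch_first=True, padding_value=0)
    padded_masks = pad_sequence(token_masks, batch_first=True, padding_value=0)
    return padded_ids, padded_segments, padded_masks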
Example #7
def main():
    # train_df = pd.read_csv(TRAIN_PATH).sample(frac=1.0, random_state=seed)
    # train_size = int(len(train_df) * 0.9)
    train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size,
                                              random_state=seed)
    LOGGER.info(f'data_size is {len(train_df)}')
    LOGGER.info(f'train_size is {train_size}')

    y = np.where(train_df['target'] >= 0.5, 1, 0)
    y_aux = train_df[AUX_COLUMNS].values

    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True,
                                             False)
        if column != "target":
            identity_columns_new.append(column + "_bin")

    sample_weights = np.ones(len(train_df), dtype=np.float32)
    sample_weights += train_df[identity_columns_new].sum(axis=1)
    sample_weights += train_df['target_bin'] * (
        ~train_df[identity_columns_new]).sum(axis=1)
    sample_weights += (~train_df['target_bin']
                       ) * train_df[identity_columns_new].sum(axis=1) * 5
    sample_weights /= sample_weights.mean()

    with timer('preprocessing text'):
        # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH,
                                                  cache_dir=None,
                                                  do_lower_case=True)
        X_text = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"),
                               max_len, tokenizer)

    test_df = train_df[train_size:]

    with timer('train'):
        X_train, y_train, y_aux_train, w_train = (
            X_text[:train_size], y[:train_size],
            y_aux[:train_size], sample_weights[:train_size])
        X_val, y_val, y_aux_val, w_val = (
            X_text[train_size:], y[train_size:],
            y_aux[train_size:], sample_weights[train_size:])
        model = BertForSequenceClassification.from_pretrained(
            WORK_DIR, cache_dir=None, num_labels=n_labels)
        model.zero_grad()
        model = model.to(device)

        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.long),
            torch.tensor(y_train, dtype=torch.float))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long),
            torch.tensor(y_val, dtype=torch.float))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid,
                                                   batch_size=batch_size * 2,
                                                   shuffle=False)

        sample_weight_train = [w_train.values, np.ones_like(w_train)]
        sample_weight_val = [w_val.values, np.ones_like(w_val)]

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        num_train_optimization_steps = int(epochs * train_size / batch_size /
                                           accumulation_steps)
        total_step = int(epochs * train_size / batch_size)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=2e-5,
                             warmup=0.05,
                             t_total=num_train_optimization_steps)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        criterion = torch.nn.BCEWithLogitsLoss().to(device)

        LOGGER.info(f"Starting 1 epoch...")
        tr_loss, train_losses = train_one_epoch(model, train_loader, criterion,
                                                optimizer, device,
                                                accumulation_steps, total_step,
                                                n_labels)
        LOGGER.info(f'Mean train loss: {round(tr_loss,5)}')

        torch.save(model.state_dict(), '{}_dic'.format(exp))

        valid_loss, oof_pred = validate(model, valid_loader, criterion, device,
                                        n_labels)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    test_df["pred"] = oof_pred.reshape(-1)
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)

    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')

    test_df.to_csv("oof.csv", index=False)

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
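
As a sanity check on the sample-weight scheme used above, toy numbers (three hypothetical identity columns) reproduce the per-row formula before the division by the mean weight:

import numpy as np

# One toxic row (target_bin True) that mentions 1 of 3 identity groups:
identity_bins = np.array([True, False, False])
target_bin = True
w = 1.0
w += identity_bins.sum()                          # + 1 -> mentions an identity
w += target_bin * (~identity_bins).sum()          # + 2 -> toxic, identities not mentioned
w += (not target_bin) * identity_bins.sum() * 5   # + 0 -> only applies to non-toxic rows
print(w)  # 4.0 before normalisation by the dataset mean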
Example #8
def prewin(path):
    dirname = os.path.dirname(path)
    name = os.path.basename(path)
    rawname = os.path.splitext(name)[0]  # without extension

    if 'lit' in name or 'literal' in name or 'LOCATION' in name:
        label = 0
    else:
        if 'met' in name or 'metonymic' in name or 'mixed' in name:
            label = 1  # 1 is for METONYMY/NON-LITERAL, 0 is for LITERAL
        elif 'INSTITUTE' in name:
            label = 1
        elif 'TEAM' in name:
            label = 2
        elif 'ARTIFACT' in name:
            label = 3
        elif 'EVENT' in name:
            label = 4

    bert_version = 'bert-base-uncased'
    model = BertModel.from_pretrained(bert_version)
    model.eval()
    spacy_tokenizer = English(parser=False)
    bert_tokenizer = BertTokenizer.from_pretrained(bert_version)
    en_nlp = spacy.load('en')
    inp = codecs.open(path, mode="r", encoding="utf-8")
    # PLEASE FORMAT THE INPUT FILE AS ONE SENTENCE PER LINE. SEE BELOW:
    # ENTITY<SEP>sentence<ENT>ENTITY<ENT>rest of sentence.
    # Germany<SEP>Their privileges as permanent Security Council members, especially the right of veto,
    # had been increasingly questioned by <ENT>Germany<ENT> and Japan which, as major economic powers.
    out = []
    seq_length = 5  # A window of 5 is the DEFAULT for the PUBLICATION methodology. Feel free to experiment.

    for line in inp:
        line = line.split(u"<SEP>")
        sentence = line[1].split(u"<ENT>")
        entity = [t.text for t in spacy_tokenizer(sentence[1])]
        en_doc = en_nlp(u"".join(sentence).strip())
        words = []
        index = locate_entity(en_doc, entity,
                              spacy_tokenizer(sentence[0].strip()),
                              spacy_tokenizer(sentence[2].strip()))
        start = find_start(en_doc[index])

        # --------------------------------------------------------------------
        # Token map will be an int -> int mapping
        #    between the `spacy_tokens` index and the `bert_tokens` index.
        spacy_to_bert_map = []
        bert_tokens = []
        spacy_tokens = [token.text for token in en_doc]
        '''
            According to https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
                [CLS] and [SEP] tokens are important.
            Also, use the segment_ids to inform BERT
                that the input is just one sentence.
        '''
        spacy_tokens = ["[CLS]"] + spacy_tokens + ["[SEP]"]

        for orig_token in spacy_tokens:
            spacy_to_bert_map.append(len(bert_tokens))
            bert_tokens.extend(bert_tokenizer.tokenize(orig_token))

        segments_ids = [1] * len(bert_tokens)

        try:
            token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
            tokens_tensor = torch.tensor([token_ids])
            segments_tensors = torch.tensor([segments_ids])
            with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor,
                                          segments_tensors,
                                          output_all_encoded_layers=True)
            '''
                According to http://jalammar.github.io/illustrated-bert/
                    concatenating the last four hidden layers
                    is a good choice for contextualised ELMo-like word embeddings.

                Concatenation leads to very long tensors,
                so I decided to take the sum of the last four hidden layers.
                This is the second best approach according to the blog.
            '''
            bert_emb = torch.add(encoded_layers[-1], encoded_layers[-2]).add(
                encoded_layers[-3]).add(encoded_layers[-4]).squeeze()
            bert_emb_length = bert_emb.shape[-1]
            '''
                Perform summation of subword embeddings to compute word embeddings
                Another choice is to compute the average of the subword embeddings.
                Concatenation is obviously not a good choice here.
                Source: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

                Here, we perform summation of subword embeddings.
            '''
            cond_bert_emb = torch.zeros(len(spacy_tokens), bert_emb_length)
            for spacy_index in range(len(spacy_tokens)):
                start_bert_index = spacy_to_bert_map[spacy_index]
                try:
                    end_bert_index = spacy_to_bert_map[spacy_index + 1]
                except IndexError:
                    end_bert_index = len(bert_tokens)
                for foo in range(start_bert_index, end_bert_index):
                    cond_bert_emb[spacy_index] = cond_bert_emb[
                        spacy_index].add(bert_emb[foo])
        except ValueError:
            cond_bert_emb = torch.zeros(len(spacy_tokens), 768)
            print('ValueError Exception caught!')
        '''
            Since the two special tokens are added,
                strip bert embeddings appropriately.
            Now the BERT embeddings are in sync with the spacy parse.
        '''
        cond_bert_emb = cond_bert_emb[1:-1]
        assert (len(cond_bert_emb) == len(en_doc))
        # --------------------------------------------------------------------

        left = seq_length * ["0.0"]
        right = seq_length * ["0.0"]
        dep_left = seq_length * ["0.0"]
        dep_right = seq_length * ["0.0"]
        bert_left = torch.zeros((seq_length, bert_emb_length))
        bert_right = torch.zeros_like(bert_left)

        if start.i > index:
            if index + 1 < len(en_doc) and en_doc[index + 1].dep_ in [u"case", u"compound", u"amod"] \
                    and en_doc[index + 1].head == en_doc[index]:  # any neighbouring word that links to it
                right = pad(
                    [en_doc[index + 1].text] +
                    [t.text for t in en_doc[start.i:][:seq_length - 1]], False,
                    seq_length)
                dep_right = pad([en_doc[index + 1].dep_] +
                                [t.dep_
                                 for t in en_doc[start.i:]][:seq_length - 1],
                                False, seq_length)
                bert_right = bert_pad(
                    torch.cat((torch.unsqueeze(cond_bert_emb[index + 1], 0),
                               cond_bert_emb[start.i:][:seq_length - 1])),
                    False, seq_length)
            else:
                right = pad([t.text for t in en_doc[start.i:][:seq_length]],
                            False, seq_length)
                dep_right = pad([t.dep_
                                 for t in en_doc[start.i:]][:seq_length],
                                False, seq_length)
                bert_right = bert_pad(cond_bert_emb[start.i:][:seq_length],
                                      False, seq_length)
        else:
            if index - len(entity) >= 0 and en_doc[index - len(entity)].dep_ in [u"case", u"compound", u"amod"] \
                    and en_doc[index - len(entity)].head == en_doc[index]:  # any neighbouring word that links to it
                left = pad(
                    [t.text
                     for t in en_doc[:start.i + 1][-(seq_length - 1):]] +
                    [en_doc[index - len(entity)].text], True, seq_length)
                dep_left = pad(
                    [t.dep_
                     for t in en_doc[:start.i + 1]][-(seq_length - 1):] +
                    [en_doc[index - len(entity)].dep_], True, seq_length)
                bert_left = bert_pad(
                    torch.cat(
                        (cond_bert_emb[:start.i + 1][-(seq_length - 1):],
                         torch.unsqueeze(cond_bert_emb[index - len(entity)],
                                         0))), True, seq_length)
            else:
                left = pad(
                    [t.text for t in en_doc[:start.i + 1][-seq_length:]], True,
                    seq_length)
                dep_left = pad([t.dep_
                                for t in en_doc[:start.i + 1]][-seq_length:],
                               True, seq_length)
                bert_left = bert_pad(cond_bert_emb[:start.i + 1][-seq_length:],
                                     True, seq_length)
        assert (bert_left.shape == bert_right.shape)
        assert (len(left) == len(dep_left) == len(bert_left))
        assert (len(right) == len(dep_right) == len(bert_right))
        out.append(
            (left, dep_left, bert_left, right, dep_right, bert_right, label))
        #print(left, right)
        #print(dep_left, dep_right)
        #print(bert_left, bert_right)
        #print(label)
        #print(line[1])
    print("Processed:{} lines/sentences.".format(len(out)))
    dump_to_hdf5("{}/bert_pickles/{}_prewin.hdf5".format(dirname, rawname),
                 out)
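
A hedged usage sketch; the path is hypothetical, and the file must follow the ENTITY<SEP>sentence<ENT>ENTITY<ENT>rest format documented in the comments above:

if __name__ == "__main__":
    # Hypothetical input file with one annotated sentence per line.
    prewin("data/location_literal_examples.txt")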
Example #9
    def __init__(self, model_file='https://convlab.blob.core.windows.net/convlab-2/comer.zip',
                 embed_file='https://convlab.blob.core.windows.net/convlab-2/comer_embed.zip'):
        super().__init__()
        parser = argparse.ArgumentParser(description='predict.py')
        parser.add_argument('-config', default='config.yaml', type=str,
                            help="config file")
        parser.add_argument('-gpus', default=[0], nargs='+', type=int,
                            help="Use CUDA on the listed devices.")
        parser.add_argument('-restore', default='data/log/norml_mwNestedNOND128NOsaA1FNN/checkpoint.pt', type=str,
                            help="restore checkpoint")
        parser.add_argument('-seed', type=int, default=1234,
                            help="Random seed")
        parser.add_argument('-model', default='seq2seq', type=str,
                            help="Model selection")
        parser.add_argument('-score', default='', type=str,
                            help="score_fn")
        parser.add_argument('-pretrain', action='store_true',
                            help="load pretrain embedding")
        parser.add_argument('-limit', type=int, default=0,
                            help="data limit")
        parser.add_argument('-log', default='predict', type=str,
                            help="log directory")
        parser.add_argument('-unk', action='store_true',
                            help="replace unk")
        parser.add_argument('-memory', action='store_true',
                            help="memory efficiency")
        parser.add_argument('-beam_size', type=int, default=1,
                            help="beam search size")

        self.root_path = os.path.dirname(os.path.abspath(__file__))
        opt = parser.parse_args([])
        config = utils.read_config(os.path.join(self.root_path, opt.config))
        torch.manual_seed(opt.seed)
        use_cuda = True
        bert_type = 'bert-large-uncased'

        if os.path.exists(os.path.join(self.root_path, 'data/mwoz2_dm.dict')) and \
                os.path.exists(os.path.join(self.root_path, 'data/mwoz2_sl.dict')) and \
                os.path.exists(os.path.join(self.root_path, 'data/save_data.tgt.dict')) and \
                os.path.exists(os.path.join(self.root_path, 'data/log/norml_mwNestedNOND128NOsaA1FNN/checkpoint.pt')):
            pass
        else:
            output_dir = os.path.join(self.root_path, 'data')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print('Load from model_file param')
            archive_file = cached_path(model_file)
            archive = zipfile.ZipFile(archive_file, 'r')
            archive.extractall(self.root_path)
            archive.close()
        if not os.path.exists(os.path.join(self.root_path, 'data/emb_tgt_mw.pt')):
            output_dir = os.path.join(self.root_path, 'data')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print('Load from embed_file param')
            archive_file = cached_path(embed_file)
            archive = zipfile.ZipFile(archive_file, 'r')
            archive.extractall(self.root_path)
            archive.close()

        self.sl_dict = json.load(open(os.path.join(self.root_path, 'data/mwoz2_sl.dict')))
        self.dm_dict = json.load(open(os.path.join(self.root_path, 'data/mwoz2_dm.dict')))
        self.tokenizer = BertTokenizer.from_pretrained(bert_type)
        self.vocab = torch.load(os.path.join(self.root_path, 'data/save_data.tgt.dict'))
        self.reversed_vocab = {i: j for j, i in self.vocab.items()}

        pretrain_embed = {}
        pretrain_embed['slot'] = torch.load(os.path.join(self.root_path, 'data/emb_tgt_mw.pt'))

        print('building model...\n')
        bmodel = BertModel.from_pretrained(bert_type)
        bmodel.eval()
        if use_cuda:
            bmodel.to('cuda')
        self.model = getattr(models, opt.model)(config, self.vocab, self.vocab, use_cuda, bmodel,
                                                pretrain=pretrain_embed, score_fn=opt.score)

        print('loading checkpoint...\n')
        print(os.path.join(self.root_path, opt.restore))
        import sys
        sys.path.append(self.root_path)
        checkpoints = torch.load(os.path.join(self.root_path, opt.restore))
        self.model.load_state_dict(checkpoints['model'])
        self.model.cuda()
        self.model.eval()

        self.state = None
        self.init_session()
Example #10

config = Config(
    testing=False,
    bert_model_name="bert-base-chinese",
    #Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
    max_lr=3e-5,  # learning rate
    epochs=5,
    use_fp16=False,  # fastai can switch precision easily to speed up training: learner.to_fp16()
    bs=8,  # batch size
    max_seq_len=128,  # pick a suitable seq_length; values that are too large can make training very slow or cause errors
)

from pytorch_pretrained_bert import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


# Adapter that uses the BERT tokenizer for tokenization
class FastAiBertTokenizerAdapter(BaseTokenizer):
    """Wrap BertTokenizer as a fastai BaseTokenizer."""
    def __init__(self,
                 tokenizer: BertTokenizer,
                 max_seq_len: int = 128,
                 **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self
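
The adapter is cut off after __call__; in the usual fastai-plus-BERT pattern it also defines a tokenizer method that adds the special tokens and truncates to max_seq_len. A sketch of that method (an assumption, not shown in the original snippet):

    def tokenizer(self, t):
        # Reserve two positions for [CLS] and [SEP].
        return (["[CLS]"]
                + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
                + ["[SEP]"])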
Example #11
    def __init__(self, df, train_mode=True, labeled=True):
        super(ToxicDataset, self).__init__()
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
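
Only the constructor is shown; a minimal sketch of the usual companion methods, assuming (hypothetically) Jigsaw-style comment_text and target columns and that torch is imported:

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        # "comment_text" and "target" are assumed column names.
        tokens = (["[CLS]"]
                  + self.tokenizer.tokenize(row["comment_text"])[:510]
                  + ["[SEP]"])
        token_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
        if self.labeled:
            return token_ids, torch.tensor(row["target"], dtype=torch.float)
        return token_ids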
Example #12
dev_data_file = open(path + 'en_ewt-ud-dev.conllu', 'r', encoding='utf-8')
trainPos = joinParse(train_data_file)
devPos = joinParse(dev_data_file)

tagsTrain = list(set(word_pos[1] for sent in trainPos for word_pos in sent))

tag2idx = {tag: idx for idx, tag in enumerate(tagsTrain)}
tag2idx['<pad>'] = -1
idx2tag = {idx: tag for idx, tag in enumerate(tagsTrain)}
idx2tag[-1] = '<pad>'
idx2tag[17] = '<pad>'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if args.large:
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-cased', do_lower_case=False
    ) if case == 'case' else BertTokenizer.from_pretrained(
        'bert-large-uncased')
else:
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-cased', do_lower_case=False
    ) if case == 'case' else BertTokenizer.from_pretrained('bert-base-uncased')

model = Net(vocab_size=len(tag2idx), device=device, case=case)
model.to(device)
model = nn.DataParallel(model)

train_dataset = PosDataset(trainPos)
dev_dataset = PosDataset(devPos)

for epoch in range(args.epochs):
Example #13
def main(args):

    output_mode = OUTPUT_MODE
    cache_dir = CACHE_DIR

    # if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
    #     REPORTS_DIR += '/report_%d' % (len(os.listdir(REPORTS_DIR)))
    #     os.makedirs(REPORTS_DIR)
    # if not os.path.exists(REPORTS_DIR):
    #     os.makedirs(REPORTS_DIR)
    #     REPORTS_DIR += '/report_%d' % (len(os.listdir(REPORTS_DIR)))
    #     os.makedirs(REPORTS_DIR)

    if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                OUTPUT_DIR))
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    processor = BinaryClassificationProcessor()
    train_examples = processor.get_train_examples(DATA_DIR)
    train_examples_len = len(train_examples)

    label_list = processor.get_labels()  # [0, 1] for binary classification
    num_labels = len(label_list)

    num_train_optimization_steps = int(
        train_examples_len / TRAIN_BATCH_SIZE /
        GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)

    label_map = {label: i for i, label in enumerate(label_list)}
    train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH,
                                      tokenizer, OUTPUT_MODE)
                                     for example in train_examples]

    train_features = []
    for train_example in train_examples_for_processing:
        train_features.append(
            convert_examples_to_features.convert_example_to_feature(
                train_example))
    with open(DATA_DIR + "train_features.pkl", "wb") as f:
        pickle.dump(train_features, f)

    # Load pre-trained model (weights)
    model = BertForSequenceClassification.from_pretrained(
        BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=LEARNING_RATE,
                         warmup=WARMUP_PROPORTION,
                         t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_examples_len)
    logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)

    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=TRAIN_BATCH_SIZE)

    model.train()
    for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(
                tqdm_notebook(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            logits = model(input_ids, segment_ids, input_mask, labels=None)

            label_weights = torch.FloatTensor(np.asarray([1.0, 2.0]))
            label_weights_tensor = torch.autograd.Variable(
                label_weights, volatile=True).cuda()
            loss_fct = CrossEntropyLoss(weight=label_weights_tensor)

            # loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS

            loss.backward()
            # print("\r%f" % loss)

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        print('Avg train loss:%.4f' % (tr_loss * 1.0 / nb_tr_steps))

    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
    output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)

    tokenizer.save_vocabulary(OUTPUT_DIR)
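
Because the weights, config, and vocabulary were saved under the predefined names, they can be reloaded with from_pretrained for inference; a short sketch, assuming OUTPUT_DIR, num_labels, and device are still in scope:

model = BertForSequenceClassification.from_pretrained(OUTPUT_DIR,
                                                      num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR, do_lower_case=False)
model.to(device)
model.eval()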
Example #14
def train(args, dataset, text_process, generator, discriminator, inception_score):
    step = int(math.log2(args.init_size)) - 2
    resolution = 4 * 2 ** step
    loader = sample_data(
        dataset, args.batch.get(resolution, args.batch_default), resolution
    )
    
    data_loader = iter(loader)

    adjust_lr(g_optimizer, args.lr.get(resolution, 0.0001))
    adjust_lr(d_optimizer, args.lr.get(resolution, 0.0004))

    pbar = tqdm(range(0, 300_000))
    
    
    requires_grad(text_process, False)
    requires_grad(generator, False)
    requires_grad(discriminator, True)

    disc_loss_val = 0
    gen_loss_val = 0
    grad_loss_val = 0
    kl_loss_val = 0
    score = 0
    
    alpha = 0
    used_sample = 0

    max_step = int(math.log2(args.max_size)) - 2
    final_progress = False

    # fixed output
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    _, fixed_caption = next(data_loader)
    fixed_gen_i, fixed_gen_j = (10, 5)
    fixed_caption = fixed_caption.cuda()
    # fixed_c_code, _, _, _, _ = text_process(fixed_caption)
    descriptions = get_sentence(tokenizer,fixed_caption[:fixed_gen_j])
    
    with open(os.path.join(args.out, 'fixed_sentence.txt'), 'w') as f:
        for item in descriptions:
            f.write("%s\n" % item)
    
    # Training iteration
    for i in pbar:
        discriminator.zero_grad()

        alpha = min(1, 1 / args.phase * (used_sample + 1))
#         if (resolution == args.init_size and args.ckpt is None) or final_progress:
        if (resolution == args.init_size and args.ckpt is None) or final_progress:
            alpha = 1

        if used_sample > args.phase * 2:
            used_sample = 0
            step += 1

            if step > max_step:
                step = max_step
                final_progress = True
                ckpt_step = step + 1

            else:
                alpha = 0
                ckpt_step = step

            resolution = 4 * 2 ** step

            loader = sample_data(
                dataset, args.batch.get(resolution, args.batch_default), resolution
            )
            data_loader = iter(loader)

            torch.save(
                {
                    'text_process': text_process.module.state_dict(),
                    'generator': generator.module.state_dict(),
                    'discriminator': discriminator.module.state_dict(),
                    't_optimizer': t_optimizer.state_dict(),
                    'g_optimizer': g_optimizer.state_dict(),
                    'd_optimizer': d_optimizer.state_dict(),
                    'g_running': g_running.state_dict(),
                    't_running': t_running.state_dict(),
                },
                os.path.join(args.out, f'checkpoint/train_step-{ckpt_step}.model')
            )

            adjust_lr(t_optimizer, args.lr.get(resolution, 0.0001))
            adjust_lr(g_optimizer, args.lr.get(resolution, 0.0001))
            adjust_lr(d_optimizer, args.lr.get(resolution, 0.0004))

        try:
            real_image, caption = next(data_loader)

        except (OSError, StopIteration):
            data_loader = iter(loader)
            real_image, caption = next(data_loader)

        used_sample += real_image.shape[0]

        b_size = real_image.size(0)
        real_image = real_image.cuda()
        caption = caption.cuda()
#         c_code, sent_emb, words_embs, mu, log_var = text_process(caption)
        sent_emb, words_embs = text_process(caption)

        if args.loss == 'wgan-gp':
            real_predict = discriminator(real_image, sent_emb, step=step, alpha=alpha)
            real_predict = real_predict.mean() - 0.001 * (real_predict ** 2).mean()
            (-real_predict).backward(retain_graph=True)
        
        # TODO: adapt for a conditional GAN
        elif args.loss == 'r1':
            real_image.requires_grad = True
            real_scores = discriminator(real_image, sent_emb, step=step, alpha=alpha)
            real_predict = F.softplus(-real_scores).mean()
            real_predict.backward(retain_graph=True)

            grad_real = grad(
                outputs=real_scores.sum(), inputs=real_image, create_graph=True
            )[0]
            grad_penalty = (
                grad_real.view(grad_real.size(0), -1).norm(2, dim=1) ** 2
            ).mean()
            grad_penalty = 10 / 2 * grad_penalty
            grad_penalty.backward()
            if (i+1)%10 == 0:
                grad_loss_val = grad_penalty.item()
                
        # TODO: adapt for a conditional GAN
        if args.mixing and random.random() < 0.9:
            gen_in11, gen_in12, gen_in21, gen_in22 = torch.randn(
                4, b_size, code_size, device='cuda'
            ).chunk(4, 0)
            gen_in1 = [gen_in11.squeeze(0), gen_in12.squeeze(0)]
            gen_in2 = [gen_in21.squeeze(0), gen_in22.squeeze(0)]

        else:
            gen_in1, gen_in2 = torch.randn(2, b_size, 384, device='cuda').chunk(
                2, 0
            )
            gen_in1, gen_in2 = gen_in1.squeeze(0), gen_in2.squeeze(0)
            gen_in1 = torch.cat([gen_in1, sent_emb], dim=1)
            gen_in2 = torch.cat([gen_in2, sent_emb], dim=1)

        fake_image = generator(gen_in1, step=step, alpha=alpha)
        fake_predict = discriminator(fake_image, sent_emb, step=step, alpha=alpha)
        #mismatch_predict = discriminator(real_image[:(b_size-1)], sent_emb[1:b_size], step=step, alpha=alpha)

        if args.loss == 'wgan-gp':
            fake_predict = fake_predict.mean()
#             mismatch_predict = mismatch_predict.mean() / 2.
#             (fake_predict + mismatch_predict).backward(retain_graph=True)
            (fake_predict).backward(retain_graph=True)

            eps = torch.rand(b_size, 1, 1, 1).cuda()
            x_hat = eps * real_image.data + (1 - eps) * fake_image.data
            x_hat.requires_grad = True
            hat_predict = discriminator(x_hat, sent_emb, step=step, alpha=alpha)
            grad_x_hat = grad(
                outputs=hat_predict.sum(), inputs=x_hat, create_graph=True
            )[0]
            grad_penalty = (
                (grad_x_hat.view(grad_x_hat.size(0), -1).norm(2, dim=1) - 1) ** 2
            ).mean()
            grad_penalty = 10 * grad_penalty
            grad_penalty.backward()
            if (i+1)%10 == 0:
                grad_loss_val = grad_penalty.item()
                disc_loss_val = (-real_predict + fake_predict).item()

        ####Todo: fit for condition gan###
        elif args.loss == 'r1':
            fake_predict = F.softplus(fake_predict).mean()
            fake_predict.backward()
            if i%10 == 0:
                disc_loss_val = (real_predict + fake_predict).item()

        d_optimizer.step()

        if (i + 1) % n_critic == 0:
            text_process.zero_grad()
            generator.zero_grad()
            
            requires_grad(text_process.module.bert_embedding.fc, True)
            requires_grad(generator, True)
            requires_grad(discriminator, False)

            fake_image = generator(gen_in2, step=step, alpha=alpha)

            predict = discriminator(fake_image, sent_emb, step=step, alpha=alpha)

            if args.loss == 'wgan-gp':
                loss = (-predict).mean()
                

            elif args.loss == 'r1':
                loss = F.softplus(-predict).mean()
                
#             kl_loss = KL_loss(mu, log_var)
            (loss).backward()
            
            if (i+1) % 10 == 0:
                gen_loss_val = loss.item()
#                 kl_loss_val = kl_loss.item()

            t_optimizer.step()
            g_optimizer.step()
            accumulate(t_running, text_process.module)
            accumulate(g_running, generator.module)
            
            requires_grad(text_process, False)
            requires_grad(generator, False)
            requires_grad(discriminator, True)

        if (i + 1) % 1000 == 0:
            images = []
            with torch.no_grad():
#                 fixed_c_code, _, _, _, _ = t_running(fixed_caption)
                sent_emb, _ = t_running(fixed_caption)
                for _ in range(fixed_gen_i):
                    images.append(
                        g_running(
                            torch.cat([torch.randn(fixed_gen_j, 384).cuda(), sent_emb[:fixed_gen_j]], dim=1), step=step, alpha=alpha
                        ).data.cpu()
                    )
                    
            images = torch.cat(images, 0)
            score = inception_score.cal(images)
            
            utils.save_image(
                images,
                os.path.join(args.out, f'{str(i+1).zfill(6)}-{4 * 2 ** step}x{4 * 2 ** step}.png'),
                nrow=fixed_gen_i,
                normalize=True,
                range=(-1, 1),
            )

        if (i + 1) % 10000 == 0:
            torch.save(
                t_running.state_dict(), os.path.join(args.out, f'checkpoint/t_{str(i + 1).zfill(6)}.model')
            )
            torch.save(
                g_running.state_dict(), os.path.join(args.out, f'checkpoint/g_{str(i + 1).zfill(6)}.model')
            )

        state_msg = (
            f'Size: {4 * 2 ** step}; G: {gen_loss_val:.4f}; D: {disc_loss_val:.4f}; '
            f'KL: {kl_loss_val:.4f}; Grad: {grad_loss_val:.4f}; '
            f'IS: {score: .4f}; '
            f'Alpha: {alpha:.4f};'
        )

        pbar.set_description(state_msg)
Example #15
                        'O': O,
                        'C': C,
                        'P': P
                    },
                    ignore_index=True)
            cur_idx += 1

        step += 1
    return result


if __name__ == '__main__':
    EP = 100
    SAVING_DIR = '../models/'
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch',
        do_lower_case=True)
    test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv',
                                 None,
                                 tokenizer,
                                 type='laptop')
    test_loader = DataLoader(test_dataset,
                             12,
                             collate_fn=test_dataset.batchify,
                             shuffle=False,
                             num_workers=5)

    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch')
    model.load_state_dict(torch.load('../models/saved_best_model_wwm_ext'))
    model.cuda()
Example #16
def train_and_eval_model(
    args,
    saved_pickle_path=parent_folder_path +
    "/data_generated/squad_retrieval_data_seed_0_dev_2000.pickle"):
    # TODO: find a cleaner way to pass this directory in.

    N_EPOCH = args.n_epoch
    BATCH_SIZE_TRAIN = args.batch_size_train
    BATCH_SIZE_EVAL = args.batch_size_eval
    NUM_WORKERS = args.n_worker
    N_NEG_FACT = args.n_neg_sample
    DEVICE = torch.device(
        args.device) if torch.cuda.is_available() else torch.device("cpu")

    # Instantiate BERT retriever, optimizer and tokenizer.
    bert_retriever = BertSQuADRetriever(N_NEG_FACT, DEVICE, BATCH_SIZE_TRAIN,
                                        BATCH_SIZE_EVAL)
    bert_retriever.to(DEVICE)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    optimizer = optim.Adam(bert_retriever.parameters(), lr=0.00001)

    now = datetime.datetime.now()
    date_time = str(now)[:10] + '_' + str(now)[11:13] + str(now)[14:16]

    if args.dataset == "squad":
        # Load SQuAD dataset and dataloader.
        squad_retrieval_data = squad_retrieval.convert_squad_to_retrieval(
            tokenizer, random_seed=args.seed, num_dev=args.num_dev)

        squad_retrieval_train_dataset = squad_retrieval.SQuADRetrievalDatasetTrain(
            instance_list=squad_retrieval_data["train_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer,
            random_seed=args.seed,
            n_neg_sample=N_NEG_FACT)

        retrieval_train_dataloader = DataLoader(
            squad_retrieval_train_dataset,
            batch_size=BATCH_SIZE_TRAIN,
            shuffle=True,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADTrain())

        squad_retrieval_dev_dataset = squad_retrieval.SQuADRetrievalDatasetEvalQuery(
            instance_list=squad_retrieval_data["dev_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer)

        retrieval_dev_dataloader = DataLoader(
            squad_retrieval_dev_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADEvalQuery())

        squad_retrieval_test_dataset = squad_retrieval.SQuADRetrievalDatasetEvalQuery(
            instance_list=squad_retrieval_data["test_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer)

        retrieval_test_dataloader = DataLoader(
            squad_retrieval_test_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADEvalQuery())

        squad_retrieval_eval_fact_dataset = squad_retrieval.SQuADRetrievalDatasetEvalFact(
            instance_list=squad_retrieval_data["resp_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer)

        retrieval_eval_fact_dataloader = DataLoader(
            squad_retrieval_eval_fact_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADEvalFact())

        save_folder_path = parent_folder_path + '/data_generated/squad_retrieval_seed_' + str(
            args.seed) + "_" + date_time + "/"

    elif args.dataset == "openbook":
        train_list, dev_list, test_list, kb = openbook_retrieval.construct_retrieval_dataset_openbook(
            num_neg_sample=N_NEG_FACT, random_seed=args.seed)

        openbook_retrieval_train_dataset = openbook_retrieval.OpenbookRetrievalDatasetTrain(
            instance_list=train_list,
            kb=kb,
            tokenizer=tokenizer,
            num_neg_sample=N_NEG_FACT)

        retrieval_train_dataloader = DataLoader(
            openbook_retrieval_train_dataset,
            batch_size=BATCH_SIZE_TRAIN,
            shuffle=True,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookTrain())

        openbook_retrieval_dev_dataset = openbook_retrieval.OpenbookRetrievalDatasetEvalQuery(
            instance_list=dev_list, tokenizer=tokenizer)

        retrieval_dev_dataloader = DataLoader(
            openbook_retrieval_dev_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookEvalQuery())

        openbook_retrieval_test_dataset = openbook_retrieval.OpenbookRetrievalDatasetEvalQuery(
            instance_list=test_list, tokenizer=tokenizer)

        retrieval_test_dataloader = DataLoader(
            openbook_retrieval_test_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookEvalQuery())

        openbook_retrieval_eval_fact_dataset = openbook_retrieval.OpenbookRetrievalDatasetEvalFact(
            kb=kb, tokenizer=tokenizer)

        retrieval_eval_fact_dataloader = DataLoader(
            openbook_retrieval_eval_fact_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookEvalFact())

        save_folder_path = parent_folder_path + '/data_generated/openbook_retrieval_seed_' + str(
            args.seed) + "_" + date_time + "/"

    else:
        return 0

    if not os.path.exists(save_folder_path):
        os.makedirs(save_folder_path)

    # Start training and evaluation.
    best_mrr = 0
    main_result_array = np.zeros((N_EPOCH, 3))
    for epoch in range(N_EPOCH):
        print("=" * 20)
        print("Epoch ", epoch + 1)
        train_loss = bert_retriever.train_epoch(optimizer,
                                                retrieval_train_dataloader)
        dev_result_dict, test_result_dict = bert_retriever.eval_epoch(
            retrieval_dev_dataloader, retrieval_test_dataloader,
            retrieval_eval_fact_dataloader)

        dev_mrr = sum(dev_result_dict["mrr"]) / len(dev_result_dict["mrr"])
        test_mrr = sum(test_result_dict["mrr"]) / len(test_result_dict["mrr"])

        print("\t\tepoch " + str(epoch + 1) + " training loss:" +
              str(train_loss) + " dev mrr:" + str(dev_mrr) + " test mrr:" +
              str(test_mrr))

        main_result_array[epoch, :] = [train_loss, dev_mrr, test_mrr]

        if dev_mrr > best_mrr:
            best_mrr = dev_mrr

            torch.save(bert_retriever,
                       save_folder_path + "saved_bert_retriever")

            with open(save_folder_path + "dev_dict.pickle", "wb") as handle:
                pickle.dump(dev_result_dict, handle)

            with open(save_folder_path + "test_dict.pickle", "wb") as handle:
                pickle.dump(test_result_dict, handle)

        np.save(save_folder_path + "main_result.npy", main_result_array)

    return 0
Example #17
    def _Get_Bert_Representation(self):

        count = 0
        bert_map = {}
        for root, dirs, files in os.walk("./data/test"):
            for file in files:
                file_path = os.path.join(root, file)
                print(file_path)

                file = open(file_path, "r")
                while True:
                    line = file.readline()
                    if not line:
                        break
                    line = line[:len(line) - 1]
                    line = line.split(" ")
                    line = line[:len(line) - 1]
                    line = " ".join(line)

                    if (line in bert_map.keys()):
                        continue

                    if (self.bert_tokenizer == None):
                        self.bert_tokenizer = BertTokenizer.from_pretrained(
                            'bert-base-uncased')

                    text = "[CLS] " + line + " [SEP]"
                    text = text.replace("  ", " ")
                    tokenized_text = self.bert_tokenizer.tokenize(text)

                    indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(
                        tokenized_text)
                    segments_ids = [0 for _ in tokenized_text]

                    tokens_tensor = torch.tensor([indexed_tokens])
                    segments_tensors = torch.tensor([segments_ids])

                    if (self.bert == None):
                        self.bert = BertModel.from_pretrained(
                            'bert-base-uncased')
                        self.bert.eval()

                    with torch.no_grad():
                        representation, sum = [], 0

                        encoded_layers, _ = self.bert(tokens_tensor,
                                                      segments_tensors)

                        Len = len(encoded_layers[-1].numpy()[0])
                        representation = np.zeros(768)
                        for i in range(1, Len - 1):
                            representation += encoded_layers[-1].numpy()[0][i]
                            sum += 1
                        representation = representation * 1.0 / sum

                        bert_map[line] = representation

                        count += 1
                        if (count % 100 == 0):
                            print(count)

        with open("./bert_map", 'wb') as file:
            pickle.dump(bert_map, file)
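
The pickled map can later be loaded back and queried with the same cleaned sentence strings; a small sketch:

import pickle

with open("./bert_map", "rb") as file:
    bert_map = pickle.load(file)
# Each value is a 768-dimensional average of the last-layer token vectors.
vector = bert_map.get("some preprocessed sentence")  # hypothetical key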
Example #18
def load_data(data_folder, pretrained_model):
    tokenizer = BertTokenizer.from_pretrained(pretrained_model)
    with open(data_folder / 'tokenized_docs_bert.pkl', 'rb') as tok_docs_file:
        docs = pickle.load(tok_docs_file)

    return docs, tokenizer
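
A hedged usage example; the folder name is hypothetical and must contain tokenized_docs_bert.pkl:

from pathlib import Path

docs, tokenizer = load_data(Path("data"), "bert-base-uncased")
print(len(docs), len(tokenizer.vocab))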
Example #19
######################################################
# Loading Data
######################################################


ddt = DDT()

conllu_format = ddt.load_as_conllu()
L = [(i, token.form, token.misc.get("name").pop()) for i, sent in enumerate(conllu_format) for token in sent]
df = pd.DataFrame(L, columns=['sentence_id', 'words', 'labels'])

######################################################
# to bert tokens 
######################################################
sent_str = [sent.text for sent in conllu_format]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sent_str]

# Convert tokens to indexes
with open("/home/au554730/Desktop/BERT_test/danish_bert_uncased/vocab.txt") as f:
    vocab = f.read()
vocab = vocab.split("\n")
vocab_d = {e: i for i, e in enumerate(vocab)}

def sentence_to_idx(sent):
    return [vocab_d.get(token, vocab_d["[UNK]"]) for token in sent]

input_ids = [sentence_to_idx(t) for t in tokenized_texts]

max_len = 128
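
With max_len fixed, the id sequences would typically be truncated and padded before being turned into tensors; a minimal sketch using the vocabulary's [PAD] id (assumed to be present in vocab_d):

pad_id = vocab_d["[PAD]"]

def pad_to_max_len(ids):
    ids = ids[:max_len]
    return ids + [pad_id] * (max_len - len(ids))

padded_input_ids = [pad_to_max_len(ids) for ids in input_ids]
attention_masks = [[float(i != pad_id) for i in ids] for ids in padded_input_ids]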
Example #20
    def __init__(self, csv_paths, tokenizer):
        from pytorch_pretrained_bert import BertTokenizer

        self.csv_paths = csv_paths
        self.tokenizer = tokenizer
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Example #21
dataroot = opt.dataroot

tag_vocab_dir = dataroot + '/vocab.slot' 
class_vocab_dir = dataroot + '/vocab.intent'
train_data_dir = dataroot + '/train'
valid_data_dir = dataroot + '/valid'
test_data_dir = dataroot + '/test'

if not opt.testing:
    tag_to_idx, idx_to_tag = vocab_reader.read_vocab_file(tag_vocab_dir, bos_eos=False)
    class_to_idx, idx_to_class = vocab_reader.read_vocab_file(class_vocab_dir, bos_eos=False)
else:
    tag_to_idx, idx_to_tag = vocab_reader.read_vocab_file(opt.read_vocab+'.tag', bos_eos=False, no_pad=True, no_unk=True)
    class_to_idx, idx_to_class = vocab_reader.read_vocab_file(opt.read_vocab+'.class', bos_eos=False, no_pad=True, no_unk=True)

tokenizer = BertTokenizer.from_pretrained(opt.bert_model_name)

logger.info("Vocab size: %s %s" % (len(tag_to_idx), len(class_to_idx)))
if not opt.testing:
    vocab_reader.save_vocab(idx_to_tag, os.path.join(exp_path, opt.save_vocab+'.tag'))
    vocab_reader.save_vocab(idx_to_class, os.path.join(exp_path, opt.save_vocab+'.class'))

opt.word_lowercase = False
if not opt.testing:
    train_feats, train_tags, train_class = data_reader.read_seqtag_data_with_class(train_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass, lowercase=opt.word_lowercase)
    valid_feats, valid_tags, valid_class = data_reader.read_seqtag_data_with_class(valid_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass, keep_order=opt.testing, lowercase=opt.word_lowercase)
    test_feats, test_tags, test_class = data_reader.read_seqtag_data_with_class(test_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass, keep_order=opt.testing, lowercase=opt.word_lowercase)
else:
    valid_feats, valid_tags, valid_class = data_reader.read_seqtag_data_with_class(valid_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass, keep_order=opt.testing, lowercase=opt.word_lowercase)
    test_feats, test_tags, test_class = data_reader.read_seqtag_data_with_class(test_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass, keep_order=opt.testing, lowercase=opt.word_lowercase)
Beispiel #22
0
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging

logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

sent = "hi mary"
tokens = tokenizer.tokenize(sent)
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

print(tokens)
#['hi', 'ma', '##ry']
print(tokens_ids)
#[20844, 12477, 1616]
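A small optional round-trip check (not in the original snippet): the ids can be mapped back to wordpieces with the same tokenizer.

tokens_back = tokenizer.convert_ids_to_tokens(tokens_ids)
print(tokens_back)
#['hi', 'ma', '##ry']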
Beispiel #23
0
def bert_embeddings(sentences, tokenized_contents, output_file=None):
    # Use bert_tokenizer to check whether a sentence exceeds 512 wordpiece tokens
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    if output_file:
        f = open(output_file, 'w')
    # Initialize the BERT word embeddings (bert-large-uncased)
    bert_embedding = TransformerWordEmbeddings('bert-large-uncased')
    long_sent = False
    for i, (sent, sent_tokens) in enumerate(zip(sentences,
                                                tokenized_contents)):
        print("Encoding the {}th input sentence for BERT embedding!".format(i))
        # get the length of the sentence after BERT wordpiece tokenization
        if len(bert_tokenizer.tokenize(sent[0])) >= 510:
            long_sent = True
            truncated_tokens = sent_tokens[:len(sent_tokens) // 2]
            sent_tokens = sent_tokens[len(sent_tokens) // 2:]

        # Using our own tokens (our own tokenization)
        tokens: List[Token] = [Token(token) for token in sent_tokens]

        # create an empty sentence
        sentence = Sentence()

        # add tokens from our own tokenization
        sentence.tokens = tokens

        bert_embedding.embed(sentence)

        for j, (token, st) in enumerate(zip(sentence, sent_tokens)):
            if token.text != st:
                raise ValueError("Invalid token text")
            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num)
                              for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num)
                                for num in token.embedding.tolist()]) + '\n')

        if long_sent:
            # embed the first half of the sentence that was split off above
            truncated_tokens: List[Token] = [
                Token(token) for token in truncated_tokens
            ]
            # Create empty sentence
            truncated_sentence = Sentence()
            # add tokens from our own tokenization
            truncated_sentence.tokens = truncated_tokens
            bert_embedding.embed(truncated_sentence)
            for token in truncated_sentence:
                if output_file:
                    f.write(token.text + " " + " ".join(
                        [str(num) for num in token.embedding.tolist()]) + '\n')
                else:
                    print(token.text + " " + " ".join(
                        [str(num) for num in token.embedding.tolist()]) + '\n')
            long_sent = False

        if output_file:
            f.write('\n')

    if output_file:
        f.close()
Beispiel #24
0
def main():
    """
    To run this code with default settings and example data, do
       $ python experiment.py data/example.csv
    """
    if not os.path.exists('output'):
        os.mkdir('output')
    if not os.path.exists('output/auxiliary'):
        os.mkdir('output/auxiliary')

    ## Argument parsing
    args = parser.parse_args()
    if args.factors is not None:
        args.factors = args.factors.split(",")
        if len(args.factors) > 2:
            print(
                "WARNING: Cannot plot more than 2 factors at a time. Trimming to",
                args.factors[:2])
            args.factors = args.factors[:2]
    if args.out is not None:
        if os.path.exists(args.out):
            pass
        #     if args.no_overwrite:
        #         quit()
        #     if input('Output directory {} already exists. Risk overwriting files? N/y'.format(args.out)) != 'y':
        #         quit()
        else:
            os.mkdir(args.out)

    if args.raw_out is None:
        args.raw_out = 'output/auxiliary/{}_{}{}{}{}.pkl'.format(
            os.path.basename(args.data)[:-4],
            args.method,
            "_chain"
            if args.combine == "chain" else "",  # cumsum can reuse the same raw file as "no"
            '_norm'
            if args.method == 'attention' and args.normalize_heads else '',
            ('_' + str(args.n_items)) if args.n_items is not None else '')

    if args.trees_out is None:
        args.trees_out = 'output/auxiliary/{}_SPANNING_{}{}{}{}{}{}.pkl'.format(
            os.path.basename(args.data)[:-4],
            args.method,
            '_' + args.combine if args.combine != "no" else "",
            '_norm'
            if args.method == 'attention' and args.normalize_heads else '',
            ('_' + str(args.n_items)) if args.n_items is not None else '',
            '_' + args.group_merger,
            '_' + 'transpose' if args.transpose else '',
        )

    if args.pearson_out is None:
        args.pearson_out = 'output/auxiliary/{}_PEARSON_{}{}{}{}{}{}.pkl'.format(
            os.path.basename(args.data)[:-4],
            args.method,
            '_' + args.combine if args.combine != "no" else "",
            '_norm'
            if args.method == 'attention' and args.normalize_heads else '',
            ('_' + str(args.n_items)) if args.n_items is not None else '',
            '_' + args.group_merger,
            '_' + 'transpose' if args.transpose else '',
        )

    ## Do we need to apply BERT (anew)?
    apply_BERT = True
    if os.path.exists(args.raw_out):
        if args.no_overwrite:
            apply_BERT = False
        elif input('Raw output file exists. Overwrite? (N/y)') != "y":
            apply_BERT = False

    ## Do we need to compute spanning trees (anew)?
    compute_trees = True
    if os.path.exists(args.trees_out):
        if args.no_overwrite:
            compute_trees = False
        elif input('Trees output file exists. Overwrite? (N/y)') != "y":
            compute_trees = False

    ## Do we need to compute pearson coefficients (anew)?
    compute_pearson = True
    if os.path.exists(args.pearson_out):
        if args.no_overwrite:
            compute_pearson = False
        elif input('Pearson output file exists. Overwrite? (N/y)') != "y":
            compute_pearson = False

    ## Set up tokenizer, data
    tokenizer = BertTokenizer.from_pretrained(args.bert,
                                              do_lower_case=("uncased"
                                                             in args.bert))
    items, dependency_trees = data_utils.parse_data(args.data,
                                                    tokenizer,
                                                    max_items=args.n_items,
                                                    words_as_groups=True,
                                                    dependencies=True)

    ## Store for convenience
    args.factors = args.factors or items.factors[:2]  # by default use the first two factors from the data

    ## Now that args.factors is known, finally choose output directory
    if args.out is None:
        dirname = 'temp'
        out_idx = 0
        if not os.path.exists("output"):
            os.mkdir('output')
        if not os.path.exists("data/auxiliary/"):
            os.mkdir('data/auxiliary')
        while any(x.startswith(dirname) for x in os.listdir('output')):
            out_idx += 1
            dirname = 'temp{}'.format(out_idx)
        dirname += "_{}{}{}{}{}".format(
            args.method,
            "-" + args.combine if args.combine != "no" else "",
            "_normalized" if
            (args.method == "attention" and args.normalize_heads) else "",
            '_' + '-x-'.join(args.factors) if len(args.factors) > 0 else '',
            "_transposed" if args.transpose else "",
        )
        args.out = os.path.join("output", dirname)
        os.mkdir(args.out)

    ## Apply BERT or, if available, load results saved from previous run
    if apply_BERT:
        data_for_all_items = interface_BERT.apply_bert(items, tokenizer, args)
        with open(args.raw_out, 'wb') as file:
            pickle.dump(data_for_all_items, file)
            print('BERTs raw outputs saved as', args.raw_out)
    else:
        with open(args.raw_out, 'rb') as file:
            print('BERTs raw outputs loaded from', args.raw_out)
            data_for_all_items = pickle.load(file)
    n_layers = data_for_all_items[0].shape[0]  # for convenience

    # The list data_for_all_items now contains, for each item, weights (n_layers, n_tokens, n_tokens)

    ## If some computation needs to be done, we need to process the BERT outputs a bit
    if compute_trees or compute_pearson:

        print("Processing the data from BERT...")

        ## Take cumsum if needed (placed outside the foregoing, to avoid having to save/load a separate file for this)
        if args.combine == "cumsum":
            for i in range(len(data_for_all_items)):
                data_for_all_items[i] = np.cumsum(data_for_all_items[i],
                                                  axis=0)

        ## Take averages over groups of tokens
        if not args.ignore_groups and not len(items.groups) == 0:
            data_for_all_items = data_utils.merge_grouped_tokens(
                items, data_for_all_items, method=args.group_merger)

        ## Compute balances (though whether they will be plotted depends on args.balance)
        # (Re)compute balance: how much token influences minus how much is influenced
        balance_for_all_items = []
        for data_for_item in data_for_all_items:
            balance_for_item = []
            for data_for_layer in data_for_item:
                balance = np.nansum(data_for_layer -
                                    data_for_layer.transpose(),
                                    axis=1)
                balance_for_item.append(balance)
            balance_for_all_items.append(np.stack(balance_for_item))
        # At this point we have two lists of numpy arrays: for each item, the weights & balance across layers.

        ## Store the weights in dataframe together with original data
        # TODO All of this feels terribly hacky...
        # First flatten the numpy array per item
        data_for_all_items = [
            data.reshape(-1).tolist() for data in data_for_all_items
        ]
        balance_for_all_items = [
            data.reshape(-1).tolist() for data in balance_for_all_items
        ]
        # And then concatenate them (still per item per layer)
        data_and_balance_for_all_items = [
            array1 + array2 for array1, array2 in zip(data_for_all_items,
                                                      balance_for_all_items)
        ]
        # Concatenate onto original data rows (with each row repeated n_layers times)
        # original_items_times_nlayers = [a for l in [[i.to_list()] * n_layers for (_, i) in items.iterrows()] for a in l]
        data_for_dataframe = [
            a + b for a, b in zip([i.to_list() for (
                _, i) in items.iterrows()], data_and_balance_for_all_items)
        ]
        # Multi-column to represent the (flattened) numpy arrays in a structured way
        multi_columns = pd.MultiIndex.from_tuples(
            [(c, '', '', '') for c in items.columns] +
            [('weights', l, g1, g2) for l in range(n_layers)
             for g1 in items.groups
             for g2 in items.groups] + [('balance', l, g, '')
                                        for l in range(n_layers)
                                        for g in items.groups],
            names=['', 'layer', 'in', 'out'])

        df = pd.DataFrame(data_for_dataframe,
                          index=items.index,
                          columns=multi_columns)
        # Dataframe with three sets of columns: columns from original dataframe, weights (as extracted from BERT & grouped), and the balance computed from them

    ## Apply BERT or, if available, load results saved from previous run
    if compute_trees:
        trees_df = analyze_by_spanning_trees(df, items, dependency_trees,
                                             n_layers, args)
        with open(args.trees_out, 'wb') as file:
            pickle.dump(trees_df, file)
            print('Trees and scores saved as', args.trees_out)
    else:
        with open(args.trees_out, 'rb') as file:
            print('Trees and scores loaded from', args.trees_out)
            trees_df = pickle.load(file)

    plot_tree_scores(trees_df, args)

    if compute_pearson:
        pearson_df = analyze_by_pearson_correlation(df, items,
                                                    dependency_trees, n_layers,
                                                    args)
        with open(args.pearson_out, 'wb') as file:
            pickle.dump(pearson_df, file)
            print('Pearson coefficients and p-values saved as',
                  args.pearson_out)
    else:
        with open(args.pearson_out, 'rb') as file:
            print('Pearson coefficients and p-values loaded from',
                  args.pearson_out)
            pearson_df = pickle.load(file)

    plot_pearson_scores(pearson_df, args)
Beispiel #25
0
    def _create_examples(self,
                         sents,
                         mode,
                         prefix,
                         embedding_method,
                         pretrained=None):
        """
        sents: list of strings
        mode: (string) train, dev or test

        return:
            examples: a list containing dicts representing each example
        """

        allowed_modes = ['train', 'dev', 'test']
        if mode not in allowed_modes:
            raise ValueError(
                f'Mode not recognized, try one of {allowed_modes}')

        # Build the path of the cached example file
        id_sents_pickle_path = os.path.join(
            config.CACHE_PATH,
            prefix + '_' + mode + '.pkl',
        )
        print("this is examples train_sents", sents[0])
        # Load or create the examples; creation calls lang.sents2ids, i.e. the conversion is done via lang
        # If a pretrained model is used, this conversion is not needed
        if pretrained is None:
            id_sents = load_or_create(id_sents_pickle_path,
                                      self.lang.sents2ids,
                                      sents,
                                      force_reload=self.force_reload)
        elif pretrained == True and embedding_method == "bert":
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            id_sents = load_or_create(
                id_sents_pickle_path,
                lambda x: [tokenizer.convert_tokens_to_ids(i) for i in x],
                sents,
                force_reload=self.force_reload)
        if embedding_method == "roberta":
            roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
            tokenizer = roberta.encode
            id_sents = load_or_create(
                id_sents_pickle_path,
                lambda x: [tokenizer(i) for i in x],
                sents,
                force_reload=self.force_reload,
            )
            print("id sents here00", id_sents[0])
        chars_pickle_path = os.path.join(
            config.CACHE_PATH,
            prefix + '_' + mode + '_chars.pkl',
        )

        char_id_sents = [[0] for i in range(len(id_sents))]
        # load_or_create(chars_pickle_path,
        #                self.lang.sents2char_ids,
        #                sents,
        #                force_reload=self.force_reload)

        #  FIXME: Assuming all 3 modes will have labels. This might not be the
        # case for test data <2018-06-29 10:49:29, Jorge Balazs>
        # Also convert the labels into 0/1 ids
        labels = open(config.label_dict[self.corpus_name][mode],
                      encoding="utf-8").readlines()
        labels = [l.rstrip() for l in labels]
        id_labels = [self.label2id[label] for label in labels]

        ids = range(len(id_sents))

        # Bundle together the id (really an index), the raw sentence, the id-encoded sentence, the char-id sentence and the label
        examples = zip(
            ids,
            sents,
            id_sents,
            char_id_sents,
            id_labels,
        )

        examples = [{
            'id': ex[0],
            'raw_sequence': ex[1],
            'sequence': ex[2],
            'char_sequence': ex[3],
            'label': ex[4],
        } for ex in examples]

        return examples
Beispiel #26
0
    def create_dataloader(self):
        # Read the input comments and labels
        train_comments = self.train_df["comment_text"].astype(str)
        train_label = self.train_df["target"].values
        train_type_labels = self.train_df[self.toxicity_type_list].values

        # Labels for the new np task (pp/np/pn/nn combinations of target and identity)
        train_np_labels = np.zeros((len(self.train_df), 4))
        train_np_identity_labels = np.zeros(
            (len(self.train_df), len(self.identity_list) * 4))
        train_df_copy = self.train_df[self.identity_list + ["target"]]
        for column in self.identity_list + ["target"]:
            train_df_copy[column] = np.where(train_df_copy[column] > 0.5, True,
                                             False)
        pp_label_bool = train_df_copy["target"] & np.where(
            train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
        np_label_bool = ~train_df_copy["target"] & np.where(
            train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
        pn_label_bool = train_df_copy["target"] & np.where(
            (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
        nn_label_bool = ~train_df_copy["target"] & np.where(
            (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
        train_np_labels[:, 0] = np.where(pp_label_bool > 0, 1, 0)
        train_np_labels[:, 1] = np.where(np_label_bool > 0, 1, 0)
        train_np_labels[:, 2] = np.where(pn_label_bool > 0, 1, 0)
        train_np_labels[:, 3] = np.where(nn_label_bool > 0, 1, 0)
        for i, column in enumerate(self.identity_list):
            pp_label_bool = train_df_copy["target"] & train_df_copy[column]
            np_label_bool = ~train_df_copy["target"] & train_df_copy[column]
            pn_label_bool = train_df_copy["target"] & (~train_df_copy[column])
            nn_label_bool = ~train_df_copy["target"] & (~train_df_copy[column])
            train_np_identity_labels[:, i * 4 + 0] = np.where(
                pp_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 1] = np.where(
                np_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 2] = np.where(
                pn_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 3] = np.where(
                nn_label_bool > 0, 1, 0)

        # Raw identity values
        train_identity_values = self.train_df[self.identity_list].fillna(
            0.).values
        # Sum of all raw identity values
        train_identity_sum = train_identity_values.sum(axis=1)
        # Cap the identity sum at 1 (for the sigmoid)
        train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                            train_identity_sum)
        # Binary (0/1) identity values
        train_identity_binary = copy.deepcopy(
            self.train_df[self.identity_list])
        for column in self.identity_list:
            train_identity_binary[column] = np.where(
                train_identity_binary[column] > 0.5, 1, 0)
        # 1 if any of the binary identity values is 1
        train_identity_binary_sum = train_identity_binary.sum(axis=1)
        train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1,
                                            0)
        # All identity labels
        train_identity_type_labels = train_identity_values
        train_identity_type_binary_lables = train_identity_binary
        train_identity_sum_label = train_identity_sum_label
        train_identity_binary_label = train_identity_or_binary

        # Tokenize the training data with the BERT tokenizer
        bert_tokenizer = BertTokenizer.from_pretrained(self.bert_model_path,
                                                       cache_dir=None,
                                                       do_lower_case=True)
        train_bert_tokens = self.convert_lines(
            self.train_df["comment_text"].fillna("DUMMY_VALUE"), self.max_len,
            bert_tokenizer)
        # Split into training and validation sets
        valid_tokens = train_bert_tokens[self.train_len:]
        valid_label = train_label[self.train_len:]
        valid_type_labels = train_type_labels[self.train_len:]
        train_tokens = train_bert_tokens[:self.train_len]
        train_label = train_label[:self.train_len]
        train_type_labels = train_type_labels[:self.train_len]
        valid_identity_type_labels = train_identity_type_labels[self.train_len:]
        train_identity_type_labels = train_identity_type_labels[:self.train_len]
        valid_identity_type_binary_lables = train_identity_type_binary_lables[self.train_len:]
        train_identity_type_binary_lables = train_identity_type_binary_lables[:self.train_len]
        valid_identity_sum_label = train_identity_sum_label[self.train_len:]
        train_identity_sum_label = train_identity_sum_label[:self.train_len]
        valid_identity_binary_label = train_identity_binary_label[self.train_len:]
        train_identity_binary_label = train_identity_binary_label[:self.train_len]
        valid_np_labels = train_np_labels[self.train_len:]
        train_np_labels = train_np_labels[:self.train_len]
        valid_np_identity_labels = train_np_identity_labels[self.train_len:]
        train_np_identity_labels = train_np_identity_labels[:self.train_len]

        # Compute the sample weights
        target_weight, aux_weight, identity_weight, np_weight, np_identity_weight = self.cal_sample_weights(
        )

        # Convert the tokenized data into tensors
        train_x_tensor = torch.tensor(train_tokens, dtype=torch.long)
        valid_x_tensor = torch.tensor(valid_tokens, dtype=torch.long)
        train_y_tensor = torch.tensor(np.hstack([
            train_label[:, np.newaxis], train_type_labels,
            train_identity_type_labels, train_np_labels
        ]),
                                      dtype=torch.float32)
        valid_y_tensor = torch.tensor(np.hstack([
            valid_label[:, np.newaxis], valid_type_labels,
            valid_identity_type_labels, valid_np_labels
        ]),
                                      dtype=torch.float32)
        target_weight_tensor = torch.tensor(target_weight, dtype=torch.float32)
        aux_weight_tensor = torch.tensor(aux_weight, dtype=torch.float32)
        identity_weight_tensor = torch.tensor(identity_weight,
                                              dtype=torch.float32)
        np_weight_tensor = torch.tensor(np_weight, dtype=torch.float32)
        np_identity_weight_tensor = torch.tensor(np_identity_weight,
                                                 dtype=torch.float32)
        train_attention_mask_tensor = train_x_tensor > 0
        valid_attention_mask_tensor = valid_x_tensor > 0
        if torch.cuda.is_available():
            train_x_tensor = train_x_tensor.to(self.device)
            valid_x_tensor = valid_x_tensor.to(self.device)
            train_y_tensor = train_y_tensor.to(self.device)
            valid_y_tensor = valid_y_tensor.to(self.device)
            target_weight_tensor = target_weight_tensor.to(self.device)
            aux_weight_tensor = aux_weight_tensor.to(self.device)
            identity_weight_tensor = identity_weight_tensor.to(self.device)
            train_attention_mask_tensor = train_attention_mask_tensor.to(
                self.device)
            valid_attention_mask_tensor = valid_attention_mask_tensor.to(
                self.device)
            np_weight_tensor = np_weight_tensor.to(self.device)
            np_identity_weight_tensor = np_identity_weight_tensor.to(
                self.device)
        # Wrap the tensors in a dataset; training data and labels correspond one-to-one, and when loading with the dataloader dataset[:-1] is x and dataset[-1] is y
        train_dataset = data.TensorDataset(train_x_tensor, train_y_tensor,
                                           target_weight_tensor,
                                           aux_weight_tensor,
                                           identity_weight_tensor,
                                           train_attention_mask_tensor,
                                           np_weight_tensor)
        valid_dataset = data.TensorDataset(valid_x_tensor, valid_y_tensor,
                                           valid_attention_mask_tensor)
        # Wrap the datasets in dataloaders
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.base_batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=self.base_batch_size, shuffle=False)
        # Return the training and validation loaders
        return train_loader, valid_loader
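For orientation, a hedged sketch of consuming the returned loaders; trainer is an assumed instance of the class that defines create_dataloader, and the unpacking order follows the TensorDataset built above.

# Assumed usage of the loaders returned by create_dataloader
train_loader, valid_loader = trainer.create_dataloader()
for x, y, target_w, aux_w, identity_w, attention_mask, np_w in train_loader:
    # x: token ids, y: stacked labels; the rest are per-sample weights and the attention mask
    break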
Beispiel #27
0
# Load pre-trained model with masked language model head
bert_version = 'bert-large-uncased'
model = BertForMaskedLM.from_pretrained(bert_version)

# Preprocess text
text = pre_text + target_text
# Shorten overly long inputs to prevent a RuntimeError
if len(text) > 2000:
  pre_text = ''
  for i in range(10, 20):
    pre_text += list[i] + ' '
  text = pre_text + target_text
  print('After decreasing the sentences...\n')

tokenizer = BertTokenizer.from_pretrained(bert_version)
tokenized_text = tokenizer.tokenize(text)
mask_positions = []
for i in range(len(tokenized_text)):
    if tokenized_text[i] == '_':
        tokenized_text[i] = '[MASK]'
        mask_positions.append(i)

# Predict missing words from left to right
model.eval()
predicted_token = ''
for mask_pos in mask_positions:
    # Convert tokens to vocab indices
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([token_ids])
    # print('tokens_tensor: ''\n',tokens_tensor)
Beispiel #28
0
def get(logger=None, args=None):

    # TODO: additionally generate one more mask for generation

    data = {}
    taskcla = []

    # Others
    f_name = 'asc_random'

    with open(f_name, 'r') as f_random_seq:
        random_sep = f_random_seq.readlines()[args.idrandom].split()

    print('random_sep: ', random_sep)
    print('domains: ', domains)

    print('random_sep: ', len(random_sep))
    print('domains: ', len(domains))

    for t in range(args.ntasks):
        asc_dataset = asc_datasets[domains.index(random_sep[t])]
        ae_dataset = ae_datasets[domains.index(random_sep[t])]

        data[t] = {}
        if 'Bing' in asc_dataset:
            data[t]['name'] = asc_dataset
            data[t]['ncla'] = 2
        elif 'XuSemEval' in asc_dataset:
            data[t]['name'] = asc_dataset
            data[t]['ncla'] = 3

        print('ae_dataset: ', ae_dataset)

        logger.info("***** Running training *****")

        #ASC for encoder ====================
        processor = data_utils.AscProcessor()
        label_list = processor.get_labels()
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        train_examples = processor.get_train_examples(asc_dataset)
        train_features = data_utils.convert_examples_to_features_gen(
            train_examples, label_list, args.max_seq_length, tokenizer, "asc")

        all_asc_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
        all_asc_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features], dtype=torch.long)
        all_asc_input_mask = torch.tensor(
            [f.input_mask for f in train_features], dtype=torch.long)
        all_asc_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        all_tasks = torch.tensor([t for f in train_features], dtype=torch.long)

        #AE for decoder ====================
        processor = data_utils.AeProcessor()
        label_list = processor.get_labels()
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        train_examples = processor.get_train_examples(ae_dataset)
        train_features = data_utils.convert_examples_to_features_gen(
            train_examples, label_list, args.max_seq_length, tokenizer, "ae")

        all_ae_input_ids = torch.tensor([f.input_ids for f in train_features],
                                        dtype=torch.long)
        all_ae_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features], dtype=torch.long)
        all_ae_input_mask = torch.tensor(
            [f.input_mask for f in train_features], dtype=torch.long)
        all_ae_label_ids = torch.tensor([f.label_id for f in train_features],
                                        dtype=torch.long)

        #SG (sentence generation) for decoder ====================
        processor = data_utils.SgProcessor()
        label_list = None
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        train_examples = processor.get_train_examples(asc_dataset)

        mask_source_words = args.mask_source_words
        max_pred = args.max_pred
        mask_prob = args.mask_prob
        skipgram_prb = args.skipgram_prb
        skipgram_size = args.skipgram_size
        mask_whole_word = args.mask_whole_word
        vocab_words = list(tokenizer.vocab.keys())
        indexer = tokenizer.convert_tokens_to_ids

        train_features = data_utils.convert_examples_to_features_gen(
            train_examples,
            label_list,
            args.max_seq_length * 2,
            tokenizer,
            "sg",
            mask_source_words=mask_source_words,
            max_pred=max_pred,
            mask_prob=mask_prob,
            skipgram_prb=skipgram_prb,
            skipgram_size=skipgram_size,
            mask_whole_word=mask_whole_word,
            vocab_words=vocab_words,
            indexer=indexer)  #seq2seq task

        all_sg_input_ids = torch.tensor([f.input_ids for f in train_features],
                                        dtype=torch.long)
        all_sg_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features], dtype=torch.long)
        all_sg_input_mask = torch.tensor(
            [f.input_mask for f in train_features], dtype=torch.long)
        all_sg_masked_lm_labels = torch.tensor(
            [f.masked_lm_labels for f in train_features],
            dtype=torch.long).squeeze(1)
        all_sg_masked_pos = torch.tensor(
            [f.masked_pos for f in train_features],
            dtype=torch.long).squeeze(1)
        all_sg_masked_weights = torch.tensor(
            [f.masked_weights for f in train_features], dtype=torch.long)

        ae_length = all_ae_input_ids.size(0)
        while all_ae_input_ids.size(0) < all_sg_input_ids.size(0):
            rand_id = torch.randint(low=0, high=ae_length, size=(1, ))
            all_ae_input_ids = torch.cat(
                [all_ae_input_ids, all_ae_input_ids[rand_id]], 0)
            all_ae_segment_ids = torch.cat(
                [all_ae_segment_ids, all_ae_segment_ids[rand_id]], 0)
            all_ae_input_mask = torch.cat(
                [all_ae_input_mask, all_ae_input_mask[rand_id]], 0)
            all_ae_label_ids = torch.cat(
                [all_ae_label_ids, all_ae_label_ids[rand_id]], 0)

        # Some sentences have sentiment conflicts, so ae can be larger than asc
        asc_length = all_asc_input_ids.size(0)
        while all_asc_input_ids.size(0) < all_ae_input_ids.size(0):
            rand_id = torch.randint(low=0, high=asc_length, size=(1, ))
            all_asc_input_ids = torch.cat(
                [all_asc_input_ids, all_asc_input_ids[rand_id]], 0)
            all_asc_segment_ids = torch.cat(
                [all_asc_segment_ids, all_asc_segment_ids[rand_id]], 0)
            all_asc_input_mask = torch.cat(
                [all_asc_input_mask, all_asc_input_mask[rand_id]], 0)
            all_asc_label_ids = torch.cat(
                [all_asc_label_ids, all_asc_label_ids[rand_id]], 0)
            all_sg_input_ids = torch.cat(
                [all_sg_input_ids, all_sg_input_ids[rand_id]], 0)
            all_sg_segment_ids = torch.cat(
                [all_sg_segment_ids, all_sg_segment_ids[rand_id]], 0)
            all_sg_input_mask = torch.cat(
                [all_sg_input_mask, all_sg_input_mask[rand_id]], 0)
            all_sg_masked_lm_labels = torch.cat(
                [all_sg_masked_lm_labels, all_sg_masked_lm_labels[rand_id]], 0)
            all_sg_masked_pos = torch.cat(
                [all_sg_masked_pos, all_sg_masked_pos[rand_id]], 0)
            all_sg_masked_weights = torch.cat(
                [all_sg_masked_weights, all_sg_masked_weights[rand_id]], 0)
            all_tasks = torch.cat([all_tasks, all_tasks[rand_id]], 0)

            # ae is smaller in size than the others because a sentence can have multiple terms

        num_train_steps = int(
            math.ceil(all_asc_input_ids.size(0) /
                      args.train_batch_size)) * args.num_train_epochs
        # num_train_steps = int(len(train_examples) / args.train_batch_size) * args.num_train_epochs

        logger.info("  Num asc examples = %d", all_asc_input_ids.size(0))
        logger.info("  Num sg examples = %d", all_sg_input_ids.size(0))
        logger.info("  Num ae examples = %d", all_ae_input_ids.size(0))

        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)



        train_data = \
            TensorDataset(all_asc_input_ids,all_asc_segment_ids, all_asc_input_mask,\
            all_sg_input_ids, all_sg_segment_ids, all_sg_input_mask,\
            all_sg_masked_lm_labels,all_sg_masked_pos,all_sg_masked_weights,\
            all_ae_input_ids, all_ae_segment_ids, all_ae_input_mask,all_ae_label_ids,all_asc_label_ids,all_tasks)

        data[t]['train'] = train_data
        data[t]['num_train_steps'] = num_train_steps

        logger.info("***** Running validations *****")

        processor = data_utils.AscProcessor()
        label_list = processor.get_labels()
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        dev_examples = processor.get_dev_examples(asc_dataset)
        dev_features = data_utils.convert_examples_to_features_gen(
            dev_examples, label_list, args.max_seq_length, tokenizer, "asc")

        all_asc_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                         dtype=torch.long)
        all_asc_segment_ids = torch.tensor(
            [f.segment_ids for f in dev_features], dtype=torch.long)
        all_asc_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                          dtype=torch.long)
        all_asc_label_ids = torch.tensor([f.label_id for f in dev_features],
                                         dtype=torch.long)
        all_tasks = torch.tensor([t for f in dev_features], dtype=torch.long)

        #AE for decoder ====================
        processor = data_utils.AeProcessor()
        label_list = processor.get_labels()
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        dev_examples = processor.get_dev_examples(ae_dataset)
        dev_features = data_utils.convert_examples_to_features_gen(
            dev_examples, label_list, args.max_seq_length, tokenizer, "ae")

        all_ae_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                        dtype=torch.long)
        all_ae_segment_ids = torch.tensor(
            [f.segment_ids for f in dev_features], dtype=torch.long)
        all_ae_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                         dtype=torch.long)
        all_ae_label_ids = torch.tensor([f.label_id for f in dev_features],
                                        dtype=torch.long)

        #SG (sentence generation) for decoder ====================
        processor = data_utils.SgProcessor()
        label_list = None
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        dev_examples = processor.get_dev_examples(asc_dataset)
        mask_source_words = args.mask_source_words
        max_pred = args.max_pred
        mask_prob = args.mask_prob
        skipgram_prb = args.skipgram_prb
        skipgram_size = args.skipgram_size
        mask_whole_word = args.mask_whole_word
        vocab_words = list(tokenizer.vocab.keys())
        indexer = tokenizer.convert_tokens_to_ids

        dev_features = data_utils.convert_examples_to_features_gen(
            dev_examples,
            label_list,
            args.max_seq_length * 2,
            tokenizer,
            "sg",
            mask_source_words=mask_source_words,
            max_pred=max_pred,
            mask_prob=mask_prob,
            skipgram_prb=skipgram_prb,
            skipgram_size=skipgram_size,
            mask_whole_word=mask_whole_word,
            vocab_words=vocab_words,
            indexer=indexer)  #seq2seq task

        all_sg_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                        dtype=torch.long)
        all_sg_segment_ids = torch.tensor(
            [f.segment_ids for f in dev_features], dtype=torch.long)
        all_sg_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                         dtype=torch.long)
        all_sg_masked_lm_labels = torch.tensor(
            [f.masked_lm_labels for f in dev_features],
            dtype=torch.long).squeeze(1)
        all_sg_masked_pos = torch.tensor([f.masked_pos for f in dev_features],
                                         dtype=torch.long).squeeze(1)
        all_sg_masked_weights = torch.tensor(
            [f.masked_weights for f in dev_features], dtype=torch.long)

        ae_length = all_ae_input_ids.size(0)
        while all_ae_input_ids.size(0) < all_sg_input_ids.size(0):
            rand_id = torch.randint(low=0, high=ae_length, size=(1, ))
            all_ae_input_ids = torch.cat(
                [all_ae_input_ids, all_ae_input_ids[rand_id]], 0)
            all_ae_segment_ids = torch.cat(
                [all_ae_segment_ids, all_ae_segment_ids[rand_id]], 0)
            all_ae_input_mask = torch.cat(
                [all_ae_input_mask, all_ae_input_mask[rand_id]], 0)
            all_ae_label_ids = torch.cat(
                [all_ae_label_ids, all_ae_label_ids[rand_id]], 0)

        # Some sentences have sentiment conflicts, so ae can be larger than asc
        asc_length = all_asc_input_ids.size(0)
        while all_asc_input_ids.size(0) < all_ae_input_ids.size(0):
            rand_id = torch.randint(low=0, high=asc_length, size=(1, ))
            all_asc_input_ids = torch.cat(
                [all_asc_input_ids, all_asc_input_ids[rand_id]], 0)
            all_asc_segment_ids = torch.cat(
                [all_asc_segment_ids, all_asc_segment_ids[rand_id]], 0)
            all_asc_input_mask = torch.cat(
                [all_asc_input_mask, all_asc_input_mask[rand_id]], 0)
            all_asc_label_ids = torch.cat(
                [all_asc_label_ids, all_asc_label_ids[rand_id]], 0)
            all_sg_input_ids = torch.cat(
                [all_sg_input_ids, all_sg_input_ids[rand_id]], 0)
            all_sg_segment_ids = torch.cat(
                [all_sg_segment_ids, all_sg_segment_ids[rand_id]], 0)
            all_sg_input_mask = torch.cat(
                [all_sg_input_mask, all_sg_input_mask[rand_id]], 0)
            all_sg_masked_lm_labels = torch.cat(
                [all_sg_masked_lm_labels, all_sg_masked_lm_labels[rand_id]], 0)
            all_sg_masked_pos = torch.cat(
                [all_sg_masked_pos, all_sg_masked_pos[rand_id]], 0)
            all_sg_masked_weights = torch.cat(
                [all_sg_masked_weights, all_sg_masked_weights[rand_id]], 0)
            all_tasks = torch.cat([all_tasks, all_tasks[rand_id]], 0)

        logger.info("  Num asc examples = %d", all_asc_input_ids.size(0))
        logger.info("  Num sg examples = %d", all_sg_input_ids.size(0))
        logger.info("  Num ae examples = %d", all_ae_input_ids.size(0))


        valid_data = \
            TensorDataset(all_asc_input_ids,all_asc_segment_ids, all_asc_input_mask,\
            all_sg_input_ids, all_sg_segment_ids, all_sg_input_mask,\
            all_sg_masked_lm_labels,all_sg_masked_pos,all_sg_masked_weights,\
            all_ae_input_ids, all_ae_segment_ids, all_ae_input_mask,all_ae_label_ids,all_asc_label_ids,all_tasks)

        data[t]['valid'] = valid_data

        logger.info("***** Running evaluation *****")

        processor = data_utils.AscProcessor()
        label_list = processor.get_labels()
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        eval_examples = processor.get_test_examples(asc_dataset)
        eval_features = data_utils.convert_examples_to_features_gen(
            eval_examples, label_list, args.max_seq_length, tokenizer, "asc")

        all_asc_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
        all_asc_segment_ids = torch.tensor(
            [f.segment_ids for f in eval_features], dtype=torch.long)
        all_asc_input_mask = torch.tensor(
            [f.input_mask for f in eval_features], dtype=torch.long)
        all_asc_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        all_tasks = torch.tensor([t for f in eval_features], dtype=torch.long)

        #AE for decoder ====================
        processor = data_utils.AeProcessor()
        label_list = processor.get_labels()
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        eval_examples = processor.get_test_examples(ae_dataset)

        eval_features = data_utils.convert_examples_to_features_gen(
            eval_examples, label_list, args.max_seq_length, tokenizer, "ae")

        all_ae_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                        dtype=torch.long)
        all_ae_segment_ids = torch.tensor(
            [f.segment_ids for f in eval_features], dtype=torch.long)
        all_ae_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                         dtype=torch.long)
        all_ae_label_ids = torch.tensor([f.label_id for f in eval_features],
                                        dtype=torch.long)

        #SG (sentence generation) for decoder ====================
        processor = data_utils.SgProcessor()
        label_list = None
        tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
        eval_examples = processor.get_test_examples(asc_dataset)

        mask_source_words = args.mask_source_words
        max_pred = args.max_pred
        mask_prob = args.mask_prob
        skipgram_prb = args.skipgram_prb
        skipgram_size = args.skipgram_size
        mask_whole_word = args.mask_whole_word
        vocab_words = list(tokenizer.vocab.keys())
        indexer = tokenizer.convert_tokens_to_ids

        eval_features = data_utils.convert_examples_to_features_gen(
            eval_examples,
            label_list,
            args.max_seq_length * 2,
            tokenizer,
            "sg",
            mask_source_words=mask_source_words,
            max_pred=max_pred,
            mask_prob=mask_prob,
            skipgram_prb=skipgram_prb,
            skipgram_size=skipgram_size,
            mask_whole_word=mask_whole_word,
            vocab_words=vocab_words,
            indexer=indexer)  #seq2seq task

        all_sg_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                        dtype=torch.long)
        all_sg_segment_ids = torch.tensor(
            [f.segment_ids for f in eval_features], dtype=torch.long)
        all_sg_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                         dtype=torch.long)
        all_sg_masked_lm_labels = torch.tensor(
            [f.masked_lm_labels for f in eval_features],
            dtype=torch.long).squeeze(1)
        all_sg_masked_pos = torch.tensor([f.masked_pos for f in eval_features],
                                         dtype=torch.long).squeeze(1)
        all_sg_masked_weights = torch.tensor(
            [f.masked_weights for f in eval_features], dtype=torch.long)

        ae_length = all_ae_input_ids.size(0)
        while all_ae_input_ids.size(0) < all_sg_input_ids.size(0):
            rand_id = torch.randint(low=0, high=ae_length, size=(1, ))
            all_ae_input_ids = torch.cat(
                [all_ae_input_ids, all_ae_input_ids[rand_id]], 0)
            all_ae_segment_ids = torch.cat(
                [all_ae_segment_ids, all_ae_segment_ids[rand_id]], 0)
            all_ae_input_mask = torch.cat(
                [all_ae_input_mask, all_ae_input_mask[rand_id]], 0)
            all_ae_label_ids = torch.cat(
                [all_ae_label_ids, all_ae_label_ids[rand_id]], 0)

        # Some sentences have sentiment conflicts, so ae can be larger than asc
        asc_length = all_asc_input_ids.size(0)
        while all_asc_input_ids.size(0) < all_ae_input_ids.size(0):
            rand_id = torch.randint(low=0, high=asc_length, size=(1, ))
            all_asc_input_ids = torch.cat(
                [all_asc_input_ids, all_asc_input_ids[rand_id]], 0)
            all_asc_segment_ids = torch.cat(
                [all_asc_segment_ids, all_asc_segment_ids[rand_id]], 0)
            all_asc_input_mask = torch.cat(
                [all_asc_input_mask, all_asc_input_mask[rand_id]], 0)
            all_asc_label_ids = torch.cat(
                [all_asc_label_ids, all_asc_label_ids[rand_id]], 0)
            all_sg_input_ids = torch.cat(
                [all_sg_input_ids, all_sg_input_ids[rand_id]], 0)
            all_sg_segment_ids = torch.cat(
                [all_sg_segment_ids, all_sg_segment_ids[rand_id]], 0)
            all_sg_input_mask = torch.cat(
                [all_sg_input_mask, all_sg_input_mask[rand_id]], 0)
            all_sg_masked_lm_labels = torch.cat(
                [all_sg_masked_lm_labels, all_sg_masked_lm_labels[rand_id]], 0)
            all_sg_masked_pos = torch.cat(
                [all_sg_masked_pos, all_sg_masked_pos[rand_id]], 0)
            all_sg_masked_weights = torch.cat(
                [all_sg_masked_weights, all_sg_masked_weights[rand_id]], 0)
            all_tasks = torch.cat([all_tasks, all_tasks[rand_id]], 0)

        logger.info("  Num asc examples = %d", all_asc_input_ids.size(0))
        logger.info("  Num sg examples = %d", all_sg_input_ids.size(0))
        logger.info("  Num ae examples = %d", all_ae_input_ids.size(0))



        eval_data = \
            TensorDataset(all_asc_input_ids,all_asc_segment_ids, all_asc_input_mask,\
            all_sg_input_ids, all_sg_segment_ids, all_sg_input_mask,\
            all_sg_masked_lm_labels,all_sg_masked_pos,all_sg_masked_weights,\
            all_ae_input_ids, all_ae_segment_ids, all_ae_input_mask,all_ae_label_ids,all_asc_label_ids,all_tasks)

        # Run prediction for full data

        data[t]['test'] = eval_data

        taskcla.append((t, int(data[t]['ncla'])))

    # Others
    n = 0
    for t in data.keys():
        n += data[t]['ncla']
    data['ncla'] = n

    return data, taskcla
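A hedged sketch of how the returned data and taskcla might be consumed; the DataLoader settings and the partial batch unpacking are illustrative assumptions.

from torch.utils.data import DataLoader

# Assumed downstream usage of get(); logger and args are expected to exist as above
data, taskcla = get(logger=logger, args=args)
print('Total number of classes across tasks:', data['ncla'])
for t, ncla in taskcla:
    train_loader = DataLoader(data[t]['train'],
                              batch_size=args.train_batch_size,
                              shuffle=True)
    for batch in train_loader:
        # batch follows the TensorDataset order built in get():
        # asc inputs, sg inputs, sg mask targets, ae inputs, labels, task id
        asc_input_ids, asc_segment_ids, asc_input_mask = batch[:3]
        break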
Beispiel #29
0
 def bert_tokenizer(self):
     tokenizer = BertTokenizer.from_pretrained(self.vocab_file, do_lower_case=True)
     return tokenizer
Beispiel #30
0
    def Get_Bert_Representation(self, examples_train, examples_test):

        train_rep_file = "./data/" + pb.dataset + "_train_" + "bert"
        test_rep_file = "./data/" + pb.dataset + "_test_" + "bert"

        if (os.path.exists(train_rep_file) == True
                and os.path.exists(test_rep_file) == True):
            with open(train_rep_file, 'rb') as file:
                examples_train_rep = pickle.load(file)
                for i, example in enumerate(examples_train):
                    example.bert_mat = examples_train_rep[i]
            with open(test_rep_file, 'rb') as file:
                examples_test_rep = pickle.load(file)
                for i, example in enumerate(examples_test):
                    example.bert_mat = examples_test_rep[i]
        else:
            examples = []
            for example in examples_train:
                examples.append(example)
            for example in examples_test:
                examples.append(example)

            for i, example in enumerate(examples):

                if self.bert_tokenizer is None:
                    self.bert_tokenizer = BertTokenizer.from_pretrained(
                        'bert-base-uncased')

                text = "[CLS] " + example.fgt_channels[0] + " [SEP]"
                text = text.replace("  ", " ")
                tokenized_text = self.bert_tokenizer.tokenize(text)

                indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(
                    tokenized_text)
                segments_ids = [0 for _ in tokenized_text]

                tokens_tensor = torch.tensor([indexed_tokens])
                segments_tensors = torch.tensor([segments_ids])

                if self.bert is None:
                    self.bert = BertModel.from_pretrained('bert-base-uncased')
                    self.bert.eval()

                with torch.no_grad():
                    encoded_layers, _ = self.bert(tokens_tensor,
                                                  segments_tensors)

                    # Average the token matrices over all encoder layers
                    a, b = encoded_layers[0].numpy().shape[1:3]
                    representation = np.zeros((a, b))
                    n_summed = 0
                    for layer in encoded_layers:
                        for words in layer.numpy():
                            representation += words
                            n_summed += 1
                    if n_summed > 0:
                        representation = representation / n_summed

                    representation = list(representation)
                    while (len(representation) < pb.fgt_maxlength):
                        representation.append(np.zeros(b))

                    example.bert_mat = representation[0:pb.fgt_maxlength]

                print("{:.2%}".format(i * 1.0 / len(examples)))
Beispiel #31
0
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
## Download links for the pre-trained weights can be found below
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
home = os.getenv('HOME')

##################################################################
## BERT
##################################################################
## BertTokenizer
tokenizer = BertTokenizer.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased-vocab.txt')  # Load pre-trained model tokenizer (vocabulary)
print(tokenizer.max_len)  # 1000000000000; 512 for not large
print(len(tokenizer.vocab))  # 30522; words
print(type(tokenizer.vocab))  # <class 'collections.OrderedDict'>
print(tokenizer.vocab.get('hello', 0))  # 7592
print(tokenizer.vocab.get('helloworld', 0))  # 0
print(tokenizer.ids_to_tokens.get(7592, 'hello'))  # hello
print(tokenizer.ids_to_tokens.get(75920, 'hello'))  # hello
print(tokenizer.convert_ids_to_tokens([0, 1, 99, 100, 101, 102, 103, 104, 998, 999]))  # ['[PAD]', '[unused0]', '[unused98]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[unused99]', '[unused993]', '!']
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)  # Tokenized input
print(tokenized_text)  # ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']

## Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
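The snippet stops after masking position 8; a minimal sketch of the prediction step it sets up (the model name and segment split below are assumptions in the spirit of the surrounding code):

import torch
from pytorch_pretrained_bert import BertForMaskedLM

# Assumed continuation: predict the masked token back
model = BertForMaskedLM.from_pretrained('bert-large-uncased')  # or a local model archive
model.eval()

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * 7 + [1] * (len(tokenized_text) - 7)  # sentence A ends at the first [SEP]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0, masked_index]).item()
print(tokenizer.convert_ids_to_tokens([predicted_index])[0])  # typically 'henson'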
Beispiel #32
0
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("bert_model",
                        choices=[
                            "bert-base-uncased", "bert-large-uncased",
                            "bert-base-cased", "bert-base-multilingual-cased",
                            "bert-base-multilingual-uncased",
                            "bert-base-chinese"
                        ],
                        help="Variant of pre-trained model.")
    parser.add_argument(
        "language_data",
        nargs="+",
        type=str,
        help="Files with data, name of the file is language code.")
    parser.add_argument("--num-threads", type=int, default=4)
    parser.add_argument("--limit", type=int, default=10000)
    args = parser.parse_args()

    torch.set_num_threads(args.num_threads)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False)
    model = BertModel.from_pretrained(args.bert_model,
                                      output_attentions=True,
                                      keep_multihead_output=True).to(device)
    model.eval()

    languages = []
    entropies = []

    with torch.no_grad():
        for input_file in args.language_data:
            lng_code = input_file.split("/")[-1][:-4]
            print(f"Working on {lng_code}")

            entropies_sums = None
            sentence_count = 0

            for sentence_tensor in text_data_generator(input_file, tokenizer):
                sentence_count += 1
                layer_attentions = model(sentence_tensor.unsqueeze(0))[0]
                head_count = layer_attentions[0].shape[1]

                if entropies_sums is None:
                    entropies_sums = np.zeros(
                        len(layer_attentions) * head_count)

                head_id = 0
                for att_matrices in layer_attentions:
                    for matrix in att_matrices.squeeze(0):
                        entropy = -torch.mean(
                            (matrix * torch.log(matrix + 1e-9)).sum(1))
                        entropies_sums[head_id] += entropy.cpu().numpy()
                        head_id += 1

                if sentence_count >= args.limit:
                    break

            languages.append(lng_code)
            entropies.append(entropies_sums / sentence_count)

    for lng, entropy in zip(languages, entropies):
        formatted_ent = "\t".join([f"{e:.5f}" for e in entropy])
        print(f"{lng}\t{formatted_ent}")