Example #1
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    device="cuda",
                                    config=None):
    bert_model = config['bert_model']
    # Load pre-trained model (weights)
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(bert_model,
                                                              num_labels=2)
        # Load pre-trained model tokenizer (vocabulary)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(bert_model)
    elif model_type == "specific_shared":
        model = SpecificShared(config)
    elif model_type == "siamese_bert":
        model = SiameseBert(config)
    elif model_type == "n_bert":
        model = nBert(config)
    elif model_type == "bert_sts":
        model = BertSts(config)
    elif model_type == "bert_fine_tune":
        model = BertFineTune(config)
    else:
        print("[Error]: unsupported model type")
        return None, None

    tokenizer = BertTokenizer.from_pretrained(bert_model)
    model.to(device)
    print("Initialized model and tokenizer")
    return model, tokenizer
Example #2
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    base_model=None,
                                    base_tokenizer=None,
                                    device="cuda",
                                    chinese=False,
                                    num_labels=2):
    # Load pre-trained model (weights)
    if base_model is None:
        if chinese:
            base_model = "bert-base-chinese"
        else:
            base_model = "bert-base-uncased"
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(
            base_model, num_labels=num_labels)
        # Load pre-trained model tokenizer (vocabulary)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(base_model)
    elif model_type == "BertForTokenClassification":
        model = BertForTokenClassification.from_pretrained(
            base_model, num_labels=num_labels)
    elif model_type == "BertMSE":
        model = BertMSE()
    else:
        print("[Error]: unsupported model type")
        return None, None

    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
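A minimal usage sketch for the loader above (illustrative only; it assumes a CUDA device is available and that the referenced pytorch_pretrained_bert classes are imported):

# Illustrative call of the loader defined above.
model, tokenizer = load_pretrained_model_tokenizer(
    model_type="BertForNextSentencePrediction", device="cuda", chinese=False)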
Example #3
    def load(path, new_args=None):
        params = torch.load(path, map_location=lambda storage, loc: storage)
        args = params['args']
        print('new args', new_args)

        args.rl_baseline = new_args.rl_baseline
        args.hrl = new_args.hrl
        args.ngram = new_args.ngram
        args.rl_weight = new_args.rl_weight
        args.reward_alpha = new_args.reward_alpha
        args.avoid_ngram = new_args.avoid_ngram
        args.optim = new_args.optim
        args.cuda = new_args.cuda
        args.rl_bleu = new_args.rl_bleu
        args.rl_f1 = new_args.rl_f1
        args.rl_relevance = new_args.rl_relevance
        args.rl_reward = new_args.rl_reward
        args.rl_relevance_weight = new_args.rl_relevance_weight
        args.rl_relevance_beta = new_args.rl_relevance_beta
        args.rl_expressiveness = new_args.rl_expressiveness
        args.rl_coherence = new_args.rl_coherence
        args.bert_weight_path = new_args.bert_weight_path
        args.bert_vocab_path = new_args.bert_vocab_path
        args.decode_len_constraint = new_args.decode_len_constraint
        if new_args.new_vocab is not None:
            print('loading new vocab from ' + new_args.new_vocab)
            vocab = torch.load(new_args.new_vocab)
        else:
            vocab = params['decoder_vocab']
        # model = VistModel.build_model(args, vocab)
        model = VistModel(args, vocab)
        try:
            model.encoder.load_state_dict(params['encoder_state_dict'])
        except KeyError:
            print('**** Warning: state dict is missing parameters ****')

        try:
            print('decoder params', params['decoder_state_dict'].keys())
            model.decoder.load_state_dict(params['decoder_state_dict'],
                                          strict=False)
        except KeyError:
            print('**** Warning: state dict is missing parameters ****')
            params_name = params['decoder_state_dict'].keys()
            for n, p in model.named_parameters():
                if n not in params_name:
                    print('uniformly initializing new parameter %s in [%f, %f]' %
                          (n, -args.uniform_init, args.uniform_init))
                    p.data.uniform_(-args.uniform_init, args.uniform_init)
        if 'coherence' in args.rl_reward:
            model.bert_tokenizer = BertTokenizer.from_pretrained(
                new_args.bert_vocab_path)
            model.bert_nsp = BertForNextSentencePrediction.from_pretrained(
                new_args.bert_weight_path)
            model.bert_nsp.eval()

        return model
Example #4
def score_nsp(file, bert_weight, bert_vocab):
    pairs = []
    for l in open(file):
        sents = l.strip().split('\t')[1].strip().split('.')
        for i in range(len(sents)):
            pre = "" if i == 0 else sents[i - 1].strip() + ' . '
            cur = sents[i].strip() + ' .'
            pairs.append("[CLS] {}[SEP] {} [SEP]".format(pre, cur))

    bert_tokenizer = BertTokenizer.from_pretrained(bert_vocab)
    bert_nsp = BertForNextSentencePrediction.from_pretrained(bert_weight)
    bert_nsp.eval()
    scores = get_nsp(pairs, bert_tokenizer, bert_nsp)
    # print('scores', scores)
    for s, p in zip(pairs, scores):
        print('s={}, p={}'.format(s, p))
    return np.mean(scores)
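score_nsp depends on a get_nsp helper that is not shown here; a minimal sketch of what it might look like, assuming each pair keeps the "[CLS] ... [SEP] ... [SEP]" format built above (the helper name is taken from the call, its body is an assumption):

import torch


def get_nsp(pairs, tokenizer, model):
    # Return P(second segment follows the first) for every formatted pair.
    probs = []
    with torch.no_grad():
        for pair in pairs:
            tokens = tokenizer.tokenize(pair)
            # Segment 0 covers everything up to and including the first [SEP],
            # segment 1 covers the rest.
            first_sep = tokens.index('[SEP]')
            segment_ids = ([0] * (first_sep + 1) +
                           [1] * (len(tokens) - first_sep - 1))
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            logits = model(torch.tensor([input_ids]),
                           torch.tensor([segment_ids]))
            # Index 0 of the softmax is the "IsNext" class.
            probs.append(
                torch.nn.functional.softmax(logits, dim=-1)[0][0].item())
    return probs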
Example #5
    def model_init(self, model='bert-base-chinese'):
        """
        Initialize the model and tokenizer.

        >>> model_init(model='bert-base-chinese')
        """
        self.tokeniser = BertTokenizer.from_pretrained(model)
        self.model = BertForNextSentencePrediction.from_pretrained(model)
        self.model.eval()
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        print('use', self.device)

        self.model.to(self.device)
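The class this method belongs to is not shown; a hypothetical companion method, sketched here only to illustrate how the initialized tokeniser, model, and device could be used to score a sentence pair (the method name and its body are assumptions, not part of the original class):

    def next_sentence_prob(self, sentence_a, sentence_b):
        # Hypothetical helper: probability that sentence_b follows sentence_a.
        tokens = self.tokeniser.tokenize(
            '[CLS] ' + sentence_a + ' [SEP] ' + sentence_b + ' [SEP]')
        first_sep = tokens.index('[SEP]')
        segment_ids = ([0] * (first_sep + 1) +
                       [1] * (len(tokens) - first_sep - 1))
        input_ids = self.tokeniser.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([input_ids]).to(self.device)
        segments_tensor = torch.tensor([segment_ids]).to(self.device)
        with torch.no_grad():
            logits = self.model(tokens_tensor, segments_tensor)
        return torch.nn.functional.softmax(logits, dim=-1)[0][0].item()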
Example #6
def load_pretrained_model_tokenizer(base_model=None, base_tokenizer=None, device='cuda'):
    if device == 'cuda':
        assert torch.cuda.is_available()

    # Load pre-trained model (weights)
    if base_model is None:
        # Download from huggingface
        base_model = 'bert-base-uncased'
    model = BertForNextSentencePrediction.from_pretrained(base_model)

    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local vocab file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
Example #7
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    device="cuda",
                                    chinese=False):
    # Load pre-trained model (weights)
    if chinese:
        base_model = "bert-base-chinese"
    else:
        base_model = "bert-base-uncased"
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(base_model)
        # Load pre-trained model tokenizer (vocabulary)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(base_model)
    else:
        print("[Error]: unsupported model type")
        return None, None

    tokenizer = BertTokenizer.from_pretrained(base_model)
    model.to(device)
    return model, tokenizer
Example #8
def load_pretrained_model_tokenizer(base_model=None,
                                    base_tokenizer=None,
                                    device='cuda',
                                    chinese=False):
    # Load pre-trained model (weights)
    if base_model is None:
        # Download from huggingface
        if chinese:
            base_model = 'bert-base-chinese'
        else:
            base_model = 'bert-base-uncased'
    model = BertForNextSentencePrediction.from_pretrained(base_model)

    # Load pre-trained model tokenizer (vocabulary)
    if base_tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(
            base_model)  # Download from huggingface
    else:
        tokenizer = BertTokenizer.from_pretrained(
            base_tokenizer)  # Load local file
    model.to(device)
    return model, tokenizer
Example #9
def start_inference(batch_count, bert_model, data_type):
    torch.manual_seed(10)

    if data_type == 'testing':
        data_location = '../data/testing_db.csv'
    else:
        data_location = '../data/validation_db.csv'

    tokenizer = BertTokenizer.from_pretrained(bert_model)
    model = BertForNextSentencePrediction.from_pretrained(bert_model)
    model.eval()
    model.cuda()

    if batch_count != -1:
        batch_size = 125000
        df = pd.read_csv(data_location,
                         skiprows=range(1, batch_count * batch_size + 1),
                         nrows=batch_size)
        print(
            f'About to process batch number {batch_count}, which contains {df.shape[0]} samples.'
        )
    else:
        df = pd.read_csv(data_location)

    normal_probs_single, db_probs_single = [], []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        context = row.context  #.split('#_new_utterance_#')[-1]
        normal_probs_single.append(
            bert_prediction(context, row.response, model, tokenizer))
        db_probs_single.append(
            bert_prediction(context, row.db_response_new, model, tokenizer))

    with open(f'{data_type}_full_normal.pkl', 'wb') as f:
        pickle.dump(normal_probs_single, f)

    with open(f'{data_type}_full_db.pkl', 'wb') as f:
        pickle.dump(db_probs_single, f)
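bert_prediction is not defined in this excerpt; a hypothetical sketch, assuming it scores how likely the response is to follow the context with the NSP head and that the model already lives on the GPU (no truncation is done here, so inputs longer than BERT's 512-token limit would still need handling):

import torch


def bert_prediction(context, response, model, tokenizer):
    # Hypothetical helper: probability that `response` follows `context`.
    tokens_a = tokenizer.tokenize(context)
    tokens_b = tokenizer.tokenize(response)
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([input_ids]).cuda()
    segments_tensor = torch.tensor([segment_ids]).cuda()
    with torch.no_grad():
        logits = model(tokens_tensor, segments_tensor)
    return torch.nn.functional.softmax(logits, dim=-1)[0][0].item()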
Example #10
#sentence2 = "It is possible that by making more research more available, online searching could conceivably broaden the work cited and lead researchers, as a collective, away from the “core” journals of their fields and to dispersed but individually relevant work."
sentence2 = "I will show, however, that even as deeper journal back issues became available online, scientists and scholars cited more recent articles; even as more total journals became available online, fewer were cited."
text = sentence1 + " " + sentence2
tokenized_s1 = tokenizer.tokenize(sentence1)

#text = "Who was Jim Morrison ? Jim Morrison was a puppeteer"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(indexed_tokens)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [1] * len(tokenized_text)

for x in range(len(tokenized_s1)):
    segments_ids[x] = 0

#segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.eval()

# Predict is Next Sentence ?
predictions = model(tokens_tensor, segments_tensors)
print(predictions[0].data.tolist())
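The values printed above are raw logits; a short follow-up sketch that turns them into a probability (index 0 corresponds to the "IsNext" class):

probs = torch.nn.functional.softmax(predictions, dim=-1)
print('P(sentence2 follows sentence1) = {:.4f}'.format(probs[0][0].item()))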
Example #11
output_dir = "/home/terry/pan/github/bert/model/bert-base-chinese/"

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
import os

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForNextSentencePrediction, WEIGHTS_NAME, CONFIG_NAME

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

#model = BertModel.from_pretrained('bert-base-chinese')

model = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)
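Because the weights, configuration, and vocabulary were saved under the standard file names, the directory can be loaded back with from_pretrained; a minimal sketch:

# Reload the saved model and vocabulary from the output directory.
model = BertForNextSentencePrediction.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)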
Example #12
def guess(input_text, use355M, iteration):
    nltk.download('punkt')

    next_text = ''
    checkpoint_dir = ''
    if use355M:
        checkpoint_dir = 'tf_model/355M_diary'
    else:
        checkpoint_dir = 'tf_model/124M_diary'

    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess, checkpoint_dir=checkpoint_dir)

    sents = []
    for i in range(iteration):
        text = gpt2.generate(sess,
                             return_as_list=True,
                             checkpoint_dir=checkpoint_dir,
                             length=200,
                             prefix=input_text,
                             truncate="<|endoftext|>")
        text = text[0]

        input_len = len(nltk.sent_tokenize(input_text))
        temp = nltk.sent_tokenize(text)
        sents += temp[input_len:-1]

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Load pre-trained model (weights)
    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    predicts_text = []

    for sent in sents:
        next_text = sent

        # Tokenized input
        text = "[CLS] " + input_text + " [SEP] " + next_text + " [SEP]"
        tokenized_text = tokenizer.tokenize(text)

        # Convert token to vocabulary indices
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
        len_1 = len(tokenizer.tokenize(input_text)) + 2  # [CLS] & [SEP]
        len_2 = len(tokenizer.tokenize(next_text)) + 1  # [SEP]
        segments_ids = len_1 * [0] + len_2 * [1]

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # Predict is Next Sentence ?
        predictions = model(tokens_tensor, segments_tensors)

        predicts_text.append((predictions[0][0].item(), next_text))

    final_shuang = sorted(predicts_text, key=lambda x: x[0], reverse=True)
    if len(predicts_text) < 3:
        guess1 = " \n" + final_shuang[0][1]
        guess2 = "Then maybe: \n" + final_shuang[1][1]
        guess3 = ""
        print("OH F**K")
    else:
        guess1 = final_shuang[0][1]
        guess2 = final_shuang[1][1]
        guess3 = final_shuang[2][1]

    return guess1, guess2, guess3
Example #13
def run_bert_ns(data, year, predictions_dict):
    """
    Train the BERT LM_experiments for the Next sentence prediction
    :param data: The actual data of the year stored on dictionary
    :param year: The corresponding year of the data. It is used when we save the predictions
    :param predictions_dict: A dict where we save the predictions from our experiments
    :return:
    """

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    vocab_size = len(tokenizer.vocab)

    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    model.to('cuda')

    # It is used when we normalize the predicted probabilities of LM_experiments to [0, 1]
    soft = Softmax(dim=-1)

    for doc_id, doc in data.items():

        for peer_id, peer in doc['peer_summarizers'].items():
            summary = peer['system_summary']

            if not_valid(peer_id=peer_id, doc_id=doc_id):
                predictions_dict[year][doc_id][peer_id]['BERT_NS'] = vocab_size
                continue

            with torch.no_grad():
                if summary != '':
                    summary_sentences = sent_tokenize(summary)
                    tokenized_sentences = tokenize_sentences(
                        sentences=summary_sentences, tokenizer=tokenizer)
                    sentences_ids = convert_sentences(
                        sentences=tokenized_sentences, tokenizer=tokenizer)

                    log_probabilities = []
                    for i in range(len(sentences_ids) - 1):
                        indexed_tokens = sentences_ids[i] + sentences_ids[i + 1]
                        tokens_tensor = torch.tensor([indexed_tokens])
                        tokens_tensor = tokens_tensor.to('cuda')

                        segments_ids = [0] * len(sentences_ids[i]) + [1] * len(
                            sentences_ids[i + 1])
                        segments_tensor = torch.tensor([segments_ids])
                        segments_tensor = segments_tensor.to('cuda')

                        # Predict the next sentence and normalize the prediction
                        predictions = model(tokens_tensor, segments_tensor)
                        predictions = soft(predictions)

                        # [0][0]: probability that the next sentence actually follows
                        # [0][1]: probability that it does not follow
                        p = predictions[0][0].item()
                        log_probabilities.append(math.log(p, 2))

                    if len(log_probabilities) != 0:
                        mean_of_probabilities = np.mean(
                            np.array(log_probabilities))
                        perplexity = math.pow(2, -mean_of_probabilities)

                    else:
                        perplexity = math.pow(
                            2, 0)  # the whole summary is a single sentence

                    predictions_dict[year][doc_id][peer_id][
                        'BERT_NS'] = perplexity

                else:
                    print('BLANK')
                    predictions_dict[year][doc_id][peer_id][
                        'BERT_NS'] = vocab_size

    # Saves the predictions on prediction_dict that holds all the predictions of the experiments
    predictions_path = os.path.join(OUTPUT_DIR, 'predictions of models.json')
    with open(predictions_path, 'w') as of:
        json.dump(obj=predictions_dict, fp=of, sort_keys=True, indent=4)

    return predictions_dict
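tokenize_sentences and convert_sentences are project helpers that are not shown; hypothetical sketches of what they might do (these versions omit [CLS]/[SEP] markers, which the original helpers may well add):

def tokenize_sentences(sentences, tokenizer):
    # Tokenize each summary sentence with the BERT wordpiece tokenizer.
    return [tokenizer.tokenize(sentence) for sentence in sentences]


def convert_sentences(sentences, tokenizer):
    # Map each tokenized sentence to its vocabulary ids.
    return [tokenizer.convert_tokens_to_ids(tokens) for tokens in sentences]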
Example #14
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForNextSentencePrediction

CACHE_DIR = 'cache/'
BERT_MODEL = 'model.tar.gz'

# Use only for the pre-trained model
# -------------------------------------------------------------------------------------/
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# model = BertForNextSentencePrediction.from_pretrained('bert-large-uncased')
# model.eval()
# -------------------------------------------------------------------------------------/

# Use only for the fine-tuned model
# -------------------------------------------------------------------------------------/
tokenizer = BertTokenizer.from_pretrained('vocab.txt')
model = BertForNextSentencePrediction.from_pretrained(CACHE_DIR + BERT_MODEL,
                                                      cache_dir=CACHE_DIR)
model.eval()
# -------------------------------------------------------------------------------------/

# ### AllenNLP - Loading

# In[3]:

#**************************************************************************************/
#    Title: AllenNLP
#    Author: AllenAI
#    Date: 2019
#    Code version: #3032
#    Availability: https://github.com/allenai/allennlp/blob/master/allennlp/predictors/decomposable_attention.py
#**************************************************************************************/
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        help='Batch size for inference')

    parser.add_argument(
        '--bert_model',
        default='bert-base-cased',
        type=str,
        help=
        'Bert pre-trained model selected, e.g. bert-base-uncased, bert-large-uncased, bert-base-multilingual-cased, bert-base-chinese'
    )
    parser.add_argument(
        '--max_seq_length',
        default=128,
        type=int,
        help='Maximum total input sequence length after tokenization')

    args = parser.parse_args()

    input_ids = torch.zeros([args.batch_size, args.max_seq_length],
                            dtype=torch.long)
    token_type_ids = torch.zeros([args.batch_size, args.max_seq_length],
                                 dtype=torch.long)

    # Export various BERT models
    # Note: For argument definitions used here see modeling.py from pytorch-pretrained-bert
    #       repository
    #
    # Fully trained models
    model = BertModel.from_pretrained(args.bert_model)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.onnx')

    model = BertForMaskedLM.from_pretrained(args.bert_model)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_maskedlm_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.onnx')

    model = BertForNextSentencePrediction.from_pretrained(args.bert_model)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_nextsentence_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.onnx')

    model = BertForPreTraining.from_pretrained(args.bert_model)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_pretraining_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.onnx')

    # Partially trained models
    model = BertForSequenceClassification.from_pretrained(args.bert_model, 2)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_classify_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')

    model = BertForTokenClassification.from_pretrained(args.bert_model, 2)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_tokenclassify_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')

    # Returns an error on ONNX export ("squeeze with negative axis -1 might cause onnx model to be incorrect"), so commented out.
    #
    # model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    # torch.onnx.export(model,(input_ids,token_type_ids),'bert_question_'+'batch'+str(args.batch_size)+'_'+args.bert_model+'.untrained.onnx')

    choices = 2
    input_ids = torch.zeros([args.batch_size, choices, args.max_seq_length],
                            dtype=torch.long)
    token_type_ids = torch.zeros(
        [args.batch_size, choices, args.max_seq_length], dtype=torch.long)
    model = BertForMultipleChoice.from_pretrained(args.bert_model, choices)
    torch.onnx.export(
        model, (input_ids, token_type_ids), 'bert_multiplechoice_' + 'batch' +
        str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')
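A short follow-up sketch for sanity-checking one of the exported graphs with the onnx package (illustrative; the file name matches the naming scheme above for the default batch size and model):

import onnx

# Load an exported graph and run the ONNX structural checker on it.
onnx_model = onnx.load('bert_nextsentence_batch1_bert-base-cased.onnx')
onnx.checker.check_model(onnx_model)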
Example #16
import ingest
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertAdam, BertForNextSentencePrediction
from tqdm import tqdm, trange
import torch
from torch.nn import CrossEntropyLoss

MAX_LEN = 90
tokenizer = BertTokenizer.from_pretrained('data/mini/vocab.txt',
                                          do_lower_case=True)
model = BertForNextSentencePrediction.from_pretrained('data/mini/')


def tokenize(sentence):
    # tokenize the paragraph
    sentence = tokenizer.tokenize(sentence)
    # convert each token to its vocab id
    sentence = tokenizer.convert_tokens_to_ids(sentence)
    return sentence


def transform(data, returnAsPair=False):
    transformedData = []
    for dataPoint in data:
        # prestory = ["[CLS] " + query + " [SEP]" for query in dataPoint.inputSentences]
        prestory = " [SEP] ".join(dataPoint.inputSentences)
        prestory = "[CLS] " + prestory + " [SEP] "

        prestory = tokenize(prestory)