Example #1
def mine_triples(device, input_file, output_file, use_local_model=False):
    if use_local_model:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained("../models/BertForMaskedLM")
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained("../models/GPT2LMHeadModel")
    else:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained(bert_model)
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    """
        'concat': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            DirectTemplate,
            bert
        ),
        'template': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            PredefinedTemplate,
            bert,
            grammar=False,
            template_loc=os.path.join(template_repo, single_templates)
        ),
        'template_grammar': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            PredefinedTemplate,
            bert,
            grammar=True,
            template_loc=os.path.join(template_repo, single_templates)
        ),
    """

    knowledge_miners = {
        'coherency':
        KnowledgeMiner(input_file,
                       device,
                       EnumeratedTemplate,
                       bert,
                       language_model=gpt,
                       template_loc=os.path.join(template_repo,
                                                 multiple_templates),
                       use_local_model=use_local_model)
    }

    for template_type in knowledge_miners:
        predictions = run_experiment(template_type, knowledge_miners)
        triples = knowledge_miners[template_type].sentences.tuples
        scored_samples = list(zip(triples, predictions))
        scored_samples.sort(key=lambda x: x[1], reverse=True)
        # Note: the output file is rewritten for each template type.
        with open(output_file, "w") as f:
            for triple, pred in scored_samples:
                rel, head, tail = triple
                triple = (rel.lower(), head, tail)
                f.write("\t".join(triple) + "\t" + "{:.5f}".format(pred))
                f.write("\n")
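
An illustrative call (file names are placeholders; bert_model, gpt2_model, template_repo, multiple_templates, KnowledgeMiner, and run_experiment must already be defined at module level, as the function assumes):

import torch

mine_triples(torch.device('cpu'), 'candidates.tsv', 'scored_triples.tsv')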
Example #2
class Classifier(torch.nn.Module):
    def __init__(self, hidden_size=768, linear_out=2, batch_first=True):

        super(Classifier, self).__init__()

        self.output_model_file = "lm/pytorch_model.bin"
        self.output_config_file = "lm/config.json"
        self.tokenizer = BertTokenizer.from_pretrained("lm",
                                                       do_lower_case=False)
        self.config = BertConfig.from_json_file(self.output_config_file)
        self.model = BertForMaskedLM(self.config)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.state_dict = torch.load(self.output_model_file,
                                     map_location=self.device)
        self.model.load_state_dict(self.state_dict)
        self.model.to(self.device)
        self.lstm = torch.nn.LSTM(hidden_size, 300)
        self.linear = torch.nn.Linear(300, linear_out)

    def get_embeddings(self, x_instance):
        indexed_tokens = x_instance.tolist()
        break_sentence = indexed_tokens.index(102)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_ids = [0] * (break_sentence + 1)
        segments_ids += [1] * (len(indexed_tokens) - break_sentence - 1)
        segments_tensors = torch.tensor([segments_ids])
        self.model.eval()
        with torch.no_grad():
            encoded_layers, _ = self.model.bert(
                tokens_tensor.to(self.device),
                segments_tensors.to(self.device))
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)
        # Average each token's last four hidden layers into one vector.
        token_vecs_cat = []
        for token in token_embeddings:
            cat_vec = torch.stack((token[-1], token[-2], token[-3], token[-4]))
            mean_vec = torch.mean(cat_vec, 0)
            token_vecs_cat.append(mean_vec)
        token_vecs_cat = torch.stack(token_vecs_cat, dim=0)
        return token_vecs_cat

    def embed_data(self, x):
        entries = []
        for entry in x:
            emb = self.get_embeddings(entry.to(self.device))
            entries.append(emb)
        return torch.stack(entries)

    def forward(self, x):

        h = self.embed_data(x)
        h = h.permute(1, 0, 2)
        output, _ = self.lstm(h)
        pred = self.linear(output)
        pred = pred.permute(1, 0, 2)
        return pred
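
A usage sketch for the classifier above, assuming the fine-tuned checkpoint exists under lm/ and that inputs are already-indexed token sequences containing [SEP] (id 102); the batch below is illustrative:

import torch

clf = Classifier()
# Two toy sequences, already converted to vocabulary ids.
batch = torch.tensor([[101, 2023, 2003, 102, 2307, 102],
                      [101, 2178, 2742, 102, 2919, 102]])
logits = clf(batch)
print(logits.shape)  # (batch, seq_len, linear_out)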
Example #3
def create_bert_for_masked_lm(self, config, input_ids, token_type_ids,
                              input_mask, sequence_labels, token_labels,
                              choice_labels):
    model = BertForMaskedLM(config=config)
    model.eval()
    loss = model(input_ids, token_type_ids, input_mask, token_labels)
    prediction_scores = model(input_ids, token_type_ids, input_mask)
    outputs = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    return outputs
def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer):
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    mask_positions = []
    tokenized_text = tokenizer.tokenize(text)
    top_words_all = []
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    while mask_positions:
        top_words = []
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])

        # Call BERT to calculate unnormalized probabilities for all positions
        model.eval()
        with torch.no_grad():
            predictions = model(tokens_tensor)

        candidates = [] #(word, prob)
        for mask_pos in mask_positions:
            mask_preds = predictions[0, mask_pos, :]

            top_idxs = mask_preds.detach().numpy().argsort()[::-1]
            top_idx = top_idxs[0]
            top_prob = mask_preds[top_idx]
            top_word = tokenizer.ids_to_tokens[top_idx]
            candidates.append((top_word, top_prob.detach().item()))
            top_words_pos = []
            for i in top_idxs[:20]:
                top_words_pos.append((tokenizer.ids_to_tokens[i], mask_preds[i].detach().item()))
            top_words.append(top_words_pos)
        best_candidate = max(candidates, key=lambda x: x[1])
        best_pos = mask_positions[candidates.index(best_candidate)]

        tokenized_text[best_pos] = best_candidate[0]
        mask_positions = [i for i in mask_positions if i != best_pos]

        top_words_all.append(top_words[candidates.index(best_candidate)])

    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return (pred_sent, top_words_all)
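
A usage sketch for the greedy decoder above (assumes the older pytorch_pretrained_bert-style API, where the model call returns raw prediction scores; the function itself additionally needs random and numpy imported at module level; the sentence is illustrative):

import torch
from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
sent, top_words = get_words_for_blank_slow_decode(
    'the _ sat on the _ .', model, tokenizer)
print(sent)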
Example #5
def predict_missing_word(sentence):
    tokenized_text = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)

    masked_index = tokenized_text.index('[MASK]')

    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    return predicted_token
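
Usage sketch (the function expects a module-level tokenizer and a sentence containing [MASK]; pytorch_pretrained_bert-style API assumed):

import torch
from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(predict_missing_word('the man went to the [MASK] to buy milk .'))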
Example #6
def __init__(self, top_k, bert_name):
    self.do_lower_case = "uncased" in bert_name
    self.top_k = top_k
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_name, do_lower_case=self.do_lower_case)
    self.model = BertForMaskedLM.from_pretrained(bert_name)
    self.model.eval()
Example #7
def __init__(self, factorize=True):
    self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.weight_of_phrase = []
    self.weight_of_position = []
    self.weight_average = []
    self.factorize = factorize
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    base_model=None,
                                    base_tokenizer=None,
                                    device="cuda",
                                    chinese=False):
    # Load pre-trained model (weights)
    if base_model is None:
        # Download from huggingface
        if chinese:
            base_model = "bert-base-chinese"
        else:
            base_model = "bert-base-uncased"
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(base_model)
        # Load pre-trained model tokenizer (vocabulary)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(base_model)
    elif model_type == "BertForMaskedLM":
        model = BertForMaskedLM.from_pretrained(base_model)
    else:
        print("[Error]: unsupported model type")
        return None, None

    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local vocab file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
    def __init__(self, model):
        # tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model)

        # Model
        self.bertModel = BertForMaskedLM.from_pretrained(model)
        self.bertModel.eval()
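
An illustrative call to load_pretrained_model_tokenizer above (downloads bert-base-uncased from huggingface on first use):

model, tokenizer = load_pretrained_model_tokenizer(
    model_type="BertForMaskedLM", device="cpu")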
Example #10
def do_ai_madlib(text_with_blanks, blank_token):
    mask_token = '[MASK]'

    bert_version = 'bert-base-cased'
    model = BertForMaskedLM.from_pretrained(bert_version)
    tokenizer = BertTokenizer.from_pretrained(bert_version)

    tokens = tokenizer.tokenize(text_with_blanks)
    mask_idxs = []
    for i in range(0, len(tokens)):
        if tokens[i] == blank_token:
            tokens[i] = mask_token
            mask_idxs.append(i)

    model.eval()
    for i in mask_idxs:
        # convert tokens to their index in the "vocabulary"
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # create a tensor for these indices
        tokens_tensor = torch.tensor([token_ids])
        preds = model(tokens_tensor)[0,i]
        pred_id = torch.argmax(preds).item()
        pred_token = tokenizer.convert_ids_to_tokens([pred_id])[0]
        tokens[i] = pred_token

    for i in mask_idxs:
        tokens[i] = '__' + tokens[i] + '__'
    return ' '.join(tokens).replace(' ##', '')
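
Usage sketch for do_ai_madlib (assumes torch and the pytorch_pretrained_bert classes are imported at module level, as the function requires; the text and blank token are illustrative):

print(do_ai_madlib('The _ chased the _ down the street .', '_'))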
Example #11
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
	"""Generate syntactically similar sentences for each sentence in the dataset.
	For PaInv-Replace
	Returns dictionary of original sentence to list of generated sentences
	"""
	# Use nltk treebank tokenizer and detokenizer
	tokenizer = TreebankWordTokenizer()
	detokenizer = TreebankWordDetokenizer()

	# Stopwords from nltk
	stopWords = list(set(stopwords.words('english')))

	# when we use Bert
	berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
	bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
	bertmodel.eval()

	# num_of_perturb is the number of perturbations to make per word
	dic = {}
	num_sent = 0
	# File from which sentences are read
	with open(dataset, "r") as file:
		for line in file:
			source_sent = line.rstrip("\n")
			# Generating new sentences using BERT
			new_sents = perturb(source_sent, bertmodel, num_of_perturb)
			dic[line] = new_sents
			if new_sents != []:
				num_sent += 1
	return dic
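
The function relies on a perturb helper that is not shown. A minimal sketch of what such a helper could look like, assuming it masks one position at a time and keeps BERT's top predictions (the logic and names below are assumptions, not the original implementation):

import torch
from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer


def perturb(sentence, bertmodel, num_of_perturb):
    # Hypothetical sketch: mask each position of the BERT-tokenized
    # sentence and substitute the top predicted tokens.
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    tokens = tokenizer.tokenize(sentence)
    new_sentences = []
    for pos in range(len(tokens)):
        masked = list(tokens)
        masked[pos] = '[MASK]'
        ids = torch.tensor([tokenizer.convert_tokens_to_ids(masked)])
        with torch.no_grad():
            preds = bertmodel(ids)  # (1, seq_len, vocab_size)
        _, top_ids = torch.topk(preds[0, pos], num_of_perturb)
        for new_id in top_ids.tolist():
            new_token = tokenizer.convert_ids_to_tokens([new_id])[0]
            if new_token != tokens[pos]:
                candidate = list(tokens)
                candidate[pos] = new_token
                new_sentences.append(' '.join(candidate).replace(' ##', ''))
    return new_sentences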
Example #12
def example_get_lm(tokens_tensor, segments_tensors, tokenizer):
    '''how to use BertForMaskedLM'''
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    # If you have a GPU, put everything on cuda
    if torch.cuda.is_available():
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')
        model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)

    masked_index = 8
    # confirm we were able to predict 'henson'
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    print("predicted_index", predicted_index)
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print("predicted_token", predicted_token)
    # assert predicted_token == 'henson'
    return
Example #13
def predict():
    # Load pre-trained model with masked language model head
    bert_version = 'bert-large-uncased'
    model = BertForMaskedLM.from_pretrained(bert_version)

    # Preprocess text
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    tokenized_text = tokenizer.tokenize(text[idx])
    mask_positions = []
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Predict missing words from left to right
    model.eval()
    for mask_pos in mask_positions:
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])
        # Call BERT to predict token at this position
        predictions = model(tokens_tensor)[0, mask_pos]
        predicted_index = torch.argmax(predictions).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        # Update text
        tokenized_text[mask_pos] = predicted_token

    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"

    madlib = (' '.join(tokenized_text).replace(' ##', ''))
    bottom = Text(madlibsframe, height=10, width=50, wrap=WORD)
    bottom.configure(font=("Times New Roman", 18, "bold"))
    bottom.insert(END, madlib)
    bottom.pack()
def guess_single_word(text):
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    print(indexed_tokens)
    masked_index = tokenized_text.index('[MASK]')
    print(masked_index)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    print(tokens_tensor, segments_tensors)

    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_path)
    model.eval()

    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)

    print(predictions.shape)
    pre_idxs = get_top_n_idx(predictions[0, masked_index], 5)
    print(pre_idxs)
    print(tokenizer.convert_ids_to_tokens(np.asarray(pre_idxs)))

    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    print(predicted_index)
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    print(predicted_token)
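
guess_single_word relies on a get_top_n_idx helper that is not shown here; a plausible stand-in via torch.topk (an assumption, not the original code):

import torch


def get_top_n_idx(scores, n):
    # Hypothetical helper: indices of the n largest scores.
    _, idxs = torch.topk(scores, n)
    return idxs.tolist()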
Example #15
    def __init__(self, model_path, tokenizer_path):
        super(Bert, self).__init__()
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)
Example #16
def __init__(self, segment_size, output_size, dropout):
    super(BertPunc, self).__init__()
    self.bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
    self.bert_vocab_size = 30522  # vocabulary size of bert-base-uncased
    self.bn = nn.BatchNorm1d(segment_size * self.bert_vocab_size)
    self.fc = nn.Linear(segment_size * self.bert_vocab_size, output_size)
    self.dropout = nn.Dropout(dropout)
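
The __init__ above only builds the layers; a forward pass consistent with those shapes might look like this (a sketch assuming the per-token MLM logits are flattened and classified jointly, not necessarily the original implementation):

def forward(self, x):
    # x: (batch, segment_size) token ids.
    x = self.bert(x)                       # (batch, segment_size, vocab_size)
    x = x.view(x.shape[0], -1)             # flatten token logits
    x = self.fc(self.dropout(self.bn(x)))  # (batch, output_size)
    return x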
Example #17
def mine_from_wikipedia(hardware):
    print('loading BERT...')
    bert = BertForMaskedLM.from_pretrained(bert_model)
    print('loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)

    knowledge_miners = {
        'concat':
        KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                       DirectTemplate, bert),
        'template':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       PredefinedTemplate,
                       bert,
                       grammar=False,
                       template_loc=template_repo + single_templates),
        'template_grammar':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       PredefinedTemplate,
                       bert,
                       grammar=True,
                       template_loc=template_repo + single_templates),
        'coherency':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       EnumeratedTemplate,
                       bert,
                       language_model=gpt,
                       template_loc=template_repo + multiple_templates)
    }

    for template_type in knowledge_miners:
        run_experiment(template_type, knowledge_miners)
Example #18
def __init__(self, use_gpu=False):
    self.tokenizer = BertTokenizer.from_pretrained(DEFAULT_BERT_WEIGHTS)
    self.model = BertForMaskedLM.from_pretrained(DEFAULT_BERT_WEIGHTS)
    self.model.eval()
    use_gpu = use_gpu and torch.cuda.is_available()
    self.device = torch.device("cuda" if use_gpu else "cpu")
    self.model.to(self.device)
Example #20
def __init__(self):
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    self.bertmodel = 'bert-large-uncased'
    self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
    self.model = BertForMaskedLM.from_pretrained(self.bertmodel).to(
        self.device)
    self.model.eval()
Example #21
def initialize_bert_corrector(self):
    t1 = time.time()
    self.bert_tokenizer = BertTokenizer(self.bert_model_vocab)
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    print("Loaded model: %s, vocab file: %s, spend: %.3f s." %
          (self.bert_model_dir, self.bert_model_vocab, time.time() - t1))
    self.initialized_bert_corrector = True
def load_model(modeldir):
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(modeldir)
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained(modeldir)
    model.eval()
    model.to('cuda')
    return model, tokenizer
Example #23
    def __init__(self, train_data=None, dev_data=None):
        super().__init__(train_data, dev_data)

        model_name = 'bert-base-uncased'
        self.bert = BertForMaskedLM.from_pretrained(model_name)
        self.bert.to('cuda')
        self.tokenizer = tokenization.BertTokenizer.from_pretrained(model_name)
        self.bert.eval()
Example #24
def init(self, model_type, model_dir):
    self.model_type = model_type
    self.model_dir = model_dir
    self.tokenizer = BertTokenizer.from_pretrained(model_type,
                                                   cache_dir=model_dir)
    self.model = BertForMaskedLM.from_pretrained(model_type,
                                                 cache_dir=model_dir)
    self.model.eval()
    if self.gpu:
        self.model.to("cuda")
Example #25
    def __init__(self, weight_name='bert-base-uncased'):
        self.tokenizer = BertTokenizerFast.from_pretrained(weight_name,
                                                           do_lower_case=True)
        self.model = BertForMaskedLM.from_pretrained(weight_name)
        self.loss_fct = torch.nn.CrossEntropyLoss()

        self.device = self.get_device()
        self.model = self.model.to(self.device)
        self.model.eval()
Example #26
def load_model(device):
    global model, modelp
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()
    model.to(device)

    modelp = BertForMaskedLM.from_pretrained('bert-base-uncased')
    modelp.eval()
    modelp.to(device)
Example #27
def __init__(self,
             model_name: str = 'bert-base-uncased',
             do_lower_case: bool = True):
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = BertTokenizer.from_pretrained(
        model_name, do_lower_case=do_lower_case)
    # Load pre-trained model (weights)
    self.model = BertForMaskedLM.from_pretrained(model_name)
    self.model.eval()
Example #28
def load_bert():
    global bert_tok, bert
    if bert is None:
        bert_model_str = os.getenv(
            'BERT_MODEL', default='bert-base-uncased'
        )  # 'bert-base-uncased', 'bert-base-multilingual-uncased'
        bert_tok = BertTokenizer.from_pretrained(bert_model_str)
        bert = BertForMaskedLM.from_pretrained(bert_model_str)
        bert.eval()
Example #29
def initialize_bert_corrector(self):
    t1 = time.time()
    self.bert_tokenizer = BertTokenizer(self.bert_model_vocab)
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids([MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - t1))
    self.initialized_bert_corrector = True
Example #30
def loadBERT():
  global tokenizer
  global model
  print("Loading BERT")
  # Load pre-trained model tokenizer (vocabulary)
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  # Load pre-trained model (weights)
  model = BertForMaskedLM.from_pretrained('bert-base-uncased')
  model.eval()
  print("Done")
model = BertModel.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()

## Predict hidden states features for each layer
print(tokens_tensor.shape)  # torch.Size([1, 14])
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
## We have hidden states for each of the 24 layers in model bert-large-uncased
print(len(encoded_layers))  # 24
print(encoded_layers[0].shape)  # torch.Size([1, 14, 1024])

##################################################################
## BertForMaskedLM
model = BertForMaskedLM.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()

## Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
print(predictions.shape)  # torch.Size([1, 14, 30522])

## confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item(); print(predicted_index)  # 27227
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)  # ['henson']

##################################################################
## OpenAI GPT2
##################################################################