Example 1
def mine_triples(device, input_file, output_file, use_local_model=False):
    if use_local_model:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained("../models/BertForMaskedLM")
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained("../models/GPT2LMHeadModel")
    else:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained(bert_model)
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
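    # Alternative miners from the source script, left disabled inside a string: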
    """
        'concat': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            DirectTemplate,
            bert
        ),
        'template': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            PredefinedTemplate,
            bert,
            grammar=False,
            template_loc=os.path.join(template_repo, single_templates)
        ),
        'template_grammar': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            PredefinedTemplate,
            bert,
            grammar=True,
            template_loc=os.path.join(template_repo, single_templates)
        ),
    """

    knowledge_miners = {
        'coherency':
        KnowledgeMiner(input_file,
                       device,
                       EnumeratedTemplate,
                       bert,
                       language_model=gpt,
                       template_loc=os.path.join(template_repo,
                                                 multiple_templates),
                       use_local_model=use_local_model)
    }

    # Open the output file once so that results from every miner land in the
    # same file (re-opening in "w" mode inside the loop would overwrite them).
    with open(output_file, "w") as f:
        for template_type in knowledge_miners:
            predictions = run_experiment(template_type, knowledge_miners)
            triples = knowledge_miners[template_type].sentences.tuples
            scored_samples = list(zip(triples, predictions))
            scored_samples.sort(key=lambda x: x[1], reverse=True)
            for triple, pred in scored_samples:
                rel, head, tail = triple
                triple = (rel.lower(), head, tail)
                f.write("\t".join(triple) + "\t" + "{:.5f}".format(pred))
                f.write("\n")
Example 2
 def __init__(self, factorize=True):
     self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.weight_of_phrase = []
     self.weight_of_position = []
     self.weight_average = []
     self.factorize = factorize
Example 3
def do_ai_madlib(text_with_blanks, blank_token):
    mask_token = '[MASK]'

    bert_version = 'bert-base-cased'
    model = BertForMaskedLM.from_pretrained(bert_version)
    tokenizer = BertTokenizer.from_pretrained(bert_version)

    tokens = tokenizer.tokenize(text_with_blanks)
    mask_idxs = []
    for i in range(len(tokens)):
        if tokens[i] == blank_token:
            tokens[i] = mask_token
            mask_idxs.append(i)

    model.eval()
    for i in mask_idxs:
        # convert tokens to their index in the "vocabulary"
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # create a tensor for these indices
        tokens_tensor = torch.tensor([token_ids])
        # score the vocabulary at position i; no gradients needed at inference
        with torch.no_grad():
            preds = model(tokens_tensor)[0, i]
        pred_id = torch.argmax(preds).item()
        pred_token = tokenizer.convert_ids_to_tokens([pred_id])[0]
        tokens[i] = pred_token

    for i in mask_idxs:
        tokens[i] = '__' + tokens[i] + '__'
    return ' '.join(tokens).replace(' ##', '')
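A quick sanity check for do_ai_madlib, assuming torch, BertForMaskedLM and BertTokenizer are already imported; the sentence and blank token below are made up for illustration:

# Hypothetical call: each '_' is filled in by BERT and wrapped in underscores.
print(do_ai_madlib('The _ sat on the _ .', '_'))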
Example 4
    def __init__(self, model):
        # tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model)

        # Model
        self.bertModel = BertForMaskedLM.from_pretrained(model)
        self.bertModel.eval()
Example 5
def guess_single_word(text):
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    print(indexed_tokens)
    masked_index = tokenized_text.index('[MASK]')
    print(masked_index)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    print(tokens_tensor, segments_tensors)

    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_path)
    model.eval()

    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)

    print(predictions.shape)
    pre_idxs = get_top_n_idx(predictions[0, masked_index], 5)
    print(pre_idxs)
    print(tokenizer.convert_ids_to_tokens(np.asarray(pre_idxs)))

    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    print(predicted_index)
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    print(predicted_token)
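guess_single_word relies on a module-level tokenizer, pretrained_model_path and a get_top_n_idx helper, none of which are shown here; given those, a call might look like this (the sentence is illustrative only):

# Hypothetical usage; prints the tokenization, the top-5 candidate ids,
# and finally the most likely token for the [MASK] position.
guess_single_word('[CLS] The capital of France is [MASK] . [SEP]')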
Example 6
def example_get_lm(tokens_tensor, segments_tensors, tokenizer):
    '''how to use BertForMaskedLM'''
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    # Put everything on CUDA (this snippet assumes a GPU is available)
    tokens_tensor = tokens_tensor.to('cuda')
    segments_tensors = segments_tensors.to('cuda')
    model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)

    masked_index = 8
    # confirm we were able to predict 'henson'
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    print("predicted_index")
    print(predicted_index)
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print("predicted_token")
    print(predicted_token)
    # assert predicted_token == 'henson'
    return
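For reference, the tokens_tensor and segments_tensors arguments follow the classic pytorch_pretrained_bert README demo, which is also where masked_index = 8 and the expected answer 'henson' come from; a sketch of how they can be built:

# Input preparation for example_get_lm (two-segment README demo).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
tokenized_text[8] = '[MASK]'  # mask the second 'henson'
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * 7 + [1] * 7  # 14 tokens: 7 per segment
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])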
Example 7
 def __init__(self, use_gpu=False):
     self.tokenizer = BertTokenizer.from_pretrained(DEFAULT_BERT_WEIGHTS)
     self.model = BertForMaskedLM.from_pretrained(DEFAULT_BERT_WEIGHTS)
     self.model.eval()
     use_gpu = use_gpu and torch.cuda.is_available()
     self.device = torch.device("cuda" if use_gpu else "cpu")
     self.model.to(self.device)
Example 8
def predict():
    # Load pre-trained model with masked language model head
    bert_version = 'bert-large-uncased'
    model = BertForMaskedLM.from_pretrained(bert_version)

    # Preprocess text
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    tokenized_text = tokenizer.tokenize(text[idx])
    mask_positions = []
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Predict missing words from left to right
    model.eval()
    for mask_pos in mask_positions:
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])
        # Call BERT to predict token at this position
        predictions = model(tokens_tensor)[0, mask_pos]
        predicted_index = torch.argmax(predictions).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        # Update text
        tokenized_text[mask_pos] = predicted_token

    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"

    madlib = (' '.join(tokenized_text).replace(' ##', ''))
    bottom = Text(madlibsframe, height=10, width=50, wrap=WORD)
    bottom.configure(font=("Times New Roman", 18, "bold"))
    bottom.insert(END, madlib)
    bottom.pack()
Example 9
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
	"""Generate syntactically similar sentences for each sentence in the dataset.
	For PaInv-Replace
	Returns dictionary of original sentence to list of generated sentences
	"""
	# Use nltk treebank tokenizer and detokenizer
	tokenizer = TreebankWordTokenizer()
	detokenizer = TreebankWordDetokenizer()

	# Stopwords from nltk
	stopWords = list(set(stopwords.words('english')))

	# File from which sentences are read
	file = open(dataset, "r")

	# when we use Bert
	berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
	bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
	bertmodel.eval()

	# num_of_perturb controls how many perturbations to make per word
	dic = {}
	num_sent = 0
	for line in file:
		s_list = line.split("\n")
		source_sent = s_list[0]
		# Generating new sentences using BERT
		new_sents = perturb(source_sent, bertmodel, num_of_perturb)
		dic[line] = new_sents
		if new_sents != []:
			num_sent += 1
	file.close()
	return dic
Example 10
def mine_from_wikipedia(hardware):
    print('loading BERT...')
    bert = BertForMaskedLM.from_pretrained(bert_model)
    print('loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)

    knowledge_miners = {
        'concat':
        KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                       DirectTemplate, bert),
        'template':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       PredefinedTemplate,
                       bert,
                       grammar=False,
                       template_loc=template_repo + single_templates),
        'template_grammar':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       PredefinedTemplate,
                       bert,
                       grammar=True,
                       template_loc=template_repo + single_templates),
        'coherency':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       EnumeratedTemplate,
                       bert,
                       language_model=gpt,
                       template_loc=template_repo + multiple_templates)
    }

    for template_type in knowledge_miners.keys():
        run_experiment(template_type, knowledge_miners)
Example 11
    def __init__(self, model_path, tokenizer_path):
        super(Bert, self).__init__()
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)
Example 12
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    base_model=None,
                                    base_tokenizer=None,
                                    device="cuda",
                                    chinese=False):
    # Load pre-trained model (weights)
    if base_model is None:
        # Download from huggingface
        if chinese:
            base_model = "bert-base-chinese"
        else:
            base_model = "bert-base-uncased"
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(base_model)
        # Load pre-trained model tokenizer (vocabulary)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(base_model)
    elif model_type == "BertForMaskedLM":
        model = BertForMaskedLM.from_pretrained(base_model)
    else:
        print("[Error]: unsupported model type")
        return None, None

    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local vocab file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
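A usage sketch for the loader above; the CPU device overrides the "cuda" default so the example runs without a GPU:

# Hypothetical call: masked-LM head plus its matching tokenizer.
model, tokenizer = load_pretrained_model_tokenizer(
    model_type="BertForMaskedLM", device="cpu")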
Example 13
def predict_missing_word(sentence):
    tokenized_text = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)

    masked_index = tokenized_text.index('[MASK]')

    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    return predicted_token
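predict_missing_word reads a module-level tokenizer that is not shown in the snippet; a minimal sketch that supplies it (the sentence is illustrative only):

# Hypothetical setup and call; the input needs at least one [MASK],
# and only the first masked position is predicted.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(predict_missing_word('[CLS] The cat sat on the [MASK] . [SEP]'))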
Example 14
 def __init__(self, top_k, bert_name):
     self.do_lower_case = "uncased" in bert_name
     self.top_k = top_k
     self.tokenizer = BertTokenizer.from_pretrained(
         bert_name, do_lower_case=self.do_lower_case)
     self.model = BertForMaskedLM.from_pretrained(bert_name)
     self.model.eval()
Example 15
 def __init__(self, segment_size, output_size, dropout):
     super(BertPunc, self).__init__()
     self.bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
     self.bert_vocab_size = 30522  # vocabulary size of bert-base-uncased
     self.bn = nn.BatchNorm1d(segment_size * self.bert_vocab_size)
     self.fc = nn.Linear(segment_size * self.bert_vocab_size, output_size)
     self.dropout = nn.Dropout(dropout)
Example 16
    def __init__(self, train_data=None, dev_data=None):
        super().__init__(train_data, dev_data)

        model_name = 'bert-base-uncased'
        self.bert = BertForMaskedLM.from_pretrained(model_name)
        self.bert.to('cuda')
        self.tokenizer = tokenization.BertTokenizer.from_pretrained(model_name)
        self.bert.eval()
Example 17
 def init(self, model_type, model_dir):
     self.model_type = model_type
     self.model_dir = model_dir
     self.tokenizer = BertTokenizer.from_pretrained(model_type, cache_dir=model_dir)
     self.model = BertForMaskedLM.from_pretrained(model_type, cache_dir=model_dir)
     self.model.eval()
     if self.gpu:
         self.model.to("cuda")
Example 18
 def initialize_bert_corrector(self):
     t1 = time.time()
     self.bert_tokenizer = BertTokenizer(self.bert_model_vocab)
     # Prepare model
     self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
     print("Loaded model: %s, vocab file: %s, spend: %.3f s." %
           (self.bert_model_dir, self.bert_model_vocab, time.time() - t1))
     self.initialized_bert_corrector = True
Example 19
 def __init__(self):
     self.use_cuda = torch.cuda.is_available()
     self.device = torch.device("cuda" if self.use_cuda else "cpu")
     self.bertmodel = 'bert-large-uncased'
     self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
     self.model = BertForMaskedLM.from_pretrained(self.bertmodel).to(
         self.device)
     self.model.eval()
Example 20
def load_model(modeldir):
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(modeldir)
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained(modeldir)
    model.eval()
    model.to('cuda')
    return model, tokenizer
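Note the unconditional model.to('cuda'): this loader assumes a GPU. A call might look like the following, with the model directory as a placeholder:

# Hypothetical usage: modeldir can be a model name or a local checkpoint path.
model, tokenizer = load_model('bert-base-uncased')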
Example 21
def load_bert():
    global bert_tok, bert
    if bert is None:
        bert_model_str = os.getenv(
            'BERT_MODEL', default='bert-base-uncased'
        )  # 'bert-base-uncased', 'bert-base-multilingual-uncased'
        bert_tok = BertTokenizer.from_pretrained(bert_model_str)
        bert = BertForMaskedLM.from_pretrained(bert_model_str)
        bert.eval()
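The lazy-loading pattern above only works if both globals start out as None; a plausible module-level setup (names taken from the snippet):

# Assumed module-level state for the lazy loader above.
bert_tok = None
bert = None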
Example 22
def load_model(device):
    global model, modelp
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()
    model.to(device)

    modelp = BertForMaskedLM.from_pretrained('bert-base-uncased')
    modelp.eval()
    modelp.to(device)
Example 23
    def __init__(self, weight_name='bert-base-uncased'):
        self.tokenizer = BertTokenizerFast.from_pretrained(weight_name,
                                                           do_lower_case=True)
        self.model = BertForMaskedLM.from_pretrained(weight_name)
        self.loss_fct = torch.nn.CrossEntropyLoss()

        self.device = self.get_device()
        self.model = self.model.to(self.device)
        self.model.eval()
Example 24
 def __init__(self,
              model_name: str = 'bert-base-uncased',
              do_lower_case: bool = True):
     # Load pre-trained model tokenizer (vocabulary)
     self.tokenizer = BertTokenizer.from_pretrained(
         model_name, do_lower_case=do_lower_case)
     # Load pre-trained model (weights)
     self.model = BertForMaskedLM.from_pretrained(model_name)
     self.model.eval()
Example 25
def loadBERT():
  global tokenizer
  global model
  print("Loading BERT")
  # Load pre-trained model tokenizer (vocabulary)
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  # Load pre-trained model (weights)
  model = BertForMaskedLM.from_pretrained('bert-base-uncased')
  model.eval()
  print("Done")
Example 26
 def initialize_bert_corrector(self):
     t1 = time.time()
     self.bert_tokenizer = BertTokenizer(self.bert_model_vocab)
     self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids([MASK_TOKEN])[0]
     # Prepare model
     self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
     logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                  (self.bert_model_dir, time.time() - t1))
     self.initialized_bert_corrector = True
Example 27
def get_model_and_tokenizer_bert(model_name):
    bert = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = tokenization.BertTokenizer.from_pretrained(model_name)
    bert.eval()
    if torch.cuda.is_available():
        device = 'cuda'
        print("Using GPU!")
    else:
        device = "cpu"
        print("GPU not available.")
    bert.to(device)
    return bert, tokenizer
Example 28
    def __init__(self):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.processor = SentProcessor()
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load pre-trained model (weights)
        self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        self.model.to(self.device)
        self.model.eval()
        self.mask_token = '[MASK]'
Example 29
 def __init__(self, model_name_or_path="bert-base-cased"):
     super(BERTLM, self).__init__()
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
     self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path,
                                                    do_lower_case=False)
     self.model = BertForMaskedLM.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     # BERT-specific symbols
     self.mask_tok = self.tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
     self.pad = self.tokenizer.convert_tokens_to_ids(["[PAD]"])[0]
     print("Loaded BERT model!")
Example 30
def evaluate_tokens(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    id_mask = [i for i, e in enumerate(tokenized_text) if e == '[MASK]']
    # Create the segments tensors.
    id_segments = [0] * len(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([id_segments])
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()
    return id_mask, model, segments_tensor, tokenizer, tokens_tensor
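A sketch of consuming the tuple returned by evaluate_tokens, mirroring the prediction loop used in the other examples (the sentence is illustrative only):

# Hypothetical follow-up: score each masked position found by evaluate_tokens.
id_mask, model, segments_tensor, tokenizer, tokens_tensor = evaluate_tokens(
    '[CLS] The capital of France is [MASK] . [SEP]')
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensor)
for i in id_mask:
    predicted_index = torch.argmax(predictions[0, i]).item()
    print(tokenizer.convert_ids_to_tokens([predicted_index])[0])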
model = BertModel.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()

## Predict hidden states features for each layer
print(tokens_tensor.shape)  # torch.Size([1, 14])
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
## We have hidden states for each of the 24 layers of bert-large-uncased
print(len(encoded_layers))  # 24
print(encoded_layers[0].shape)  # torch.Size([1, 14, 1024])
x = torch.LongTensor([[1, 2], [3, 4]]); print(x.shape)  # torch.Size([2, 2])

##################################################################
## BertForMaskedLM
model = BertForMaskedLM.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()

## Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
print(predictions.shape)  # torch.Size([1, 14, 30522])

## confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item(); print(predicted_index)  # 27227
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)  # ['henson']
