Example 1
    def __init__(self):
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        self.bertmodel = 'bert-large-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
        # TFBertForMaskedLM is a TensorFlow (Keras) model: it has no PyTorch-style
        # .to() or .eval() methods, and TensorFlow handles GPU placement automatically.
        self.model = TFBertForMaskedLM.from_pretrained(self.bertmodel)
Example 2
    def __init__(self, index_type="mlm", model_path="bert-base-uncased", **kwargs):
        Expander.__init__(self, index_type)

        self.candidate_pos = ["NOUN", "ADJ", "ADV"]
        self.model_path = model_path

        allowed_keys = list(self.__dict__.keys())
        self.__dict__.update((k, v)
                             for k, v in kwargs.items() if k in allowed_keys)
        rejected_keys = set(kwargs.keys()) - set(allowed_keys)
        if rejected_keys:
            raise ValueError(
                "Invalid arguments in {} constructor: {}".format(
                    type(self).__name__, rejected_keys))

        logger.info(
            ">> loading HF model for Query Expansion from " + model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path, use_fast=True)
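        # from_pt=True loads the PyTorch checkpoint and converts its weights to
        # TensorFlow at load time (this requires torch to be installed).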
        self.model = TFBertForMaskedLM.from_pretrained(
            self.model_path, from_pt=True)
        logger.info(">> Loading Spacy NLP model ")

        try:
            self.nlp = spacy.load('en_core_web_md')
        except OSError:
            logger.info(
                "Downloading language model for the spaCy POS tagger (don't worry, this will only happen once)")
            from spacy.cli import download
            download('en_core_web_md')
            self.nlp = spacy.load('en_core_web_md')
Example 3
    def _init_model(self):
        """
        Initializes model.
        """
        if self.model_dir:
            logger.status_update("Loading BERT model at {}...".format(
                self.model_dir))
            self.model = TFBertForMaskedLM.from_pretrained(self.model_dir,
                                                           from_pt=True,
                                                           config=self.config)
        elif self.model_name:
            logger.status_update("Loading BERT model {}...".format(
                self.model_name))
            self.model = TFBertForMaskedLM.from_pretrained(self.model_name,
                                                           config=self.config)
        return self.model
Example 4
def main(raw_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        required=True,
                        help="model name e.g. xlnet-tiny-chinese")
    parser.add_argument("--cache_dir",
                        type=str,
                        default=None,
                        required=False,
                        help="Directory containing pytorch model")
    parser.add_argument("--pytorch_model_path",
                        type=str,
                        required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir",
                        type=str,
                        required=True,
                        help="Directory in which to save tensorflow model")
    args = parser.parse_args(raw_args)

    # Load the PyTorch model in TensorFlow. Note that --cache_dir, although declared
    # optional, must point at the directory holding the PyTorch checkpoint;
    # --model_name and --pytorch_model_path are parsed but not used here.
    tf_model = TFBertForMaskedLM.from_pretrained(args.cache_dir, from_pt=True)

    # Save the TensorFlow model
    tf_model.save_pretrained(args.tf_cache_dir)
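A minimal invocation sketch (not part of the original example; the paths are hypothetical placeholders):

if __name__ == '__main__':
    main([
        "--model_name", "bert-base-uncased",
        "--cache_dir", "./pytorch_bert",  # hypothetical dir with pytorch_model.bin + config.json
        "--pytorch_model_path", "./pytorch_bert/pytorch_model.bin",
        "--tf_cache_dir", "./tf_bert",
    ])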
Example 5
    def _build_masked_lm_from_huggingface(self):
        from transformers import TFBertForMaskedLM

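        # "bert_uncased_L-6_H-768_A-12" is a smaller pre-trained BERT variant (6 layers,
        # hidden size 768, 12 attention heads); PYTORCH_MODEL_PATH must point at a directory
        # containing its PyTorch weights, which from_pt=True converts to TensorFlow.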
        model = TFBertForMaskedLM.from_pretrained(
            os.path.join(os.environ["PYTORCH_MODEL_PATH"], "bert_uncased_L-6_H-768_A-12-pytorch"),
            from_pt=True,
        )
        return model
Example 6
    def test_TFBertForMaskedLM(self):
        from transformers import BertTokenizer, TFBertForMaskedLM
        pretrained_weights = 'bert-base-uncased'
        tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
        text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
        model = TFBertForMaskedLM.from_pretrained(pretrained_weights)
        predictions = model.predict(inputs)
        onnx_model = keras2onnx.convert_keras(model, model.name)
        self.assertTrue(
            run_onnx_runtime(onnx_model.graph.name,
                             onnx_model,
                             inputs_onnx,
                             predictions,
                             self.model_files,
                             rtol=1.e-2,
                             atol=1.e-4))
Example 7
def language_decoder(args):
    lang_model = TFBertForMaskedLM.from_pretrained(
        args.model_name, cache_dir='/scratch/gpfs/zzada/cache-tf')
    d_size = lang_model.config.hidden_size
    v_size = lang_model.config.vocab_size

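    # lang_model.mlm is the masked-LM prediction head of TFBertForMaskedLM; it maps
    # hidden states of size d_size to vocabulary logits of size v_size and is frozen
    # below so it can be reused as a fixed decoder.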
    lang_decoder = lang_model.mlm
    lang_decoder.trainable = False

    inputs = Input((d_size, ))
    x = Reshape((1, d_size))(inputs)
    x = lang_decoder(x)
    x = Reshape((v_size, ))(x)
    # x = Lambda(lambda z: tf.gather(z, vocab_indices, axis=-1))(x)
    x = Activation('softmax')(x)
    lm_decoder = Model(inputs=inputs, outputs=x)
    lm_decoder.summary()
    return lm_decoder
Example 8
def main(args):
    root = os.environ.get('BASE_DIR')
    tmp = root + "/models/tmp/" + args.model
    savepath = root + "/models/" + args.model

    for folder in [tmp, savepath]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # download transformers model and save in tmp folder
    model = BertForMaskedLM.from_pretrained(args.model)
    model.save_pretrained(tmp)

    # Load the PyTorch model in TensorFlow
    tf_model = TFBertForMaskedLM.from_pretrained(tmp, from_pt=True)

    # Save the TensorFlow model
    tf.saved_model.save(tf_model, savepath)

    # Download needed files
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/" + args.model + "/"
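    # Note: this is the legacy Hugging Face S3 layout; on the current Hub the same files
    # are served from https://huggingface.co/<model-id>/resolve/main/ instead.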
    wget.download(url + 'config.json', savepath)
    wget.download(url + 'vocab.txt', savepath)

    #rename files
    os.rename(savepath + "/config.json", savepath + "/bert_config.json")
    os.rename(savepath + "/variables/variables.data-00000-of-00001",
              savepath + "/bert_model.ckpt.data-00000-of-00001")
    os.rename(savepath + "/variables/variables.index",
              savepath + "/bert_model.ckpt.index")

    #remove useless stuff
    os.rmdir(savepath + "/assets")
    os.rmdir(savepath + "/variables")
    os.remove(savepath + "/saved_model.pb")
    shutil.rmtree(root + "/models/tmp")
Example 9
def load_model(model_path):
    return (
        TFBertForMaskedLM.from_pretrained(model_path),
        BertTokenizer.from_pretrained(model_path),
    )
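A minimal usage sketch (an assumption, not part of the original), assuming tensorflow and the transformers imports used above are available:

import tensorflow as tf

model, tokenizer = load_model("bert-base-uncased")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
logits = model(**inputs).logits  # shape (1, seq_len, vocab_size)
# locate the [MASK] position and decode the highest-scoring token for it
mask_pos = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0][0])
print(tokenizer.decode([int(tf.argmax(logits[0][mask_pos]))]))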
Example 10
def get_custom_MLMmodel(num_tokens):
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.resize_token_embeddings(num_tokens)
    model.save_pretrained('tmp/CustomModel/')
    return TFBertForMaskedLM.from_pretrained('tmp/CustomModel', from_pt=True)
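A usage sketch (an assumption, not part of the original): num_tokens is typically the tokenizer length after extra tokens have been added, e.g.:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # assumes BertTokenizer is imported
tokenizer.add_tokens(['[NEWTOK]'])  # hypothetical extra token
model_tf = get_custom_MLMmodel(len(tokenizer))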
Example 11
def download_bert_model():
    return TFBertForMaskedLM.from_pretrained("bert-base-cased")
Example 12
def sentence_generator():
    model_path = "./data/bert-large-cased-whole-word-masking"
    model = TFBertForMaskedLM.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    return RhymeGenerator(model, tokenizer)
Example 13

if __name__ == '__main__':
    model_path = "../tcdata/bert/"
    tokenizer = BertTokenizer.from_pretrained("../tcdata/bert/vocab.txt")
    model_config = BertConfig.from_pretrained("../tcdata/bert/config.json")
    # model_config.output_attentions = False
    # model_config.output_hidden_states = False
    # model_config.use_cache = True
    # #
    # bert_model = TFBertModel.from_pretrained(pretrained_model_name_or_path=model_path, from_pt=False,
    #                                          config=model_config, cache_dir="../user_data/temp")
    # model = TFBertForMaskedLM(config=model_config)
    # model.bert = bert_model
    # model.resize_token_embeddings(len(tokenizer))
    model = TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path, from_pt=False,
                                              config=model_config, cache_dir="../user_data/temp")
    model.resize_token_embeddings(len(tokenizer))
    #

    # inputs = tokenizer("中国的首都是[MASK]", return_tensors="tf")
    # inputs["labels"] = tokenizer("中国的首都是北京", return_tensors="tf")["input_ids"]
    inputs = tokenizer.encode("中国的首都是[MASK]", return_tensors="tf")
    # print(tokenizer.tokenize("中国的首都是[MASK]"))
    outputs = model(inputs)
    # print(outputs)
    # exit(0)
    o1 = tf.argmax(outputs.logits[0], axis=1)
    print(o1)
    print(tokenizer.decode(o1))
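    # A minimal sketch (an assumption, not part of the original snippet): decode only
    # the [MASK] position instead of every token in the sequence.
    mask_pos = tf.where(inputs[0] == tokenizer.mask_token_id)[:, 0]
    mask_logits = tf.gather(outputs.logits[0], mask_pos)
    print(tokenizer.decode(tf.argmax(mask_logits, axis=-1).numpy().tolist()))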

Example 14
import tensorflow as tf
import numpy as np
import jsonpickle
from tensorflow.python.keras.layers import Softmax
from transformers import BertTokenizer, TFBertForMaskedLM

from src.spacy_utils import LANG_MODEL, PatternNotFoundException
from src.semantic_sequence import SemanticSequence, MASK
from src.qualia_structure import CreationStrategy, QualiaElement, DebugQualiaStructure, Role

from spacy.lang.en.stop_words import STOP_WORDS

NAME_OF_MODEL = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(NAME_OF_MODEL)
MODEL = TFBertForMaskedLM.from_pretrained(NAME_OF_MODEL, return_dict=True)


class NumpyFloatHandler(jsonpickle.handlers.BaseHandler):
    '''
    Handler to convert numpy floats to string. Otherwise would be printed
    as None.
    '''
    def flatten(self, obj, data):
        return str(obj)


# np.float was only an alias for the built-in float and has been removed from NumPy,
# so the handler is registered for float directly.
jsonpickle.handlers.registry.register(float, NumpyFloatHandler)
jsonpickle.handlers.registry.register(np.float32, NumpyFloatHandler)
jsonpickle.handlers.registry.register(np.float64, NumpyFloatHandler)
Example 15

import tensorflow as tf
import numpy as np

for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

from transformers import BertTokenizer, TFBertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

model = TFBertForMaskedLM.from_pretrained('bert-base-cased', return_dict=True)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")

outputs = model(inputs)
logits = outputs.logits

output = np.argmax(logits[0][6])
o1 = tokenizer.decode(int(output))

inputs = tokenizer("The capital of [MASK] is BeiJing.", return_tensors="tf")

outputs = model(inputs)
logits = outputs.logits

output = np.argmax(logits[0][4])
o2 = tokenizer.decode(int(output))
Example 16
import json
import tensorflow as tf
from transformers import TFBertForMaskedLM, BertTokenizer
from utils import tokenize_and_label, train_test_split
from sklearn.metrics import classification_report

lm_model = TFBertForMaskedLM.from_pretrained(
    '../SavedModels/DiBERT')  # For fine tuned bert
#lm_model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')  For pretrained bert
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(["[STARTQ]", "[ENDQ]", "[URL]"])
lm_model.resize_token_embeddings(len(tokenizer))


class TaskModel(tf.keras.models.Model):
    def __init__(self, trained_lm_model, num_classes=5):
        super(TaskModel, self).__init__()
        self.encoder = trained_lm_model.layers[0]
        self.prediction = tf.keras.layers.Dense(num_classes,
                                                activation='softmax')

    def call(self, inputs):
        encoded_seq, _ = self.encoder(inputs)
        return self.prediction(encoded_seq)


task_model = TaskModel(lm_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)

with open('../Data/annotated_threads.json', 'r') as f:
    data = json.load(f)
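A minimal sketch of how the task model defined above might be compiled (an assumption; the original training code is not shown):

task_model.compile(optimizer=optimizer,
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])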