def text_generation(PADDING_TEXT, prompt):
    from transformers import TFAutoModelWithLMHead, AutoTokenizer
    model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
    # PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
    # (except for Alexei and Maria) are discovered.
    # The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    # remainder of the story. 1883 Western Siberia,
    # a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    # Rasputin has a vision and denounces one of the men as a horse thief. Although his
    # father initially slaps him for making such an accusation, Rasputin watches as the
    # man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    # the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
    # with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
    # prompt = "Today the weather is really nice and I am planning on "
    inputs = tokenizer.encode(PADDING_TEXT + prompt,
                              add_special_tokens=False,
                              return_tensors="tf")
    prompt_length = len(
        tokenizer.decode(inputs[0],
                         skip_special_tokens=True,
                         clean_up_tokenization_spaces=True))
    outputs = model.generate(inputs,
                             max_length=250,
                             do_sample=True,
                             top_p=0.95,
                             top_k=60)
    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
    print(generated)
    return
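
# A minimal usage sketch (not part of the original example); the padding text
# can be any few sentences ending with XLNet's special tokens, like the
# commented PADDING_TEXT above:
if __name__ == "__main__":
    padding = ("In 1991, the remains of Russian Tsar Nicholas II and his "
               "family (except for Alexei and Maria) are discovered. "
               "<eod> </s> <eos>")
    text_generation(padding, "Today the weather is really nice and I am planning on ")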
Example #2
    def test_lmhead_model_from_pretrained(self):
        for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelWithLMHead.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForMaskedLM)
    def test_lmhead_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelWithLMHead.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForMaskedLM)
    def test_from_pretrained_identifier(self):
        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER,
                                                      from_pt=True)
        self.assertIsInstance(model, TFBertForMaskedLM)
        self.assertEqual(model.num_parameters(), 14830)
        self.assertEqual(model.num_parameters(only_trainable=True), 14830)

        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER,
                                                    from_tf=True)
        self.assertIsInstance(model, BertForMaskedLM)
        self.assertEqual(model.num_parameters(), 14410)
        self.assertEqual(model.num_parameters(only_trainable=True), 14410)
    def test_from_identifier_from_model_type(self):
        model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER,
                                                      from_pt=True)
        self.assertIsInstance(model, TFRobertaForMaskedLM)
        self.assertEqual(model.num_parameters(), 14830)
        self.assertEqual(model.num_parameters(only_trainable=True), 14830)

        model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER,
                                                    from_tf=True)
        self.assertIsInstance(model, RobertaForMaskedLM)
        self.assertEqual(model.num_parameters(), 14410)
        self.assertEqual(model.num_parameters(only_trainable=True), 14410)
def masked_lang_fill(sequence, no_of_version):
    from transformers import TFAutoModelWithLMHead, AutoTokenizer
    import tensorflow as tf
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")

    input_ids = tokenizer.encode(sequence, return_tensors="tf")
    # position of the [MASK] token in the encoded sequence
    mask_token_index = tf.where(input_ids == tokenizer.mask_token_id)[0, 1]
    token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]
    # take the top `no_of_version` candidate tokens for the masked position
    top_tokens = tf.math.top_k(mask_token_logits,
                               no_of_version).indices.numpy()

    for token in top_tokens:
        print(sequence.replace(tokenizer.mask_token,
                               tokenizer.decode([token])))
    return
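
# A minimal usage sketch (assumption: DistilBERT's mask token is the literal
# string "[MASK]", so it can be written directly into the sequence):
if __name__ == "__main__":
    masked_lang_fill("The capital of France is [MASK].", 5)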
def Summarization(ARTICLE):
    from transformers import TFAutoModelWithLMHead, AutoTokenizer
    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
    inputs = tokenizer.encode("summarize: " + ARTICLE,
                              return_tensors="tf",
                              max_length=512,
                              truncation=True)
    summary_ids = model.generate(inputs,
                                 max_length=150,
                                 min_length=40,
                                 length_penalty=2.0,
                                 num_beams=4,
                                 early_stopping=True)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # print(output)
    return output
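
# A minimal usage sketch with a placeholder article (any long passage works;
# T5 truncates the input to 512 tokens as noted above):
if __name__ == "__main__":
    sample_article = "Scientists announced a new discovery today ..."
    print(Summarization(sample_article))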
def run_generation(args):
    if args.lib == 'pt':
        model = AutoModelWithLMHead.from_pretrained(args.model)
    elif args.lib == 'tf':
        model = TFAutoModelWithLMHead.from_pretrained(args.model)
    else:
        raise ValueError("'{}' is not a supported lib; use 'pt' or 'tf'".format(args.lib))

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    if args.input:
        input_text = args.input
    else:
        input_text = TEXT

    print("Text: {}".format(input_text))
    tokenized_input_words = tokenizer.encode(input_text,
                                             add_special_tokens=False,
                                             return_tensors=args.lib)
    print("BOS: {}".format(tokenizer.bos_token_id))
    print("PAD: {}".format(tokenizer.pad_token_id))
    print("EOS: {}".format(tokenizer.eos_token_id))
    print("Input tokens: {}".format(tokenized_input_words))
    torch.manual_seed(0)
    generated_tokens = model.generate(tokenized_input_words,
                                      bos_token_id=tokenizer.bos_token_id,
                                      eos_token_ids=tokenizer.eos_token_id,
                                      pad_token_id=tokenizer.pad_token_id,
                                      do_sample=False,
                                      no_repeat_ngram_size=2,
                                      max_length=40,
                                      num_beams=5,
                                      early_stopping=True)
    print("Output tokens: {}".format(generated_tokens))

    generated_words = tokenizer.decode(generated_tokens[0],
                                       skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True)
    print("Output text: {}".format(generated_words))
    def test_from_pretrained_identifier(self):
        logging.basicConfig(level=logging.INFO)
        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(model, TFBertForMaskedLM)
        self.assertEqual(model.num_parameters(), 14830)
        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
Example #11
    check_output_path(args.output_path, force=True)

    tokenizer = AutoTokenizer.from_pretrained(args.model_select)
    dataset = load_dataset(*args.dataset_name.split(", "))
    # num_proc=6 can ideally give a ~6x speedup over a single process, which matters when tokenizing many examples;
    # this is the main reason for using HF's datasets instead of torch.Dataset
    encoded = dataset.map(convert_to_features, batched=True, fn_kwargs={"args": args, "tokenizer": tokenizer}, num_proc=6)
    columns = ['input_ids', "source_lengths", "target_lengths", 'attention_mask', 'labels', 'decoder_attention_mask']
    encoded.set_format(type='tensorflow', columns=columns)

    if args.do_train:
        add_filehandler_for_logger(args.output_path, logger, out_name="train")
        strategy = get_strategy(args, logger)
        with strategy.scope():
            # from_pt=True to avoid repeatedly downloading the PyTorch checkpoint
            model = TFAutoModelWithLMHead.from_pretrained(args.model_select, from_pt=True)
            train_dataset = get_dataset(encoded["train"], tag="train")
            val_dataset = None
            if "validation" in encoded:
                val_dataset = get_dataset(encoded["validation"], tag="eval")
            trainer = T2TTrainer(args, logger)
            trainer.train(model, strategy, tokenizer, train_dataset=train_dataset, eval_dataset=val_dataset, evaluate_fn=evaluate, verbose=True)

    # we want testing to be as independent of training as possible,
    # so it is okay to run the test when args.do_train is False and checkpoints already exist
    if args.do_test:
        test_set = "test"
        if test_set in encoded:
            add_filehandler_for_logger(args.output_path, logger, out_name="test")
            sorted_indices, index2path = get_existing_cks(args.output_path, return_best_ck=False)
            if args.ck_index_select < 0:
Example #12
def create_mlm_model_and_optimizer():
    with strategy.scope():
        model = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
        optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    return model, optimizer
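
# This helper relies on module-level globals; a minimal sketch of plausible
# values (the names are kept from the snippet, the values are assumptions):
# strategy = tf.distribute.get_strategy()
# PRETRAINED_MODEL = "distilbert-base-cased"
# LR = 5e-5
# model, optimizer = create_mlm_model_and_optimizer()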
sentiment_analysis_model = pipeline('sentiment-analysis')
text_generation_model = pipeline('text-generation')


def sentiment_analysis(text: str):
    result = sentiment_analysis_model(text)
    return result


def text_generation(text: str):
    result = text_generation_model(text, max_length=80, do_sample=False)
    return result
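
# Illustrative calls (assumption: both pipelines return a list of dicts):
# sentiment_analysis("I really enjoyed this movie.")  # e.g. [{'label': 'POSITIVE', 'score': ...}]
# text_generation("The quick brown fox")              # greedy generation, up to 80 tokens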


t5_model = TFAutoModelWithLMHead.from_pretrained("t5-large", return_dict=True)
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")


def summarize(text: str):
    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
    inputs = t5_tokenizer.encode("summarize: " + text,
                                 return_tensors="tf",
                                 max_length=512,
                                 truncation=True)
    outputs = t5_model.generate(inputs,
                                max_length=150,
                                min_length=40,
                                length_penalty=2.0,
                                num_beams=4,
                                early_stopping=True)
    result = t5_tokenizer.decode(
Example #14
# reload the PyTorch model and re-save it in TensorFlow formats (tf_model.h5 and a SavedModel)
# create folders for the converted models

import os
os.makedirs("./model/pb_model", exist_ok=True)
os.makedirs("./model/tf_model/keras", exist_ok=True)

# using the Hugging Face model conversion described here:
# https://huggingface.co/transformers/model_sharing.html

from transformers import AutoTokenizer, TFAutoModelWithLMHead
import tensorflow as tf

# load pytorch_model.bin and related model structures, convert to h5
tf_model = TFAutoModelWithLMHead.from_pretrained("./model/trained_model/",
                                                 from_pt=True)
# and save converted tf_model.h5 in "tf_model"
tf_model.save_pretrained("./model/tf_model/")
# and save "saved_model.pb" in "pb_model"
tf_model.save("./model/pb_model/")

tf.saved_model.save(tf_model, "./model/tf_model/keras")

# the converted h5 model loads fine with TFAutoModelWithLMHead; the Keras SavedModel export is loaded via tf.saved_model.load
loaded = tf.saved_model.load("./model/tf_model/keras")
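
# For comparison, a sketch of reloading the converted h5 weights directly
# (not part of the original snippet):
# reloaded = TFAutoModelWithLMHead.from_pretrained("./model/tf_model/")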

tokenizer = AutoTokenizer.from_pretrained("anonymous-german-nlp/german-gpt2")

prompt = "Ada liebte ihre Katze"
inputs = tokenizer.encode(prompt,
                          add_special_tokens=False,
    def test_from_identifier_from_model_type(self):
        logging.basicConfig(level=logging.INFO)
        model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
        self.assertIsInstance(model, TFRobertaForMaskedLM)
        self.assertEqual(model.num_parameters(), 14830)
        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
Example #16
import configparser
import logging
import re
from transformers import AutoTokenizer, TFAutoModelWithLMHead
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.acs_exception.exceptions import ClientException
from aliyunsdkcore.acs_exception.exceptions import ServerException
from aliyunsdkalimt.request.v20181012.TranslateGeneralRequest import TranslateGeneralRequest

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info("Loading model ...")
tokenizer = AutoTokenizer.from_pretrained(
    "/home/admin/workspace/model/transformers/bert-base-multilingual-cased")
model = TFAutoModelWithLMHead.from_pretrained(
    "/home/admin/workspace/model/transformers/bert-base-multilingual-cased")

config = configparser.ConfigParser()
config.read("/home/admin/workspace/.secret")

client = AcsClient(config["account xjx"]["access_key"],
                   config["account xjx"]["access_secret"], 'cn-hangzhou')


def cut_sentences(text, min_len=3):
    """
    Cut sentences by their length and punctuation, remove all spaces.
    """
    text = text.replace(" ", "")
    corpus = re.split(r"[\,\.\?,。?\n]", text)
    corpus = list(filter(lambda x: len(x) >= min_len, corpus))
Example #17
    reader = FARMReader(model_name_or_path=os.path.join(
        dir_path, 'SaveAlbert'),
                        use_gpu=False,
                        num_processes=1)
elif selector == '2':
    reader = FARMReader(model_name_or_path=os.path.join(dir_path, 'SaveBERT'),
                        use_gpu=False,
                        num_processes=1)
elif selector == '3':
    reader = FARMReader(model_name_or_path="./SaveXLNet",
                        use_gpu=False,
                        num_processes=1)

nlp = en_coref_md.load()
model_sum = TFAutoModelWithLMHead.from_pretrained(os.path.join(
    dir_path, 'summarizer'),
                                                  return_dict=True)
tokenizer_sum = AutoTokenizer.from_pretrained(
    os.path.join(dir_path, 'summarizer'))
classifier = pipeline("zero-shot-classification",
                      model=os.path.join(dir_path, 'zero-shot-classifier'))
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="rethinkl_test1")
retriever = DensePassageRetriever(document_store=document_store)
tokenizer_converse = AutoTokenizer.from_pretrained(
    os.path.join(dir_path, 'DialoGPT-large'))
model_converse = AutoModelForCausalLM.from_pretrained(
    os.path.join(dir_path, 'DialoGPT-large'))
finder = Finder(reader, retriever)