def tell_story(event: LambdaDict, context: LambdaDict) -> LambdaDict:
    """
    Take a string input and use a text-generation model to create a story
    that can serve as a natural-language response to player input

    :param event: Input AWS Lambda event dict
    :param context: Input AWS Lambda context dict

    :return: Output AWS Lambda dict
    """
    # Decode the request
    request_body = event.get("body")
    if isinstance(request_body, str):
        request_body = json.loads(request_body)

    story_context = request_body["context"]
    print(story_context)

    # Load model
    """
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
    model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")
    
    tokenizer = AutoTokenizer.from_pretrained("sshleifer/bart-tiny-random")
    model = AutoModelWithLMHead.from_pretrained("sshleifer/bart-tiny-random")
    """
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelWithLMHead.from_pretrained("distilgpt2")

    # Compute story
    # encode the new user input, add the eos_token and return a PyTorch tensor
    story_input = tokenizer.encode(story_context + tokenizer.eos_token,
                                   return_tensors='pt')

    # generate a response with the default generation settings, padding with the EOS token
    generated_story = model.generate(story_input,
                                     pad_token_id=tokenizer.eos_token_id)

    # decode only the newly generated output tokens (skip the prompt)
    story_result = tokenizer.decode(generated_story[:,
                                                    story_input.shape[-1]:][0],
                                    skip_special_tokens=True)

    result = {
        "statusCode": 200,
        "body": story_result,
        "headers": {
            "Access-Control-Allow-Origin": "*"
        },
    }
    return result
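A minimal local-invocation sketch (not part of the original handler); the event below is an assumption mirroring an API Gateway proxy request whose body carries the "context" field that tell_story reads:

if __name__ == "__main__":
    # Hypothetical sample event; a real Lambda invocation supplies this.
    sample_event = {"body": json.dumps({"context": "You step into a torch-lit cavern."})}
    response = tell_story(sample_event, context={})
    print(response["statusCode"], response["body"])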
Example #2
    def __init__(self,
                 pretrained_model_path="facebook/bart-large-cnn",
                 max_length=1024):

        self.pretrained_model_path = pretrained_model_path
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)
        self.model = AutoModelWithLMHead.from_pretrained(pretrained_model_path)

        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None:
            self.model.config.update(
                task_specific_params.get("summarization", {}))
Example #3
def init_model(model_name: str, device, do_lower_case: bool = False):
    """
    Initialize a pre-trained LM
    :param model_name: from MODEL_CLASSES
    :param device: CUDA / CPU device
    :param do_lower_case: whether the model is lower cased or not
    :return: the model and tokenizer
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              do_lower_case=do_lower_case)
    model = AutoModelWithLMHead.from_pretrained(model_name)
    model.to(device)
    model.eval()
    return tokenizer, model
Example #4
    def __init__(self, hparams):
        super().__init__(hparams)

        self.tokenizer = AutoTokenizer.from_pretrained(
            "huggingface/CodeBERTa-small-v1", resume_download=True)
        # FIXME: using mask lm head in pretraining, but using automodel in finetuning.

        # self.lm_model = AutoModel.from_pretrained(
        #     "huggingface/CodeBERTa-small-v1", resume_download=True, config=hparams["roberta_config"])

        self.model = AutoModelWithLMHead.from_config(
            AutoConfig.from_pretrained("huggingface/CodeBERTa-small-v1",
                                       resume_download=True,
                                       config=hparams["roberta_config"]))
Example #5
    def __init__(self, segment_size, output_size, dropout, vocab_size):
        super(ALBertSmallDenseHiddenPunc, self).__init__()
        self.bert = AutoModelWithLMHead.from_pretrained(
            './models/albert_chinese_small/')
        # self.bert_vocab_size = vocab_size
        # self.bn = nn.BatchNorm1d(segment_size*self.bert_vocab_size)
        # self.fc = nn.Linear(segment_size*self.bert_vocab_size, output_size)
        self.albert = self.bert.albert
        self.dense = self.bert.predictions.dense
        # batch normalization
        self.bn = nn.BatchNorm1d(segment_size * 128)
        # NOTE rnn_hidden*2: use the BERT intermediate hidden_state (size 384)
        self.fc = nn.Linear(segment_size * 128, output_size)
        self.dropout = nn.Dropout(dropout)
Example #6
    def __init__(self, params):
        super(BertTagger, self).__init__()
        self.num_tag = params.num_tag
        self.hidden_dim = params.hidden_dim
        config = AutoConfig.from_pretrained(params.model_name)
        config.output_hidden_states = True
        # self.bert = BertModel.from_pretrained("bert-base-cased")
        self.model = AutoModelWithLMHead.from_pretrained(params.model_name, config=config)
        if params.ckpt != "":
            logger.info("Reloading model from %s" % params.ckpt)
            model_ckpt = torch.load(params.ckpt)
            self.model.load_state_dict(model_ckpt)

        self.linear = nn.Linear(self.hidden_dim, self.num_tag)
Example #7
def init_model(model_name: str,
               device: torch.device):
    """
    Initialize a pre-trained LM
    :param model_name: from MODEL_CLASSES
    :param device: CUDA / CPU device
    :return: the model and tokenizer
    """
    logger.info(f'Initializing {model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelWithLMHead.from_pretrained(model_name)
    model.to(device)
    model.eval()
    return model, tokenizer
Example #8
    def load_model(self):
        if 'longformer' in self.args.model_path:
            model = LongformerModel.from_pretrained(self.args.model_path)
            for layer in model.encoder.layer:
                layer.attention.self.attention_mode = self.args.attention_mode
                self.args.attention_window = 512 # layer.attention.self.attention_window
        elif self.args.model_path in ['bart.large', 'bart.base']:
            model = torch.hub.load('pytorch/fairseq', self.args.model_path)
            model.config = model.args
            model.config.hidden_size = model.config.decoder_output_dim
        elif 'bart' in self.args.model_path and 'base' in self.args.model_path:
            config = AutoConfig.from_pretrained(self.args.model_path)
            config.encoder_attention_heads = 12
            config.decoder_attention_heads = 12
            config.attention_dropout = 0.1
            if self.args.seq2seq:
                model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config)
            else:
                model = AutoModel.from_pretrained(self.args.model_path, config=config)
        elif 'bart' in self.args.model_path and 'large' in self.args.model_path:
            config = AutoConfig.from_pretrained(self.args.model_path)
            config.attention_dropout = 0.1
            config.gradient_checkpointing = True
            if self.args.seq2seq:
                model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config)
            else:
                model = AutoModel.from_pretrained(self.args.model_path, config=config)
        else:
            model = AutoModel.from_pretrained(self.args.model_path)

        print("Loaded model with config:")
        print(model.config)

        for p in model.parameters():
            p.requires_grad_(True)
        model.train()
        return model
Example #9
def _parallelize_summarizations(article_filenames: List[str]) -> None:
    model = AutoModelWithLMHead.from_pretrained("t5-base", return_dict=True)
    model.share_memory()
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    """
    # Multiprocessing #1
    # Hangs on Ubuntu machine for some reason (tried using torch.multiprocessing, moving outside of main method, using model and tokenizer as params, etc; nothing works)
    # For some reason, worked fine in 40-45 seconds on FB Mac
    # 102-105 seconds
    mp.set_start_method("spawn")
    with Pool(NUM_PROCESSES) as p:
        article_summaries = p.starmap(
            _summarize_news_article,
            [
                (model, tokenizer, article_filename)
                for article_filename in article_filenames
            ],
        )
        print(f"Article summaries: {article_summaries}")
    p.close()
    p.join()
    """

    # Multiprocessing #2
    # Issue #1: https://stackoverflow.com/questions/50168647/multiprocessing-causes-python-to-crash-and-gives-an-error-may-have-been-in-progr
    # 40-45 seconds
    # 90-105 seconds (more variance than multiprocessing #1, but the first approach should be better since it fixes the number of processes created)
    """
    procs = []
    for article_file in article_filenames:
        proc = Process(
            target=_summarize_news_article, args=(model, tokenizer, article_file,)
        )
        procs.append(proc)
        proc.start()

    for proc in procs:
        proc.join()
    """

    # List Comprehension
    # 90-95 seconds
    # ~220 seconds
    article_summaries = [
        _summarize_news_article(model, tokenizer, article_file)
        for article_file in article_filenames
    ]
    for article_summary in article_summaries:
        print(f"Article summary: {article_summary}")
Example #10
def main(politician, epochs):
    """
    High-level management of model training process.
    """
    train_path = f"..\\data\\{politician}\\training_data.txt"
    val_path = f"..\\data\\{politician}\\validation_data.txt"

    tokenizer = AutoTokenizer.from_pretrained(
        "anonymous-german-nlp/german-gpt2")

    special_tokens_dict = {
        'bos_token': '<BOS>',
        'eos_token': '<EOS>',
        'pad_token': '<PAD>',
        'additional_special_tokens': ['<EOQ>']
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    train_dataset, test_dataset, data_collator = load_dataset(
        train_path, val_path, tokenizer)

    model = AutoModelWithLMHead.from_pretrained(
        "anonymous-german-nlp/german-gpt2")
    model.resize_token_embeddings(len(tokenizer))

    training_args = TrainingArguments(
        output_dir=
        f".\\output-models\\gpt2-{politician}-{epochs}",  # output directory
        overwrite_output_dir=
        True,  # overwrite the content of the output directory
        num_train_epochs=epochs,  # number of training epochs
        per_device_train_batch_size=32,  # batch size for training
        per_device_eval_batch_size=64,  # batch size for evaluation
        eval_steps=400,  # Number of update steps between two evaluations.
        save_steps=800,  # after # steps model is saved
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        prediction_loss_only=True,
    )

    trainer.train()
    trainer.save_model()
Example #11
def train():
    dialogues = pd.read_csv('/content/TlkPersonaChatRus/dialogues.tsv',
                            sep='\t')
    for column in dialogues.columns:
        dialogues[column].replace(to_replace=r'<[a-zA-Z0-9_=\/ ]+>',
                                  value=' ',
                                  regex=True,
                                  inplace=True)
    dialogues['dialogue'].replace(
        to_replace=r'Пользователь [12]:|Привет|Здравствуйте|[!)?,]',
        value='',
        regex=True,
        inplace=True)
    dialogues['dialogue'].replace(to_replace=r'\s\s+',
                                  value=' ',
                                  regex=True,
                                  inplace=True)
    dialogues = dialogues['dialogue']
    dialogues.to_csv('./Datasets/dialogues')

    tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
    model = AutoModelWithLMHead.from_pretrained('distilgpt2')
    tokenizer.pad_token = tokenizer.eos_token

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=
        '/content/drive/MyDrive/semester-practice-3rd/Datasets/dialogues.txt',
        block_size=128,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)

    training_args = TrainingArguments(
        output_dir=
        '/content/drive/MyDrive/semester-practice-3rd/Models/distilgpt2',
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=8,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)

    trainer.train()
    trainer.save_model('model/gpt2_chat')
Example #12
def nlp_it(text, length):
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)

    inputs = tokenizer.encode("summarize: " + text,
                              return_tensors='pt',
                              max_length=512,
                              truncation=True)

    summary_ids = model.generate(inputs, max_length=length, min_length=80, length_penalty=5., num_beams=2)
    summary = tokenizer.decode(summary_ids[0])

    return summary
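An example call, assuming article.txt holds the text to summarize and 150 caps the summary length:

article_text = open('article.txt', encoding='utf-8').read()
print(nlp_it(article_text, 150))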
Example #13
def main():
    # Example: use pre-trained model from Huggingface
    tokenizer = AutoTokenizer.from_pretrained("MoseliMotsoehli/TswanaBert")
    model = AutoModelWithLMHead.from_pretrained("MoseliMotsoehli/TswanaBert")

    # For your own model: make sure it was trained with masking (the current RoBERTa model has no masking)
    #tokenizer = AutoTokenizer.from_pretrained("./tswana_models/output")
    #model = AutoModel.from_pretrained("./tswana_models/output", output_attentions=True)
    unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    text = 'Ka ponyo ya <mask> lefatshe la nona.'
    # Ka ponyo ya leitlho lefatshe la nona
    #text = 'Ke bereketse kwa <mask> lobaka lo lo leele.'
    #'Ke bereketse kwa Kanye lobaka lo lo leele'
    print(text, unmasker(text))
Example #14
    def test_lmhead_model_from_pretrained(self):
        for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelWithLMHead.from_pretrained(model_name,
                                                          from_pt=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForMaskedLM)

            model = AutoModelWithLMHead.from_pretrained(model_name,
                                                        from_tf=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForMaskedLM)
Example #15
    def __init__(self,
                 model_name="bert-base-cased-finetuned-mrpc",
                 task="SequenceClassification"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if task == "SC":
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name)
        elif task == "QA":
            self.model = AutoModelForQuestionAnswering.from_pretrained(
                model_name)
        elif task == "LM":
            self.model = AutoModelWithLMHead.from_pretrained(model_name)
        elif task == "TC":
            self.model = AutoModelForTokenClassification.from_pretrained(
                "dbmdz/bert-large-cased-finetuned-conll03-english")
Example #16
    def __init__(self, pretrained_model_name, config, skip_mlm=False):
        super(XLMForMTBPreTraining, self).__init__()

        self.config = config
        self.skip_mlm = skip_mlm
        # Add the entity markers and BLANKS tokens
        model = AutoModelWithLMHead.from_pretrained(pretrained_model_name)
        self.encoder = model.transformer
        self.encoder.resize_token_embeddings(self.config.vocab_size)

        if not self.skip_mlm:
            self.mlm_head = model.pred_layer
            self.mlm_loss_fn = nn.CrossEntropyLoss()
        self.re_head = REHead(self.config)
        self.mtb_loss_fn = nn.BCEWithLogitsLoss()
Example #17
def modelWithLMHead(*args, **kwargs):
    r"""
        # Using torch.hub !
        import torch

        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
Example #18
def loadModels(splittingModelPath, waypointModelPath):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("bert-base-uncased")

    splittingNet = SplittingNet(model.bert)
    splittingNet.load_state_dict(
        torch.load(splittingModelPath, map_location=torch.device('cpu')))
    splittingNet.eval()

    waypointNet = WaypointNet(model.bert)
    waypointNet.load_state_dict(
        torch.load(waypointModelPath, map_location=torch.device('cpu')))
    waypointNet.eval()

    return (splittingNet, waypointNet, tokenizer)
Example #19
    def __init__(self, model_scale):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        #self.device = "cpu"

        print("Device is " + str(self.device))

        #self.tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
        #self.model = GPT2LMHeadModel.from_pretrained("distilgpt2")
        if model_scale == 0:
            self.tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
            self.model = AutoModelWithLMHead.from_pretrained("distilgpt2")
            self.scorer = LMScorer.from_pretrained("distilgpt2",
                                                   device=self.device)
        elif model_scale == 1:
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
            self.model = AutoModelWithLMHead.from_pretrained("gpt2")
            self.scorer = LMScorer.from_pretrained("gpt2", device=self.device)
        elif model_scale == 2:
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
            self.model = AutoModelWithLMHead.from_pretrained("gpt2-medium")
            self.scorer = LMScorer.from_pretrained("gpt2-medium",
                                                   device=self.device)
        elif model_scale == 3:
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
            self.model = AutoModelWithLMHead.from_pretrained("gpt2-large")
            self.scorer = LMScorer.from_pretrained("gpt2-large",
                                                   device=self.device)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
            self.model = AutoModelWithLMHead.from_pretrained("gpt2-xl")
            self.scorer = LMScorer.from_pretrained("gpt2-xl",
                                                   device=self.device)

        #self.model.eval()
        self.model.to(self.device)
Example #20
    def __init__(self, transformer_model, random_seed):
        
        random.seed(random_seed)
        np.random.seed(random_seed)
        torch.manual_seed(random_seed)        

        self.random_seed = random_seed
        self.model = AutoModelWithLMHead.from_pretrained(transformer_model)
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model)
        self.terms          = []
        self.embeddings     = torch.FloatTensor([])
        self.embeddings_2d  = None
        self.diffs          = []
        self.embed          = None        
        self.sim_fn         = torch.nn.CosineSimilarity(dim=1)
Example #21
def load_from_pretrained():
    try:
        print("*" * 50)
        print("Load from checkpoint")
        tokenizer = GPT2Tokenizer.from_pretrained(opts.model_name_or_path,
                                                  pad_token='<|endoftext|>',
                                                  cls_token='<|cls|>',
                                                  sep_token='<|sep|>')
        model = GPT2LMHeadModel.from_pretrained(opts.model_name_or_path)
        try:
            with open(os.path.join(opts.output_dir, 'stats.pkl'), 'rb') as f:
                stats = pickle.load(f)
        except:
            print("Can't find training stats...")
            stats = None
        print("*" * 50)
    except Exception as e:
        print(e)
        try:
            # from dialogpt pretrained
            print("*" * 50)
            print("Load from pretrained")
            print("*" * 50)
            tokenizer = GPT2Tokenizer.from_pretrained(
                tokenizer_path,
                pad_token='<|endoftext|>',
                cls_token='<|cls|>',
                sep_token='<|sep|>')
            model = GPT2LMHeadModel.from_pretrained(model_path)
        except:
            print("*" * 50)
            print("Downloading ... ")
            print("*" * 50)
            # download dialogpt
            tokenizer = AutoTokenizer.from_pretrained(
                opts.download_name,
                pad_token='<|endoftext|>',
                cls_token='<|cls|>',
                sep_token='<|sep|>')
            model = AutoModelWithLMHead.from_pretrained(opts.download_name)
            # save to dialogpt
            tokenizer.save_pretrained(tokenizer_path)
            model.save_pretrained(model_path)
        stats = None
    tokenizer.add_special_tokens(
        {'additional_special_tokens': ['<|start|>', '<|p1|>', '<|p2|>']})
    model.resize_token_embeddings(len(tokenizer))
    return model.to(device), tokenizer, stats
Example #22
    def _set_language_model_and_tokenizer(self,
                                          pretrained_weights: str) -> None:
        self._pretrained_weights = pretrained_weights

        try:
            self.language_model = BertForMaskedLM.from_pretrained(
                self._pretrained_weights, cache_dir=self._lm_dir)
            self.tokenizer = BertTokenizer.from_pretrained(
                self._pretrained_weights, cache_dir=self._lm_dir)
        except OSError:
            print('{} requires AutoModelWithLMHead and AutoTokenizer'.format(
                pretrained_weights))
            self.language_model = AutoModelWithLMHead.from_pretrained(
                self._pretrained_weights, cache_dir=self._lm_dir)
            self.tokenizer = AutoTokenizer.from_pretrained(
                self._pretrained_weights, cache_dir=self._lm_dir)
Example #23
    def _convert_to_transformers_lm(adaptive_model, prediction_head):
        # init model
        transformers_model = AutoModelWithLMHead.from_config(adaptive_model.language_model.model.config)
        # transfer weights for language model + prediction head
        setattr(transformers_model, transformers_model.base_model_prefix, adaptive_model.language_model.model)
        # Adding decoder bias (required for conversion to transformers)
        prediction_head.decoder.bias = prediction_head.bias

        ph_state_dict = prediction_head.state_dict()
        ph_state_dict["transform.dense.weight"] = ph_state_dict.pop("dense.weight")
        ph_state_dict["transform.dense.bias"] = ph_state_dict.pop("dense.bias")
        ph_state_dict["transform.LayerNorm.weight"] = ph_state_dict.pop("LayerNorm.weight")
        ph_state_dict["transform.LayerNorm.bias"] = ph_state_dict.pop("LayerNorm.bias")
        transformers_model.cls.predictions.load_state_dict(ph_state_dict)

        return transformers_model
Example #24
def transformers_svc():
    """Return a Transformers BentoService."""
    # When the ExampleBentoService got saved and loaded again in the test, the
    # two class attributes below got set to the loaded BentoService class.
    # Resetting them here so this does not affect other tests.
    TransformersGPT2TextGenerator._bento_service_bundle_path = None
    TransformersGPT2TextGenerator._bento_service_bundle_version = None

    svc = TransformersGPT2TextGenerator()

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelWithLMHead.from_pretrained(
        "gpt2", pad_token_id=tokenizer.eos_token_id)
    model_artifact = {"model": model, "tokenizer": tokenizer}
    svc.pack("gptModel", model_artifact)
    return svc
Example #25
    def __init__(self, name='T5 Translator', task='translation_en_to_de', device="cpu"):
        super().__init__(name)
        
        #Init name and metadata
        self.name = name
        self.task = task
        # self.device = 1 if torch.cuda.is_available() else -1
        self.device = -1 if device.lower() == "cpu" else 1

        #Create net
        self.tokenizer = AutoTokenizer.from_pretrained("t5-base")
        self.model = AutoModelWithLMHead.from_pretrained("t5-base")
        self.predictor = pipeline(self.task, 
                                  model = self.model,
                                  tokenizer = self.tokenizer,
                                  device = self.device)
Example #26
def run_chinese_bert_wwm_torch():
    import torch
    tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
    model = AutoModelWithLMHead.from_pretrained("hfl/chinese-bert-wwm-ext")

    input_ids = torch.tensor(tokenizer.encode("[MASK][MASK][MASK]总书记讲到三个关联“做好[MASK][MASK]防控工作,直接关系[MASK][MASK]生命安全和身体健康,直接关系经济社会大局稳定,也事关我国对外开放”", add_special_tokens=True)).unsqueeze(
        0)  # Batch size 1
    # print(f"input_ids:{input_ids}")
    outputs = model(input_ids)
    logits = outputs[0]
    # print(logits)
    # print("-" * 30)
    # print(f"outputs:{outputs}")
    print("-" * 30)
    # print(tf.math.argmax(logits, axis=2)[0, :])
    print(tokenizer.decode(torch.argmax(logits, dim=2)[0, :]))
Example #27
    def __init__(self, brown_ic=None, glove_vecs=None, word_vectors=None):
        #write any initializing code here

        # DEFINE THRESHOLD VALUE
        self.dist_threshold = 0.3

        # 1. GET EMBEDDING FOR RED WORDS USING GPT2
        torch.set_grad_enabled(False)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.model = AutoModelWithLMHead.from_pretrained('gpt2')

        #get stop words and what-not
        nltk.download('popular', quiet=True)
        nltk.download('words', quiet=True)
        self.corp_words = set(nltk.corpus.words.words())

        return
Example #28
def main(config: DictConfig) -> None:
    print(config)
    tokenizer = AutoTokenizer.from_pretrained("imthanhlv/gpt2news")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    gpt2 = AutoModelWithLMHead.from_pretrained("imthanhlv/gpt2news")
    gpt2.resize_token_embeddings(len(tokenizer))
    gpt2.config.pad_token_id = gpt2.config.eos_token_id

    corpus = UITABSAHotel()
    # num_labels = corpus.num_labels
    num_labels = corpus.num_aspect_labels
    model = GPT2TextClassification(gpt2, num_labels)
    datamodule = MultiLabelClassificationDatamodule(corpus=corpus, tokenizer=tokenizer, **config.data)
    logger = WandbLogger(**config.logger)
    trainer = pl.Trainer(logger=logger, **config.trainer)
    trainer.fit(model, datamodule=datamodule)
Example #29
def load_model(model_path, tokenizer_path=None, no_cuda=False, quantize=False):
    '''
    Loads a pretrained language model and tokenizer.
    
    :param model_path:
        The name of standard pretrained model or a path
        to a checkpoint for weights initialization.
    :param tokenizer_path:
        Optional pretrained tokenizer name or path if not
        the same as the model checkpoint path. If both None,
        a new tokenizer will be initialized.
    :param no_cuda:
        Disable CUDA devices even when they are available. Defaults to False.
    :param quantize:
        Indicates whether to quantize the model after loading.
    :returns:
        A :class:`transformers.PreTrainedModel` and a
        :class:`transformers.PreTrainedTokenizer`.

    '''

    # Setup device
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not no_cuda else 'cpu')
    if quantize and device != 'cpu':
        raise RuntimeError('Model quantization only available on CPU devices.')

    if tokenizer_path:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    elif model_path:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    else:
        raise ValueError(
            'Instantiating a new tokenizer from scratch is not supported here; it can be done in a separate script. '
            'Use the tokenizer_path argument to provide the location of the tokenizer to load.'
        )

    model = AutoModelWithLMHead.from_pretrained(model_path)
    if quantize:
        model = torch.quantization.quantize_dynamic(model, {
            torch.nn.Linear, torch.nn.Embedding,
            transformers.modeling_utils.Conv1D
        },
                                                    dtype=torch.qint8)

    return model.to(device), tokenizer
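A usage sketch (the 'distilgpt2' checkpoint and prompt are illustrative); dynamic quantization requires the CPU path, so no_cuda=True is passed:

model, tokenizer = load_model('distilgpt2', no_cuda=True, quantize=True)
input_ids = tokenizer.encode('Once upon a time', return_tensors='pt')
output_ids = model.generate(input_ids, max_length=40,
                            pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))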
Example #30
def get_transformer(LM: bool):
    if LM:
        if model_name == 'distilbert-base-cased':
            model = TFDistilBertForMaskedLM.from_pretrained(
                'distilbert-base-cased')
        elif model_name == 'huggingface/CodeBERTa-small-v1':
            model = AutoModelWithLMHead.from_pretrained(
                'huggingface/CodeBERTa-small-v1')
            model = pt_to_tf(model, TFRobertaForMaskedLM)
    else:
        if model_name == 'distilbert-base-cased':
            model = TFDistilBertModel.from_pretrained(
                'distilbert-base-cased', )
        elif model_name == 'huggingface/CodeBERTa-small-v1':
            model = AutoModel.from_pretrained('huggingface/CodeBERTa-small-v1')
            model = pt_to_tf(model, TFRobertaModel)
    return model