Example n. 1
    def load_data(self):
        train = tfds.load('glue/' + self.dataset_name,
                          split='train',
                          shuffle_files=True)
        train_unshuffled = tfds.load('glue/' + self.dataset_name,
                                     split='train',
                                     shuffle_files=False)
        validation = tfds.load('glue/' + self.dataset_name,
                               split='validation',
                               shuffle_files=True)
        # test = tfds.load('glue/' + self.dataset_name, split='test', shuffle_files=True)

        # Prepare datasets for Huggingface's transformers
        tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)

        train = glue_convert_examples_to_features(train,
                                                  tokenizer,
                                                  max_length=self.max_length,
                                                  task=self.dataset_name)
        self.train_unshuffled = glue_convert_examples_to_features(
            train_unshuffled,
            tokenizer,
            max_length=self.max_length,
            task=self.dataset_name)
        validation = glue_convert_examples_to_features(
            validation,
            tokenizer,
            max_length=self.max_length,
            task=self.dataset_name)
        # test = glue_convert_examples_to_features(test, tokenizer, max_length=self.max_length, task=self.dataset_name)

        self.validation = validation.batch(self.max_length).prefetch(1)
        self.train = train.shuffle(1000).repeat().batch(
            int(self.max_length / 2)).prefetch(1)
Example n. 2
    def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
        if model_name not in ModelsByFamily.Supported:
            raise ValueError(f'Model {model_name} not supported.')

        do_lower_case = 'uncased' in model_name.lower()
        tokenizer_kwargs.update({'do_lower_case': do_lower_case})

        self._tokenizer = None
        self._model = None

        if model_name in ModelsByFamily.Bert:
            self._tokenizer = BertTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFBertForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.Roberta:
            self._tokenizer = RobertaTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFRobertaForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.XLNet:
            self._tokenizer = XLNetTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFXLNetForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.DistilBert:
            self._tokenizer = DistilBertTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFDistilBertForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)

        assert self._tokenizer and self._model, f'Failed to load {model_name}'
Example n. 3
def simple_inference():
    '''
    This one is simpler and better for the general case, though it doesn't show
    the distribution over all the sentiments. It uses the TextClassificationPipeline
    from the transformers library, which is preferable.
    :return:
    '''
    tokenizer = DistilBertTokenizer.from_pretrained("./model_out/")
    model = DistilBertForSequenceClassification.from_pretrained("./model_out/")
    model.to('cpu')
    sentiment_classifier = TextClassificationPipeline(model=model,
                                                      tokenizer=tokenizer,
                                                      device=-1)

    t1 = time.time()
    result = sentiment_classifier("this is so cute!")
    t2 = time.time()
    print(t2 - t1, result)

    result = sentiment_classifier("That's so disgusting!")
    t3 = time.time()
    print(t3 - t2, result)

    result = sentiment_classifier("this is a simple test.")
    t4 = time.time()
    print(t4 - t3, result)
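For reference, a TextClassificationPipeline call like the ones above returns a list of dicts; a minimal sketch of consuming one result (the exact label strings depend on the model's id2label config):

result = sentiment_classifier("this is so cute!")
# Typically something like: [{'label': 'POSITIVE', 'score': 0.999}]
top = result[0]
print(top["label"], round(top["score"], 3))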
Example n. 4
def training_data(
    tickets_data_path: str,
    text_column: str,
    label_column: str,
    test_size: float = 0.25,
    subset_size: int = -1,
    max_length: int = 100,
    pad_to_max_length: bool = True,
) -> Tuple[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
           DistilBertTokenizer]:

    df = pd.read_csv(tickets_data_path)
    x = df[text_column].tolist()
    y = df[label_column].tolist()
    unique_labels = sorted(list(set(y)))
    y = encode_labels(y, unique_labels)
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
    tokenizer.max_length = max_length
    tokenizer.pad_to_max_length = pad_to_max_length
    print("tokenizing all texts...")
    x = encode_texts(tokenizer, x)
    subset_size = len(x) if subset_size < 0 else subset_size
    x_train, x_test, y_train, y_test = train_test_split(x[:subset_size],
                                                        y[:subset_size],
                                                        test_size=test_size,
                                                        random_state=42)
    return (x_train, x_test, y_train, y_test), tokenizer
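The helpers encode_labels and encode_texts are not shown above; a minimal sketch of what they plausibly do (an assumption written against a recent transformers API, not the original implementation):

import numpy as np

def encode_labels(labels, unique_labels):
    # Map each label to its index in the sorted unique-label list.
    index = {label: i for i, label in enumerate(unique_labels)}
    return np.array([index[label] for label in labels])

def encode_texts(tokenizer, texts):
    # Tokenize each text into a fixed-length id sequence, padded/truncated
    # to the max_length the caller attached to the tokenizer.
    return np.array([
        tokenizer.encode(text,
                         add_special_tokens=True,
                         max_length=tokenizer.max_length,
                         padding='max_length',
                         truncation=True)
        for text in texts
    ])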
Example n. 5
def load_model(manifest):
    """Loads the model object from the file at model_filepath key in config dict"""
    checkpoints_path = manifest["model_filepath"]
    if __name__ == "__main__":
        checkpoints = checkpoints_path
    else:
        checkpoints = client.file(checkpoints_path).getFile().name
        assert_model_md5(checkpoints)

    class_mapping = {
        0: "Movies_Negative",
        1: "Movies_Positive",
        2: "Food_Negative",
        3: "Food_Positive",
        4: "Clothing_Negative",
        5: "Clothing_Positive",
    }
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(class_mapping),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model.load_state_dict(
        torch.load(checkpoints, map_location=torch.device("cpu")))
    return model, tokenizer, class_mapping
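A hedged usage sketch for this loader, assuming it runs as a script so the local-path branch is taken, and a recent transformers version for .logits (the path and input are illustrative):

model, tokenizer, class_mapping = load_model({"model_filepath": "model.pt"})

inputs = tokenizer("The pizza was great!", return_tensors="pt", truncation=True)
pred = model(**inputs).logits.argmax(-1).item()
print(class_mapping[pred])  # e.g. "Food_Positive"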
Example n. 6
    def test_distilbert(self):
        for tokenizer_name in DistilBertTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
            tokenizer_p = DistilBertTokenizer.from_pretrained(tokenizer_name)
            tokenizer_r = DistilBertTokenizerFast.from_pretrained(tokenizer_name)

            # Check we have the same number of added_tokens for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
            self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

            # Check we have the correct max_length for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
            self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

            # DistilBert should match 100%
            # Assert the set of special tokens match.
            self.assertSequenceEqual(
                tokenizer_p.special_tokens_map.items(),
                tokenizer_r.special_tokens_map.items(),
                "DistilBert tokenizers doesn't have the same set of special_tokens",
            )

            # Ensure tokenization overlap between the Python and Rust implementations.
            self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0)

            # Ensure add_tokens and add_special_tokens return the correct vocab size
            self.assert_add_tokens(tokenizer_r)

            # Check for offsets mapping
            self.assert_offsets_mapping(tokenizer_r)

            # Check for dynamic encoding sequence handling in batch_encode_plus
            self.assert_batch_encode_dynamic_overflowing(tokenizer_r)

            # Check alignment for build_inputs_with_special_tokens
            self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
Example n. 7
def load_transformer(model_type):
    if model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=1)
    elif model_type == "bert_x12":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=1)
    elif model_type == "bert_x24":
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=1)
    elif model_type == "albert_v2_x12":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1)
    elif model_type == "longformer_x12":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-base-4096", num_labels=1)
    elif model_type == "longformer_x24":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-large-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-large-4096", num_labels=1)
    else:
        raise ValueError(model_type + " was invalid")

    return model, tokenizer
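A hedged usage sketch for the loader above (assumes a recent transformers version where tokenizers are callable and TF model outputs expose .logits; with num_labels=1 the head produces a single regression score):

model, tokenizer = load_transformer("distilbert")
enc = tokenizer("an example sentence", return_tensors="tf",
                padding=True, truncation=True)
outputs = model(enc)
score = outputs.logits[0, 0]  # single regression score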
Example n. 8
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    ailia_model = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english')
    model_inputs = tokenizer.encode_plus(args.input, return_tensors="pt")
    inputs_onnx = {
        k: v.cpu().detach().numpy()
        for k, v in model_inputs.items()
    }

    print("Input : ", args.input)

    # inference
    if args.benchmark:
        print('BENCHMARK mode')
        for i in range(5):
            start = int(round(time.time() * 1000))
            score = ailia_model.predict(inputs_onnx)
            end = int(round(time.time() * 1000))
            print("\tailia processing time {} ms".format(end - start))
    else:
        score = ailia_model.predict(inputs_onnx)

    score = numpy.exp(score) / numpy.exp(score).sum(-1, keepdims=True)

    label_name = ["negative", "positive"]

    label_id = numpy.argmax(numpy.array(score))
    print("Label : ", label_name[label_id])
    print("Score : ", score[0][0][label_id])

    print('Script finished successfully.')
Example n. 9
    def __init__(
        self,
        model=None,
        tokenizer=None,
        model_name="bert-large-uncased",
        mask_token="***mask***",
        disable_gpu=False,
    ):
        self.mask_token = mask_token
        self.delemmatizer = Delemmatizer()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu"
        )
        print("using model:", model_name)
        print("device:", self.device)

        if not model:
            if "distilbert" in model_name:
                self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
            else:
                self.bert = BertForMaskedLM.from_pretrained(model_name)
        else:
            self.bert = model
        self.bert.to(self.device)

        if not tokenizer:
            if "distilbert" in model_name:
                self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
            else:
                self.tokenizer = BertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = tokenizer

        self.bert.eval()
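For context, a minimal hedged sketch of querying a masked LM like self.bert directly (independent of this class's own API; assumes a recent transformers version):

import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
model = BertForMaskedLM.from_pretrained("bert-large-uncased")
model.eval()

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero()[0]
predicted_id = logits[0, mask_pos].argmax(-1)
print(tokenizer.decode(predicted_id))  # e.g. "paris"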
Example n. 10
 def get_tokenizer(self):
     if self.hparams.model_type == 'bert':
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     elif self.hparams.model_type == 'bert-cased':
         tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
     elif self.hparams.model_type == 'bert-large':
         tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
     elif self.hparams.model_type == 'distilbert':
         tokenizer = DistilBertTokenizer.from_pretrained(
             'distilbert-base-uncased')
     elif self.hparams.model_type == 'roberta':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
     elif self.hparams.model_type == 'roberta-large':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
     elif self.hparams.model_type == 'albert':
         tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     elif self.hparams.model_type == 'albert-xxlarge':
         tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
     elif self.hparams.model_type == 'electra':
         tokenizer = ElectraTokenizer.from_pretrained(
             'google/electra-base-discriminator')
     elif self.hparams.model_type == 'electra-large':
         tokenizer = ElectraTokenizer.from_pretrained(
             'google/electra-large-discriminator')
     else:
         raise ValueError(
             f'Unsupported model_type: {self.hparams.model_type}')
     return tokenizer
Example n. 11
def answergen(context, question):

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                    return_token_type_ids=True)
    model = DistilBertForQuestionAnswering.from_pretrained(
        'distilbert-base-uncased-distilled-squad')

    encoding = tokenizer.encode_plus(question, context)

    input_ids, attention_mask = encoding["input_ids"], encoding[
        "attention_mask"]

    # Note: this tuple unpacking assumes an older transformers version (or
    # return_dict=False); newer versions return an output object with
    # .start_logits and .end_logits instead.
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     attention_mask=torch.tensor(
                                         [attention_mask]))

    ans_tokens = input_ids[torch.argmax(start_scores
                                        ):torch.argmax(end_scores) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens,
                                                    skip_special_tokens=True)

    print("\nQuestion ", question)
    #print ("\nAnswer Tokens: ")
    #print (answer_tokens)

    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    #print ("\nAnswer : ",answer_tokens_to_string)
    return answer_tokens_to_string
Example n. 12
def find_matches(model, image_embeddings, query, image_filenames, n=9):
    tokenizer = DistilBertTokenizer.from_pretrained(CFG.text_tokenizer)
    encoded_query = tokenizer([query])
    batch = {
        key: torch.tensor(values).to(CFG.device)
        for key, values in encoded_query.items()
    }
    with torch.no_grad():
        text_features = model.text_encoder(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"])
        text_embeddings = model.text_projection(text_features)

    image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
    text_embeddings_n = F.normalize(text_embeddings, p=2, dim=-1)
    dot_similarity = text_embeddings_n @ image_embeddings_n.T

    _, indices = torch.topk(dot_similarity.squeeze(0), n * 5)
    # Take every 5th match: each image typically appears several times
    # (once per caption), so this skips near-duplicates.
    matches = [image_filenames[idx] for idx in indices[::5]]

    _, axes = plt.subplots(3, 3, figsize=(10, 10))
    for match, ax in zip(matches, axes.flatten()):
        image = cv2.imread(f"{CFG.image_path}/{match}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax.imshow(image)
        ax.axis("off")

    plt.show()
Example n. 13
def build_model_pretrained(config):

    #Create different tokenizers for both source and target language.
    src_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    #encoder_config = DistilBertConfig.from_pretrained('distilbert-base-multilingual-cased')

    encoder = DistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased')

    if config.decoder.pretrained:
        decoder = BertForMaskedLM.from_pretrained('bert-base-uncased')
    else:

        decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                    is_decoder=True)
        decoder = BertForMaskedLM(decoder_config)

    model = TranslationModel(encoder, decoder)
    model.cuda()

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    return model, tokenizers
Example n. 14
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Initialize it if not initialized.

    Args:
        lm (string): the name of the language model (bert, distilbert, albert,
            roberta, xlnet, or longformer)
    Returns:
        The tokenizer corresponding to the specified language model
    """
    global tokenizer
    if tokenizer is None:
        if lm == 'bert':
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif lm == 'distilbert':
            from transformers import DistilBertTokenizer
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        elif lm == 'albert':
            from transformers import AlbertTokenizer
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        elif lm == 'roberta':
            from transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif lm == 'xlnet':
            from transformers import XLNetTokenizer
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        elif lm == 'longformer':
            from transformers import LongformerTokenizer
            tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    return tokenizer
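A short usage sketch for the lazy loader above (the global statement implies a module-level tokenizer = None, shown here as an assumption):

tokenizer = None  # module-level cache, populated on first call

ids = get_tokenizer(lm='distilbert').encode('a test sentence',
                                            add_special_tokens=True)
print(ids[:5])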
Example n. 15
 def load_tokenizer(self):
     # Load the tokenizer.
     if self.verbose:
         print('Loading {} tokenizer...'.format(self.model_name))
     if self.model_name == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
     elif self.model_name == 'distilbert':
         self.tokenizer = DistilBertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'albert':
         self.tokenizer = AlbertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'bart':
         self.tokenizer = BartTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
     elif self.model_name == 'xlnet':
         self.tokenizer = XLNetTokenizer.from_pretrained(self.model_type,
                                                         do_lower_case=True)
     elif self.model_name == 'roberta':
         self.tokenizer = RobertaTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'camenbert':
         self.tokenizer = CamembertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'flaubert':
         self.tokenizer = FlaubertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'gpt2':
         self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_type)
Example n. 16
def main():
    
    # 1 get data into dataframe
    df = read_into_pandas()
    (mlb_category, df) = replace_column_with_label_representation(df, 'category', 'category_int')
    df_train, df_test = train_test_split(df, test_size=0.2)    

    # 2 transform into BERT format
    df_bert = pd.DataFrame({
        'id':df_train['id'],
        'label':df_train['category_int'],
        'alpha':['a']*df_train.shape[0],
        'text': df_train['text'].str[:512].replace(r'\n', ' ', regex=True)
    })
    df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01)
    df_bert_test = pd.DataFrame({
        'id':df_test['id'],
        'text': df_test['text'].str[:512].replace(r'\n', ' ', regex=True)
    })
    # Saving dataframes to .tsv format as required by BERT
    df_bert_train.to_csv('../datasets/Newswire_BERT/train.tsv', sep='\t', index=False, header=False)
    df_bert_dev.to_csv('../datasets/Newswire_BERT/dev.tsv', sep='\t', index=False, header=False)
    df_bert_test.to_csv('../datasets/Newswire_BERT/test.tsv', sep='\t', index=False, header=False)

    # 3 load pretrained model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', return_dict=True)

    # 4 transform
    tokenized = df_bert_train['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

    print('Padding')
    
    max_len = max(len(i) for i in tokenized.values)

    padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])

    print('Shape after padding ' + str(padded.shape))
    
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask).to('cuda:0')

    print('Embedding model start')

    # Note: no optimizer step happens below; this is a single forward pass
    # (loss is computed because labels are passed) followed by saving the
    # still-pretrained weights.
    model.train()

    with torch.no_grad():
        input_ids = input_ids.clone().detach().to(torch.int64).to('cuda:0')
        model = model.to('cuda:0')
        labels = torch.tensor(df_bert_train['label'].values).to(torch.int64).to('cuda:0')
        print(labels)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        print(model)
        model.save_pretrained('models/BERT1')
Example n. 17
 def __init__(self, max_len):
     self.model_name = 'distilbert-base-uncased'
     self.max_len = max_len
     self.tkzr = DistilBertTokenizer.from_pretrained(self.model_name)
     self.model = TFDistilBertForSequenceClassification.from_pretrained(
         self.model_name)
     self.optimizer = optimizers.Adam(learning_rate=3e-5)
     self.loss = losses.SparseCategoricalCrossentropy(from_logits=True)
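The class above prepares an optimizer and loss but does not show training; a hedged sketch of the standard Keras wiring under the same settings (checkpoint name and toy data are illustrative, not from the original class):

import tensorflow as tf
from tensorflow.keras import losses, optimizers
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

enc = tokenizer(['good film', 'bad film'], padding=True, return_tensors='tf')
labels = tf.constant([1, 0])

model.compile(optimizer=optimizers.Adam(learning_rate=3e-5),
              loss=losses.SparseCategoricalCrossentropy(from_logits=True))
model.fit(dict(enc), labels, epochs=1, batch_size=2)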
Example n. 18
 def model_load(self, path: str):
     config = DistilBertConfig.from_pretrained(path + "/config.json")
     tokenizer = DistilBertTokenizer.from_pretrained(
         path, do_lower_case=self.do_lower_case)
     model = DistilBertForQuestionAnswering.from_pretrained(path,
                                                            from_tf=False,
                                                            config=config)
     return model, tokenizer
Example n. 19
    def __init__(self, filename, maxlen):
        # Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter="\t")

        # Initialize the DistilBERT tokenizer
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

        self.maxlen = maxlen
Example n. 20
def answer(context: str, question: str):
    tokenizer: DistilBertTokenizer = DistilBertTokenizer.from_pretrained(
        MODEL_PATH, return_token_type_ids=True)
    input_ids, input_mask = encode(context, question, tokenizer)
    answer_tokens = get_answer_tokens(input_ids, input_mask, tokenizer)

    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    return answer
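encode and get_answer_tokens are defined elsewhere in this source; a hedged sketch of plausible implementations (the checkpoint stands in for MODEL_PATH, and names/behavior are assumptions):

import torch
from transformers import DistilBertForQuestionAnswering

# Hypothetical stand-in for the model implied by MODEL_PATH:
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad')

def encode(context, question, tokenizer):
    # DistilBERT uses no token_type_ids, so only ids and mask are returned.
    encoding = tokenizer.encode_plus(question, context)
    return encoding['input_ids'], encoding['attention_mask']

def get_answer_tokens(input_ids, input_mask, tokenizer):
    outputs = model(torch.tensor([input_ids]),
                    attention_mask=torch.tensor([input_mask]),
                    return_dict=True)
    start = torch.argmax(outputs.start_logits)
    end = torch.argmax(outputs.end_logits) + 1
    return tokenizer.convert_ids_to_tokens(input_ids[start:end],
                                           skip_special_tokens=True)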
Example n. 21
 def __init__(self,
              model_name="distilbert-base-uncased-distilled-squad",
              device="cuda"):
     super().__init__()
     self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
     self.model = DistilBertForQuestionAnswering.from_pretrained(model_name)
     self.device = device
     self.model = self.model.to(self.device)
Example n. 22
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.model = DistilBertForQuestionAnswering.from_pretrained(
         self.model_dir)
     self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_dir)
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
     self.model.to(self.device)
Example n. 23
 def __init__(self, *args, **kwargs):
     # initialize super class with request & response schema, configs
     super().__init__(*args, **kwargs)
     # initialize model and other tools
     self.tokenizer = DistilBertTokenizer.from_pretrained(
         'distilbert-base-uncased')
     self.model = DistilBertForSequenceClassification.from_pretrained(
         'distilbert-base-uncased-finetuned-sst-2-english')
Example n. 24
 def __init__(
         self,
         semantic_analysis_config: BrainSentimentAnalysisConfiguration):
     super().__init__()
     self._semantic_analysis_config = semantic_analysis_config
     model_dir = semantic_analysis_config.model_dir
     tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
     model = DistilBertForSequenceClassification.from_pretrained(model_dir)
     self.sentiment_classifier = SentimentClassifer(model, tokenizer)
Example n. 25
    def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        self.model = DistilBertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
Example n. 26
 def __init__(self, qas: list, qids: list, aids: list, goldids: dict):
     self.tokenizer = DistilBertTokenizer.from_pretrained(
         'distilbert-base-cased')
     # BertTokenizer.from_pretrained('bert-base-cased')
     self.qas = qas
     self.qids = qids
     self.aids = aids
     self.goldids = goldids
     self.max_len = 512
Example n. 27
def check_sentiment(text):
    tokenizer = DistilBertTokenizer.from_pretrained('./pretrain_distillbert_full_sst')
    model = DistilBertForSequenceClassification.from_pretrained('./pretrain_distillbert_full_sst')
    sentiment_classifier = SentimentClassifer(model, tokenizer)
    result = sentiment_classifier(text)
    sentiment = max(result, key=result.get)
    sentiment_distribution = list(result.values())
    print("sentiment of {}: {}".format(text, sentiment))
    return sentiment
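SentimentClassifer (spelled as in the source) is a project-local wrapper; a hedged sketch of what such a wrapper plausibly looks like, given that check_sentiment treats the result as a {label: probability} dict (the label names are an assumption):

import torch
import torch.nn.functional as F

class SentimentClassifer:
    def __init__(self, model, tokenizer, labels=('negative', 'positive')):
        self.model = model
        self.tokenizer = tokenizer
        self.labels = labels
        self.model.eval()

    def __call__(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            logits = self.model(**inputs).logits
        probs = F.softmax(logits, dim=-1)[0]
        # {label: probability}, as check_sentiment() above expects.
        return {label: probs[i].item() for i, label in enumerate(self.labels)}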
Example n. 28
def build_model(config):

    src_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    #hidden_size and intermediate_size are both wrt all the attention heads.
    #Should be divisible by num_attention_heads
    encoder_config = BertConfig(
        vocab_size=src_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12)

    decoder_config = BertConfig(
        vocab_size=tgt_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_decoder=True)

    #Create encoder and decoder embedding layers.
    encoder_embeddings = nn.Embedding(src_tokenizer.vocab_size,
                                      config.hidden_size,
                                      padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = nn.Embedding(tgt_tokenizer.vocab_size,
                                      config.hidden_size,
                                      padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings)

    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings)

    model = TranslationModel(encoder, decoder)

    return model, src_tokenizer, tgt_tokenizer
Example n. 29
 def __init__(self, model_name="distilbert-base-uncased", device="cuda"):
     super().__init__()
     self.device = device
     self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
     self.model = DistilBertModel.from_pretrained(model_name).to(
         self.device)
     self.linear = nn.Linear(self.model.config.dim,
                             self.model.config.num_labels).to(self.device)
     self.dropout = nn.Dropout(self.model.config.qa_dropout).to(self.device)
Example n. 30
def get_tokenizer(name, size):
    if name == 'bert':
        return BertTokenizer.from_pretrained(f"bert-{size}-uncased")
    elif name == "albert":
        return AlbertTokenizer.from_pretrained(f"albert-{size}-v2")
    elif name == "distilbert":
        return DistilBertTokenizer.from_pretrained(f"distilbert-{size}-uncased")
    else:
        raise AssertionError(f"unsupported tokenizer name: {name}")
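Usage pairs a family name with a size fragment that is spliced into the checkpoint id, e.g. (a minimal sketch):

tok = get_tokenizer('distilbert', 'base')  # loads 'distilbert-base-uncased'
print(tok.tokenize('Hello world!'))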