Example #1
def predict(input_text, net_trained, candidate_num=3, output_print=False):
    # Load the saved TEXT field (vocabulary) and prepare the tokenizer
    TEXT = pickle_load(PKL_FILE)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    # Preprocess, tokenize, and add the BERT special tokens
    text = preprocessing_text(input_text)
    text = tokenizer_bert.tokenize(text)
    text.insert(0, "[CLS]")
    text.append("[SEP]")

    # Convert tokens to IDs and pad with 1s up to max_length
    token_ids = torch.ones((max_length)).to(torch.int64)
    ids_list = list(map(lambda x: TEXT.vocab.stoi[x], text))
    for i, index in enumerate(ids_list):
        token_ids[i] = index
    ids_list = token_ids.unsqueeze_(0)

    input = ids_list.to(device)
    input_mask = (input != 1)  # padding mask (not passed to the model below)
    outputs, attention_probs = net_trained(input,
                                           token_type_ids=None,
                                           attention_mask=None,
                                           output_all_encoded_layers=False,
                                           attention_show_flg=True)

    # Subtract the score offset and return the top-k predicted classes
    offset_tensor = torch.tensor(offset, device=device)
    outputs -= offset_tensor
    if output_print:
        print(outputs)
    _, preds = torch.topk(outputs, candidate_num)
    return preds
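A hedged usage sketch for predict, assuming the module-level names it relies on (PKL_FILE, VOCAB_FILE, max_length, offset) are defined and that the fine-tuned network was saved with cloudpickle as in Example #9; the pickle path and the input sentence below are made up.

import cloudpickle

# Hypothetical path to a fine-tuned model saved with cloudpickle
with open("./weights/bert_fine_tuning_chABSA.pkl", "rb") as f:
    net_trained = cloudpickle.load(f)
net_trained.eval()

preds = predict("業績は引き続き堅調に推移しました。", net_trained,
                candidate_num=2, output_print=True)
print(preds)  # indices of the top-2 predicted classes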
Example #2
def mk_html(input, preds, normlized_weights, TEXT):
    "Create the HTML output"
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    # Extract the result for this index
    index = 0
    sentence = input.squeeze_(0)  # sentence: torch.Size([1, 256]) -> torch.Size([256])
    pred = preds[0]  # prediction

    # Convert the predicted class to a string
    if pred == 0:
        pred_str = "Negative"
    else:
        pred_str = "Positive"

    # Build the HTML for display

    html = 'Predicted label: {}<br><br>'.format(pred_str)
    # Visualize the self-attention weights. With 12 Multi-Head attention heads, there are 12 kinds of attention.

    for i in range(12):

        # Extract and normalize the attention for this index
        # Take the i-th attention head for token 0 ([CLS])
        # index indicates which item of the mini-batch to use
        attens = normlized_weights[index, i, 0, :]
        attens /= attens.max()

        # html += '[Visualizing BERT attention_' + str(i+1) + ']<br>'
        for word, attn in zip(sentence, attens):

            # [SEP] marks the end of the sentence, so stop there
            if tokenizer_bert.convert_ids_to_tokens([word.numpy().tolist()])[0] == "[SEP]":
                break

            # highlight() colors the word; convert_ids_to_tokens() maps the ID back to a word
            # html += highlight(tokenizer_bert.convert_ids_to_tokens(
            #     [word.numpy().tolist()])[0], attn)
        # html += "<br><br>"

    # Sum the 12 attention heads and normalize by the maximum value
    all_attens = attens * 0  # zero tensor used as the accumulator
    for i in range(12):
        all_attens += normlized_weights[index, i, 0, :]
    all_attens /= all_attens.max()

    html += '[Visualizing BERT attention_ALL]<br>'
    for word, attn in zip(sentence, all_attens):

        # [SEP] marks the end of the sentence, so stop there
        if tokenizer_bert.convert_ids_to_tokens([word.numpy().tolist()])[0] == "[SEP]":
            break

        # highlight() colors the word; convert_ids_to_tokens() maps the ID back to a word
        html += highlight(tokenizer_bert.convert_ids_to_tokens(
            [word.numpy().tolist()])[0], attn)
    html += "<br><br>"

    return html
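mk_html calls a highlight helper that is not shown in this example. A minimal sketch of such a helper, assuming attn is a normalized scalar in [0, 1]; the actual implementation in the source project may differ.

def highlight(word, attn):
    "Shade the word's background: the larger attn is, the deeper the red."
    html_color = '#%02X%02X%02X' % (255, int(255 * (1 - attn)), int(255 * (1 - attn)))
    return '<span style="background-color: {}"> {}</span>'.format(html_color, word)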
Example #3
    def __init__(self,
                 data_dir=r'./',
                 bert_dir=r'./pytorch_advanced/nlp_sentiment_bert/'):
        self.data_dir = data_dir
        self.bert_dir = bert_dir
        self.tokenizer_bert = BertTokenizer(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt",
            do_lower_case=True)
        self.vocab_bert, self.ids_to_tokens_bert = load_vocab(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt")

        config = get_config(file_path=self.bert_dir +
                            "weights/bert_config.json")
        self.net_bert = BertModel(config)
        self.net_bert = set_learned_params(self.net_bert,
                                           weights_path=self.bert_dir +
                                           "weights/pytorch_model.bin")
Example #4
    def __init__(self, vocab_file, max_text_length=256, use_basic_form=False, mecab_dict=None):
        self.tokenizer = BertTokenizer(
            vocab_file=vocab_file, do_lower_case=False, do_basic_tokenize=False)
        if mecab_dict is not None:
            self.tagger = MeCab.Tagger("-d {}".format(mecab_dict))
        else:
            self.tagger = MeCab.Tagger("")
        self.text_field, self.label_field = self._prepare(
            max_text_length, use_basic_form)
        self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)
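The class above combines MeCab word segmentation with a wordpiece-only BertTokenizer (do_basic_tokenize=False). A minimal sketch of that combination, assuming mecab-python3 and a Japanese vocab file are available; the vocab path and sample sentence are illustrative, and -Owakati is used here as a shortcut for getting space-separated surface forms.

import MeCab
from utils.bert import BertTokenizer  # same helper module as in the other examples

tagger = MeCab.Tagger("-Owakati")  # or MeCab.Tagger("-d /path/to/dict") as in the class above
wordpiece = BertTokenizer(vocab_file="./vocab/vocab.txt",
                          do_lower_case=False, do_basic_tokenize=False)

text = "本日の売上は好調でした。"
words = tagger.parse(text).split()        # MeCab surface forms, space separated
tokens = []
for w in words:
    tokens.extend(wordpiece.tokenize(w))  # wordpiece-split each MeCab word
print(tokens)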
Example #5
    def __init__(self, vocab_file, max_text_length=256, **kwargs):
        do_normalize_text = kwargs.get("do_normalize_text", False)
        use_basic_form = kwargs.get("use_basic_form", False)
        mecab_dict = kwargs.get("mecab_dict", None)

        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False, do_basic_tokenize=False)
        if mecab_dict is not None:
            self.tagger = MeCab.Tagger("-d {}".format(mecab_dict))
        else:
            self.tagger = MeCab.Tagger("")
        self.text_field, self.label_field = self._prepare(max_text_length, do_normalize_text, use_basic_form)
        self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)
Example #6
import os
import io
import string
import re
import sys
import random
import spacy
import torchtext
import mojimoji
#import MeCab
from torchtext.vocab import Vectors
from utils.bert import BertTokenizer, load_vocab

# Prepare the tokenizer for word segmentation
tokenizer_bert = BertTokenizer(vocab_file="./vocab/vocab.txt",
                               do_lower_case=False)


def get_chABSA_DataLoaders_and_TEXT(max_length=256, batch_size=32):
    """Return the DataLoaders and the TEXT object for the chABSA dataset."""
    def preprocessing_text(text):

        # Normalize half-width characters to full-width
        text = mojimoji.han_to_zen(text)
        # Remove newlines, half-width spaces, and full-width spaces
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub(' ', '', text)
        text = re.sub('　', '', text)
        # Replace every run of digits with a single "0"
        text = re.sub(r'[0-9０-９]+', '0', text)  # digits
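To see what this preprocessing does, here is a small self-contained demo of the same normalization steps; the sample sentence is made up.

import re
import mojimoji

sample = "売上高は 1,234円でした\n"
sample = mojimoji.han_to_zen(sample)          # half-width -> full-width
sample = re.sub('\r', '', sample)
sample = re.sub('\n', '', sample)
sample = re.sub(' ', '', sample)              # half-width space
sample = re.sub('　', '', sample)             # full-width space
sample = re.sub(r'[0-9０-９]+', '0', sample)  # collapse digit runs to "0"
print(sample)  # -> 売上高は0，0円でした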
Example #7
# In[14]:


# Sanity check: inspect one mini-batch from the dataloader
batch = next(iter(train_dl))
print("Shape of Text =", batch.Text[0].shape)
print("Shape of Label =", batch.Label.shape)
print(batch.Text)
print(batch.Label)


# In[15]:


# Look at one sentence from the mini-batch
tokenizer_bert = BertTokenizer(vocab_file="./vocab/vocab.txt", do_lower_case=False)
text_minibatch_1 = (batch.Text[0][1]).numpy()

# Convert the IDs back to words
text = tokenizer_bert.convert_ids_to_tokens(text_minibatch_1)

print(text)


# # 2. Implementing the BERT-based negative/positive classification model

# In[16]:


from utils.bert import get_config, BertModel, BertForchABSA, set_learned_params
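The import above brings in the pieces used to assemble the classifier. A hedged sketch of how they appear to fit together, following the pattern of the __init__ methods in Examples #3 and #9; the file paths and the BertForchABSA constructor arguments are assumptions.

# Load the BERT configuration and the pretrained weights (paths are illustrative)
config = get_config(file_path="./weights/bert_config.json")
net_bert = BertModel(config)
net_bert = set_learned_params(net_bert, weights_path="./weights/pytorch_model.bin")

# Wrap the BERT body with the chABSA classification head (constructor signature assumed)
net = BertForchABSA(net_bert)
net.train()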
Example #8
def DataLoader(max_length=256, batch_size=32):
    """Return the DataLoaders and the TEXT object."""
    # Fix the random seeds
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)
    # Prepare the tokenizer for word segmentation
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    def preprocessing_text(text):
        # Normalize half-width characters to full-width
        text = mojimoji.han_to_zen(text)
        # Remove newlines, half-width spaces, and full-width spaces
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub(' ', '', text)
        text = re.sub('　', '', text)
        text = re.sub("\"", '', text)
        # Replace every run of digits with a single "0"
        text = re.sub(r'[0-9０-９]+', '0', text)  # digits

        # Replace symbols other than commas and periods with spaces
        for p in string.punctuation:
            if (p == ".") or (p == ","):
                continue
            else:
                text = text.replace(p, " ")
        return text

    # Define a function that combines preprocessing and word segmentation
    # Note: pass tokenizer_bert.tokenize (the segmentation function), not tokenizer_bert itself
    def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
        text = preprocessing_text(text)
        ret = tokenizer(text)  # tokenizer_bert
        return ret

    # Define how the data is processed when it is read
    # (max_length comes from the function argument)
    TEXT = torchtext.data.Field(sequential=True,
                                tokenize=tokenizer_with_preprocessing,
                                use_vocab=True,
                                lower=False,
                                include_lengths=True,
                                batch_first=True,
                                fix_length=max_length,
                                init_token="[CLS]",
                                eos_token="[SEP]",
                                pad_token='[PAD]',
                                unk_token='[UNK]')
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

    # Read each csv file from the "data" folder
    # Processing for BERT takes a little under 10 minutes
    train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
        path=DATA_PATH,
        train='train.csv',
        test='test.csv',
        format='csv',
        fields=[('Text', TEXT), ('Label', LABEL)])

    vocab_bert, ids_to_tokens_bert = load_vocab(vocab_file=VOCAB_FILE)
    TEXT.build_vocab(train_val_ds, min_freq=1)
    TEXT.vocab.stoi = vocab_bert

    # BERT typically uses a batch size of around 16 or 32 (batch_size comes from the function argument)
    train_dl = torchtext.data.Iterator(train_val_ds,
                                       batch_size=batch_size,
                                       train=True)
    val_dl = torchtext.data.Iterator(test_ds,
                                     batch_size=batch_size,
                                     train=False,
                                     sort=False)
    # Collect the loaders into a dictionary
    dataloaders_dict = {"train": train_dl, "val": val_dl}
    return train_dl, val_dl, TEXT, dataloaders_dict
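A minimal usage sketch for the loader above, assuming VOCAB_FILE and DATA_PATH point at an existing vocab file and a folder containing train.csv and test.csv; the paths are illustrative.

VOCAB_FILE = "./vocab/vocab.txt"   # hypothetical
DATA_PATH = "./data/"              # hypothetical

train_dl, val_dl, TEXT, dataloaders_dict = DataLoader(max_length=256, batch_size=32)

# Inspect one mini-batch, as in the notebook cells above
batch = next(iter(dataloaders_dict["train"]))
print(batch.Text[0].shape)  # e.g. torch.Size([32, 256])
print(batch.Label.shape)    # e.g. torch.Size([32])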
Example #9
class tmv_torch_bert_classify(lreg.tmv_tf_log_regress_classify):
    def __init__(self,
                 data_dir=r'./',
                 bert_dir=r'./pytorch_advanced/nlp_sentiment_bert/'):
        self.data_dir = data_dir
        self.bert_dir = bert_dir
        self.tokenizer_bert = BertTokenizer(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt",
            do_lower_case=True)
        self.vocab_bert, self.ids_to_tokens_bert = load_vocab(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt")

        config = get_config(file_path=self.bert_dir +
                            "weights/bert_config.json")
        self.net_bert = BertModel(config)
        self.net_bert = set_learned_params(self.net_bert,
                                           weights_path=self.bert_dir +
                                           "weights/pytorch_model.bin")

    def load_data(self,
                  csv_file_kspa,
                  dependent_var,
                  langs=None,
                  task_word='Definition',
                  answer_ex_clm='Definition'):
        self.dependent_var = dependent_var
        self.answer_ex_clm = answer_ex_clm
        self.df_response_answer_ex = pd.read_csv(self.data_dir + csv_file_kspa,
                                                 encoding='latin1')
        self.df_response_answer_ex = self.df_response_answer_ex.set_index(
            r'Student_Question_Index')

        if langs is not None:
            lang_clm = task_word + r'-Language'
            self.df_response_answer_ex = \
                self.df_response_answer_ex[self.df_response_answer_ex[lang_clm].isin(langs)]

        self.ans_clm = task_word + r'-Answer'
        self.ans_and_ex_clm = task_word + r'-Answer-and-Example'

        self.df_response_answer_ex[self.ans_and_ex_clm] = self.df_response_answer_ex[self.answer_ex_clm] \
            + ' ' + self.df_response_answer_ex[self.ans_clm]

        # to move LABEL and TXT columns to the end
        columns = list(self.df_response_answer_ex.columns)
        columns.remove(self.dependent_var)
        columns.remove(self.ans_and_ex_clm)
        columns.append(self.dependent_var)
        columns.append(self.ans_and_ex_clm)
        self.df_ac_modeling_values = self.df_response_answer_ex.reindex(
            columns=columns)

    def get_tokens(self):
        def preprocessing_text(text):
            for p in string.punctuation:
                if (p == ".") or (p == ","):
                    continue
                else:
                    text = text.replace(p, " ")

            text = text.replace(".", " . ")
            text = text.replace(",", " , ")
            return text

        def tokenizer_with_preprocessing(text,
                                         tokenizer=self.tokenizer_bert.tokenize
                                         ):
            text = preprocessing_text(text)
            ret = tokenizer(text)  # tokenizer_bert
            return ret

        TEXT = torchtext.data.Field(sequential=True,
                                    tokenize=tokenizer_with_preprocessing,
                                    use_vocab=True,
                                    lower=True,
                                    include_lengths=True,
                                    batch_first=True,
                                    fix_length=max_length,
                                    init_token="[CLS]",
                                    eos_token="[SEP]",
                                    pad_token='[PAD]',
                                    unk_token='[UNK]')
        LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

        fields = [(None, None)] * (len(self.df_response_answer_ex.columns) - 1)
        fields.append(('Label', LABEL))
        fields.append(('Text', TEXT))

        train_val_ds = torchtext.data.TabularDataset(
            path=self.modeling_data_file_name,
            format='csv',
            fields=fields,
            skip_header=True)

        TEXT.build_vocab(train_val_ds, min_freq=1)
        TEXT.vocab.stoi = self.vocab_bert

        return train_val_ds

    # Modified by [email protected] 11/22/2020
    def perform_modeling(self,
                         df_ac_modeling_data,
                         key_word=r'',
                         csv_dump=False,
                         number_class=3,
                         epochs=10,
                         batch_size=32,
                         tmp_csv_name='TORCH_RESPONSE_ANSWER_EX_FILE.CSV',
                         bert_pkl_name='weights/bert_fine_tuning_VDOK_'):
        self.modeling_data_file_name = self.data_dir + tmp_csv_name
        self.batch_size = batch_size
        df_ac_modeling_data_buf = df_ac_modeling_data.copy()

        if self.ans_and_ex_clm not in df_ac_modeling_data_buf.columns:
            df_ac_modeling_data_buf[self.ans_and_ex_clm] = df_ac_modeling_data_buf[self.answer_ex_clm] \
                                    + ' ' + df_ac_modeling_data_buf[self.ans_clm]

            # to move LABEL and TXT columns to the end
            columns = list(df_ac_modeling_data_buf.columns)
            columns.remove(self.dependent_var)
            columns.remove(self.ans_and_ex_clm)
            columns.append(self.dependent_var)
            columns.append(self.ans_and_ex_clm)
            df_ac_modeling_data_buf = df_ac_modeling_data_buf.reindex(
                columns=columns)

        df_ac_modeling_data_buf.to_csv(self.modeling_data_file_name)

        train_val_ds = self.get_tokens()

        train_ds, val_ds = train_val_ds.split(
            split_ratio=0.8, random_state=random.seed(random_seed))

        train_dl = torchtext.data.Iterator(train_ds,
                                           batch_size=self.batch_size,
                                           train=True)

        val_dl = torchtext.data.Iterator(val_ds,
                                         batch_size=self.batch_size,
                                         train=False,
                                         sort=False)

        self.dataloaders_dict = {"train": train_dl, "val": val_dl}

        batch = next(iter(val_dl))
        print(batch.Text)
        print(batch.Label)

        text_minibatch_1 = (batch.Text[0][1]).numpy()
        text = self.tokenizer_bert.convert_ids_to_tokens(text_minibatch_1)
        print(text)

        print('Building model...')
        net = BertForVDOK(self.net_bert, number_class)
        net.train()

        for name, param in net.named_parameters():
            param.requires_grad = False

        for name, param in net.bert.encoder.layer[-1].named_parameters():
            param.requires_grad = True

        for name, param in net.cls.named_parameters():
            param.requires_grad = True

        optimizer = optim.Adam(
            [{
                'params': net.bert.encoder.layer[-1].parameters(),
                'lr': 5e-5
            }, {
                'params': net.cls.parameters(),
                'lr': 5e-5
            }],
            betas=(0.9, 0.999))

        self.criterion = nn.CrossEntropyLoss()

        self.net_trained = self.train_model(net,
                                            self.dataloaders_dict,
                                            self.criterion,
                                            optimizer,
                                            num_epochs=epochs)

        # Modified by [email protected] 11/22/2020
        # save_path = self.bert_dir + 'weights/bert_fine_tuning_VDOK_' + key_word + '.pth'
        # torch.save(self.net_trained.state_dict(), save_path)
        save_path = self.bert_dir + bert_pkl_name + key_word + '.pkl'
        with open(save_path, 'wb') as f:
            cloudpickle.dump(self.net_trained, f)

    def train_model(self, net, dataloaders_dict, criterion, optimizer,
                    num_epochs):
        device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
        print('Using device: ', device)
        print('-----start-------')

        net.to(device)

        torch.backends.cudnn.benchmark = True

        batch_size = dataloaders_dict["train"].batch_size

        for epoch in range(num_epochs):
            for phase in ['train', 'val']:
                if phase == 'train':
                    net.train()
                else:
                    net.eval()

                epoch_loss = 0.0
                epoch_corrects = 0
                iteration = 1

                t_epoch_start = time.time()
                t_iter_start = time.time()

                for batch in (dataloaders_dict[phase]):
                    inputs = batch.Text[0].to(device)
                    labels = batch.Label.to(device)

                    optimizer.zero_grad()

                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = net(inputs,
                                      token_type_ids=None,
                                      attention_mask=None,
                                      output_all_encoded_layers=False,
                                      attention_show_flg=False)

                        loss = criterion(outputs, labels)

                        _, preds = torch.max(outputs, 1)

                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                            if (iteration % 10 == 0):
                                t_iter_finish = time.time()
                                duration = t_iter_finish - t_iter_start
                                acc = (torch.sum(preds == labels.data)
                                       ).double() / batch_size
                                print(
                                    'Iteration {} || Loss: {:.4f} || 10iter: {:.4f} sec. || Accuracy: {}'
                                    .format(iteration, loss.item(), duration,
                                            acc))
                                t_iter_start = time.time()

                        iteration += 1

                        epoch_loss += loss.item() * batch_size
                        epoch_corrects += torch.sum(preds == labels.data)

                t_epoch_finish = time.time()
                epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
                epoch_acc = epoch_corrects.double() / len(
                    dataloaders_dict[phase].dataset)

                print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(
                    epoch + 1, num_epochs, phase, epoch_loss, epoch_acc))
                t_epoch_start = time.time()

        return net

    def perform_prediction(self, df_ac_prediction_data, number_class):
        self.df_ac_predict_target = df_ac_prediction_data.loc[:, [
            self.dependent_var
        ]]
        df_ac_prediction_data_buf = df_ac_prediction_data.copy()

        if self.ans_and_ex_clm not in df_ac_prediction_data_buf.columns:
            df_ac_prediction_data_buf[self.ans_and_ex_clm] = df_ac_prediction_data_buf[self.answer_ex_clm] \
                                + ' ' + df_ac_prediction_data_buf[self.ans_clm]

            # to move LABEL and TXT columns to the end
            columns = list(df_ac_prediction_data_buf.columns)
            columns.remove(self.dependent_var)
            columns.remove(self.ans_and_ex_clm)
            columns.append(self.dependent_var)
            columns.append(self.ans_and_ex_clm)
            df_ac_prediction_data_buf = df_ac_prediction_data_buf.reindex(
                columns=columns)

        df_ac_prediction_data_buf.to_csv(self.modeling_data_file_name)

        test_ds = self.get_tokens()
        test_dl = torchtext.data.Iterator(test_ds,
                                          batch_size=self.batch_size,
                                          train=False,
                                          sort=False)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.net_trained.eval()
        self.net_trained.to(device)

        epoch_corrects = 0
        self.predict_res = []

        for batch in tqdm(test_dl):
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
            inputs = batch.Text[0].to(device)
            labels = batch.Label.to(device)

            with torch.set_grad_enabled(False):
                outputs = self.net_trained(inputs,
                                           token_type_ids=None,
                                           attention_mask=None,
                                           output_all_encoded_layers=False,
                                           attention_show_flg=False)
                loss = self.criterion(outputs, labels)
                _, preds = torch.max(outputs, 1)
                epoch_corrects += torch.sum(preds == labels.data)
                self.predict_res += preds.tolist()

        epoch_acc = epoch_corrects.double() / len(test_dl.dataset)

        print('Test Data {} Accuracy: {:.4f}'.format(len(test_dl.dataset),
                                                     epoch_acc))

        self.df_ac_classified = pd.DataFrame(
            np.array(self.predict_res, dtype=np.int64),
            df_ac_prediction_data.index, [r'Score_Class'])
        self.df_ac_classified[self.dependent_var] = self.df_ac_predict_target[
            self.dependent_var]

    def modeling_prediction_evaluation_all(self,
                                           key_word=r'',
                                           csv_dump=False,
                                           number_class=3,
                                           epochs=10,
                                           batch_size=32):
        self.df_ac_predict_target_all = pd.DataFrame()
        self.predict_res_all = np.array([], np.int64)
        self.df_ac_classified_all = pd.DataFrame()

        for x in range(len(self.random_order_set)):
            print(r'----------------')
            print(r'RANDOM SET: ', x)
            self.iloc_concat_for_cross_validation(x)
            self.perform_modeling(
                self.df_ac_modeling_values.iloc[
                    self.concatenated_value_order, :], key_word, csv_dump,
                number_class, epochs)
            self.perform_prediction(
                self.df_ac_modeling_values.iloc[self.random_order_set[x], :],
                number_class)
            self.evaluate_prediction(key_word)
            if len(self.df_ac_predict_target_all) == 0:
                self.df_ac_predict_target_all = self.df_ac_predict_target.copy(
                )
            else:
                self.df_ac_predict_target_all = self.df_ac_predict_target_all.append(
                    self.df_ac_predict_target)
            self.predict_res_all = np.append(self.predict_res_all,
                                             self.predict_res)
            if len(self.df_ac_classified_all) == 0:
                self.df_ac_classified_all = self.df_ac_classified.copy()
                self.df_indices_all = pd.DataFrame(self.se_indices)
            else:
                self.df_ac_classified_all = self.df_ac_classified_all.append(
                    self.df_ac_classified)
                self.df_indices_all = pd.concat(
                    [self.df_indices_all, self.se_indices], axis=1)

        self.df_indices_all = self.df_indices_all.T
        print(r'----------------')
        print(r'ALL DATA (Macro Average):')
        print(self.df_indices_all.describe())
        if csv_dump:
            self.df_indices_all.describe().to_csv(
                self.data_dir + r'Classified-Prediction-Indices-Macro-' +
                key_word + r'.csv',
                encoding='latin1')
        print(r'----------------')
        print(r'ALL DATA (Micro Average):')
        self.evaluate_prediction(
            key_word,
            csv_dump=True,
            df_ac_predict_target=self.df_ac_predict_target_all,
            predict_res=self.predict_res_all)

    # Modified by [email protected] on 11/22/2020
    def restore_model(self,
                      key_word=r'',
                      tmp_csv_name='TORCH_RESPONSE_ANSWER_EX_FILE.CSV',
                      bert_pkl_name='weights/bert_fine_tuning_VDOK_',
                      batch_size=32):
        self.modeling_data_file_name = self.data_dir + tmp_csv_name
        self.batch_size = batch_size
        save_path = self.bert_dir + bert_pkl_name + key_word + '.pkl'
        with open(save_path, 'rb') as f:
            self.net_trained = cloudpickle.load(f)
        self.criterion = nn.CrossEntropyLoss()
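A rough end-to-end sketch of how this class appears to be driven, based only on the methods shown above; the CSV file name, the dependent-variable column, and the task word are illustrative, and the CSV is assumed to contain the columns the class expects (such as Student_Question_Index and the task's Answer columns).

clf = tmv_torch_bert_classify(data_dir='./data/',
                              bert_dir='./pytorch_advanced/nlp_sentiment_bert/')

# Hypothetical CSV and dependent-variable column
clf.load_data(csv_file_kspa='responses.csv',
              dependent_var='Score',
              task_word='Definition')

clf.perform_modeling(clf.df_ac_modeling_values, key_word='demo',
                     number_class=3, epochs=10, batch_size=32)

clf.perform_prediction(clf.df_ac_modeling_values, number_class=3)
print(clf.df_ac_classified.head())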
Example #10
def tokenizer_with_preprocessing(text):
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    text = preprocessing_text(text)
    ret = tokenizer_bert.tokenize(text)
    return ret


def preprocessing_text(text):
    # Replace symbols other than commas and periods with spaces
    for p in string.punctuation:
        if (p == ".") or (p == ","):
            continue
        else:
            text = text.replace(p, " ")

    # Put a space before and after periods and commas
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    return text


# Prepare the tokenizer for word segmentation
tokenizer_bert = BertTokenizer(
    vocab_file="./weights/bert-base-uncased-vocab.txt", do_lower_case=True)


# Define a function that combines preprocessing and word segmentation
# Note: pass tokenizer_bert.tokenize (the segmentation function), not tokenizer_bert itself
def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
    text = preprocessing_text(text)
    ret = tokenizer(text)  # tokenizer_bert
    return ret


def main():
    # define output dataframe
    sample = pd.read_csv("./data/sample_submission.csv")
    # Define how the data is processed when it is read
    max_length = 256
Example #12
    def __init__(self, vocab_file, max_text_length, **kwargs):
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=True)
        self.text_field, self.label_field = self._prepare(max_text_length)
        self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)