def createTokenizer():
    currentDir = "/home/puneet/code/Interspeech/"  #os.path.dirname(os.path.realpath('/content/drive/MyDrive/iemocap(version2)/'))
    modelsFolder = os.path.join(currentDir, "iemocap(version2)/model",
                                "multi_cased_L-12_H-768_A-12")
    vocab_file = os.path.join(modelsFolder, "vocab.txt")

    tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    return tokenizer
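# Usage sketch (assumption: the multi_cased_L-12_H-768_A-12 folder and its
# vocab.txt exist at the hard-coded path above; note that do_lower_case=True is
# unusual for a cased vocabulary):
tokenizer = createTokenizer()
tokens = ["[CLS]"] + tokenizer.tokenize("hello, bert!") + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)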
Example #2
def getDeepBias(text):
    global mod
    print(text)

    text = text.replace('\n', ' ')  # str.replace returns a new string; assign it back
    tokenizer = bert_tokenization.FullTokenizer(vocab_file='vocab.txt',
                                                do_lower_case=False)
    tokens = tokenizer.tokenize(text)

    token_segments = []

    index = 382

    # first segment: 382 content tokens + [CLS] + [SEP] = 384 tokens
    tmp = ['[CLS]'] + tokens[:382] + ['[SEP]']
    tmp = tokenizer.convert_tokens_to_ids(tmp)

    # zero-pad the id list up to the fixed sequence length of 384
    while len(tmp) < 384:
        tmp.append(0)

    tmp = np.array(tmp)
    #print(tmp)
    #print(tmp.shape)
    #print('predicted')

    token_segments.append(tmp)

    while (index < len(tokens)):
        index += 382

        temp = ['[CLS]'] + tokens[index - 382:index] + ['[SEP]']
        temp = tokenizer.convert_tokens_to_ids(temp)

        if len(temp) > 100:
            while len(temp) < 384:
                temp.append(0)
            temp = np.array(temp)
            token_segments.append(temp)

    token_segments = np.array(token_segments)

    print(token_segments.shape)
    preds = []

    for t in token_segments:
        print(t)
        preds.append(mod.predict(t.reshape(1, 384)))

    # average the per-segment predictions; each prediction has shape (1, 2)
    avg = np.zeros(2)
    for p in preds:
        avg += p[0]
    avg /= len(preds)
    print("avg:" + str(avg))

    return avg[0]
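# Usage sketch: getDeepBias expects a global Keras model `mod` that takes a
# (1, 384) array of token ids (382 content tokens plus [CLS] and [SEP]) and a
# vocab.txt in the working directory. Hypothetical setup, not from the original:
#
#     mod = keras.models.load_model('bias_model.h5')
#     score = getDeepBias(article_text)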
Example #3
    def test_compare(self):

        model_dir = tempfile.TemporaryDirectory().name
        os.makedirs(model_dir)
        save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
        tokenizer = bert_tokenization.FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                                    do_lower_case=True)

        # prepare input
        max_seq_len = 16
        input_str = "hello, bert!"
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (
            max_seq_len - len(input_tokens))  # 1 for real tokens, 0 for padding
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        bert_1_seq_out = CompareBertActivationsTest.predict_on_stock_model(
            model_dir, input_ids, input_mask, token_type_ids)
        bert_2_seq_out = CompareBertActivationsTest.predict_on_keras_model(
            model_dir, input_ids, input_mask, token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")

        print("stock bert res", bert_1_seq_out.shape)
        print("keras bert res", bert_2_seq_out.shape)

        print("stock bert res:\n {}".format(bert_1_seq_out[0, :2, :10]),
              bert_1_seq_out.dtype)
        print("keras bert_res:\n {}".format(bert_2_seq_out[0, :2, :10]),
              bert_2_seq_out.dtype)

        abs_diff = np.abs(bert_1_seq_out - bert_2_seq_out).flatten()
        print("abs diff:", np.max(abs_diff), np.argmax(abs_diff))
        self.assertTrue(np.allclose(bert_1_seq_out, bert_2_seq_out, atol=1e-6))
Example #4
    def test_finetune(self):

        model_dir = tempfile.TemporaryDirectory().name
        os.makedirs(model_dir)
        save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
        tokenizer = bert_tokenization.FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                                    do_lower_case=True)

        # prepare input
        max_seq_len = 24
        input_str_batch = ["hello, bert!", "how are you doing!"]

        input_ids_batch = []
        token_type_ids_batch = []
        for input_str in input_str_batch:
            input_tokens = tokenizer.tokenize(input_str)
            input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

            print("input_tokens len:", len(input_tokens))

            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
            token_type_ids = [0] * len(input_tokens) + [0] * (
                max_seq_len - len(input_tokens))

            input_ids_batch.append(input_ids)
            token_type_ids_batch.append(token_type_ids)

        input_ids = np.array(input_ids_batch, dtype=np.int32)
        token_type_ids = np.array(token_type_ids_batch, dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        model = CompareBertActivationsTest.load_keras_model(
            model_dir, max_seq_len)
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=keras.losses.mean_squared_error)

        # predict once just to fetch the shape of the output
        pres = model.predict([input_ids, token_type_ids])
        print("pres:", pres.shape)

        model.fit(x=(input_ids, token_type_ids),
                  y=np.zeros_like(pres),
                  batch_size=2,
                  epochs=2)
Example #5
    def __init__(self, args, logger=None):
        self.args = args
        self.name = 'BERT'
        self.logger = logger
        self.manual_seed = args.seed
        self.max_seq_length = args.max_seq_length
        self.datapath = args.datapath
        self.bert_model_file = os.path.join(self.datapath,
                                            'pretrained_models/bert/')
        self.vocab_file = os.path.join(self.bert_model_file, 'vocab.txt')
        self.lower_case = True
        self.learning_rate = args.learning_rate
        self.finetuning_rate = args.finetuning_rate
        self.model_dir = args.logdir
        self.tokenizer = bert_tokenization.FullTokenizer(
            vocab_file=self.vocab_file, do_lower_case=self.lower_case)
        self.num_supervised_trials = args.num_supervised_trials
        self.sup_batch_size = args.train_batch_size
        self.sup_epochs = args.num_epochs
        self.unsup_epochs = args.num_unsup_epochs
        self.T = args.T
Example #6
    def __init__(self):
        max_seq_len = 128
        title_col = 'title'
        text_col = 'text'
        label_col = 'type'
        model_dir = path.join(path.dirname(path.abspath(__file__)), 'uncased_L-12_H-768_A-12')

        print('loading bert data...')
        train_data = pd.read_csv("../CNN_data/all_data.csv")
        tokenizer = bert_tokenization.FullTokenizer(vocab_file=path.join(model_dir, "vocab.txt"))
        input_tokens = []
        input_labels = []

        print('tokenizing bert data...')
        for _, row in train_data.iterrows():
            text, title, label = row[text_col], row[title_col], row[label_col]
            total_text = text + title
            
            tokens = tokenizer.tokenize(total_text)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            token_ids = token_ids[:min(len(token_ids), max_seq_len)]
            token_ids = token_ids + [0] * (max_seq_len - len(token_ids))
            
            input_tokens.append(token_ids)
            input_labels.append(label)
        print('loaded and processed bert data!')

        doubles = list(zip(input_tokens, input_labels))
        np.random.shuffle(doubles)
        input_tokens, input_labels = zip(*doubles)
        print('shuffled bert data')

        self.train_data = np.array(input_tokens[:int(len(input_tokens) * 0.75)])
        self.test_data = np.array(input_tokens[int(len(input_tokens) * 0.75):])
        self.train_labels = np.array(input_labels[:int(len(input_labels) * 0.75)])
        self.test_labels = np.array(input_labels[int(len(input_labels) * 0.75):])
        self.max_seq_len = max_seq_len
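        # Sketch (assumption, not part of the original class): the 'type' labels
        # above are stored as raw values; training code would typically map them
        # to integer ids first, e.g.:
        #
        #     classes = sorted(set(input_labels))
        #     label_to_id = {c: i for i, c in enumerate(classes)}
        #     input_labels = [label_to_id[l] for l in input_labels]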
Example #7
                y.append(int(label))
                pbar.update()
        return np.array(x), np.array(y)

    def _pad(self, ids):
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)
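    # Behavior note (added): for max_seq_len=8 and ids=[[5, 6, 7]], _pad returns
    # x=[[5, 6, 7, 0, 0, 0, 0, 0]] and t=[[0]*8]; inputs longer than
    # max_seq_len - 2 are truncated, and no [CLS]/[SEP] ids are added here.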


tokenizer = bert_tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = MovieReviewData(
    tokenizer,
    sample_size=10 * 128 * 2,  #5000, 
    max_seq_len=128)

## EDA: shapes of the train/test data

print("            train_x", data.train_x.shape)
print("train_x_token_types", data.train_x_token_types.shape)
print("            train_y", data.train_y.shape)

print("             test_x", data.test_x.shape)

print("        max_seq_len", data.max_seq_len)
Example #8
# prepare class encoder
le = ce.OneHotEncoder(return_df=False, handle_unknown="ignore")
# labels = le.fit(list(df['id']))
mapa = [0, 1]

labels_map = [0, 1]
# i = 0
# for a in mapa:
#    labels_map.append(a)
# print(labels_map)

# Tokenization
# Inizialize the tokenizer
from bert import bert_tokenization

tokenizer = bert_tokenization.FullTokenizer(vocab_path, do_lower_case=True)
# tokenizer = tokenization.FullTokenizer(vocab_path, do_lower_case=True)
# indices_train = []
indices_test = []

# for text in train['Desc']:
#  tk = tokenizer.tokenize(text)
#  tokens = ["[CLS]"] + tk + ["[SEP]"]
#  token_ids = tokenizer.convert_tokens_to_ids(tokens)
#  token_ids = _pad(token_ids,SEQ_LEN)
#  indices_train.append(token_ids)
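# The commented-out training loop above calls a _pad helper that is not shown in
# this snippet. A minimal sketch, assuming simple truncation plus zero-padding to
# SEQ_LEN (the original helper may differ):
def _pad(token_ids, seq_len):
    token_ids = token_ids[:seq_len]
    return token_ids + [0] * (seq_len - len(token_ids))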

for text in test['Desc']:
    tk = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tk + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # mirror the commented-out training loop: pad to SEQ_LEN and collect
    token_ids = _pad(token_ids, SEQ_LEN)
    indices_test.append(token_ids)
Example #9
import pandas as pd
import numpy as np
import tensorflow_hub as hub

from bert import bert_tokenization
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

base_model = hub.load(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1')

bert_layer = hub.KerasLayer(base_model)

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

MAX_SEQ_LEN = 65
TEST_SIZE = 0.2
LR = 1e-4

N_EPOCHS = 5
BATCH_SIZE = 32

df = pd.read_csv('./data/clean_train.csv', index_col=False)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
print("Split Data")
X_data = df['text'][:5].to_numpy()
y_data = df['target'][:5].to_numpy()
y_data = y_data.reshape(-1, 1)
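# Sketch (assumption, continuing the setup above): encode X_data into
# fixed-length id sequences for the BERT layer, using the usual [CLS]/[SEP]
# markers plus zero-padding to MAX_SEQ_LEN:
X_ids = []
for text in X_data:
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:MAX_SEQ_LEN - 2] + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    X_ids.append(ids + [0] * (MAX_SEQ_LEN - len(ids)))
X_ids = np.array(X_ids, dtype=np.int32)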
Example #10
def load_dataset(dataset_path, model_path, vocab_file, max_seq_len=None, return_len=False):
    df = pd.read_csv(dataset_path, sep='\t', compression='infer', header=None, index_col=None)
    df.columns = ['pmid', 'paragraph', 'sentence', 'in_sent_id', 'entity1', 'entity2',
                  'old_sent', 'class', 'distance', 'sample_sentence']

    sentences = list(df['sample_sentence'])
    labels = list(df['class'])

    vocab_path = model_path + vocab_file
    tokenizer = bert_tokenization.FullTokenizer(vocab_path, False)

    i_ent_tag = re.compile('<I>')
    o_ent_tag = re.compile('<O>')

    sentences_tokens = list()  # [[1,2,3], [5,2,3]]
    entity_position = list()  # [[(1,2), (2,3)]]

    for sent in sentences:

        bert_tokens = list()
        bert_target_indices = list()
        split_sent = sent.split('<S>')

        bert_tokens.append('[CLS]')

        for split in split_sent:
            if i_ent_tag.findall(split):
                start = len(bert_tokens)
                cur_split = i_ent_tag.sub('', split)
                word_pieces = tokenizer.tokenize(cur_split)
                bert_tokens.extend(word_pieces)
                end = len(bert_tokens)
                bert_target_indices.append([start, end])

            elif o_ent_tag.findall(split):

                cur_split = o_ent_tag.sub('', split)
                word_pieces = tokenizer.tokenize(cur_split)
                bert_tokens.extend(word_pieces)

            else:
                cur_split = split
                word_pieces = tokenizer.tokenize(cur_split)
                bert_tokens.extend(word_pieces)

        bert_tokens.append('[SEP]')
        sample_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        sentences_tokens.append(sample_ids)
        bert_target_indices.sort()
        entity_position.append(bert_target_indices)

    if max_seq_len is not None:
        all_len = np.array([len(item) for item in sentences_tokens])
        is_shorter_than_max = all_len <= max_seq_len
        x = np.array(pad_sequences(sequences=sentences_tokens,
                                   maxlen=max_seq_len, padding="post"))[is_shorter_than_max, :]
        ent1_position = np.array([pair[0] for pair in entity_position])[is_shorter_than_max, :]
        ent2_position = np.array([pair[1] for pair in entity_position])[is_shorter_than_max, :]
        y = np.array(labels)[is_shorter_than_max]
        df = df.loc[is_shorter_than_max, :]
        if return_len:
            return df, (x, ent1_position, ent2_position, all_len[is_shorter_than_max]), y
        else:
            return df, (x, ent1_position, ent2_position), y
    else:
        x = np.array(sentences_tokens)
        ent1_position = np.array([pair[0] for pair in entity_position])  # start-end for slicing
        ent2_position = np.array([pair[1] for pair in entity_position])
        y = np.array(labels)
        return df, (x, ent1_position, ent2_position), y
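# Usage sketch (hypothetical paths; the TSV layout must match the column list
# defined at the top of load_dataset):
#
#     df, (x, ent1_pos, ent2_pos), y = load_dataset(
#         dataset_path='data/train.tsv', model_path='model/',
#         vocab_file='vocab.txt', max_seq_len=128)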
Example #11
    def __init__(self, vocab_file, max_seq_len):
        self.vocab_file = vocab_file
        self.max_seq_len = max_seq_len
        self.tokenizer = bert_tokenization.FullTokenizer(vocab_file)
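    # Hypothetical helper, not in the original snippet: a minimal encode() sketch
    # using the stored tokenizer and max_seq_len ([CLS]/[SEP] plus zero-padding).
    def encode(self, text):
        tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
        ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.max_seq_len]
        return ids + [0] * (self.max_seq_len - len(ids))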
Example #12
def get_BERT_Tokenizer():
    path = os.getcwd()[:os.getcwd().rfind('/')] + '/deeplearning/'
    vocab_file = path + 'uncased_L-12_H-768_A-12' + '/vocab.txt'
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=True)
    return tokenizer
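# Usage sketch (assumes the uncased_L-12_H-768_A-12 vocab.txt exists under the
# sibling deeplearning/ directory that the path logic above expects):
tokenizer = get_BERT_Tokenizer()
print(tokenizer.tokenize("Hello, BERT!"))  # lower-cased WordPiece tokens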