Example #1
def construct_test(root,
                   path,
                   tokenizer_path,
                   MAX_SEQUENCE_LENGTH,
                   test=False,
                   batch_size=1):
    tokenizer = tokenization.FullTokenizer(tokenizer_path, True)

    data = pd.read_csv(root + path)
    input_categories = list(data.columns[[1, 2, 5]])

    inputs = compute_input_arays(data, input_categories, tokenizer,
                                 MAX_SEQUENCE_LENGTH)

    def generator():
        for in1, in2, in3 in zip(inputs[0], inputs[1], inputs[2]):
            yield {
                'input_word_ids': in1,
                'input_masks': in2,
                'input_segments': in3
            }

    dataset = tf.data.Dataset.from_generator(
        generator, {
            'input_word_ids': tf.int32,
            'input_masks': tf.int32,
            'input_segments': tf.int32
        })
    dataset = dataset.batch(batch_size)
    print('Test dataset constructed successfully with shape =', data.shape)

    return dataset
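For reference, a call to this helper might look like the sketch below; the root, file name, and vocab path are placeholders, not values taken from the original code.

# Hypothetical usage sketch; all paths below are placeholders.
test_ds = construct_test(root='../input/google-quest-challenge/',
                         path='test.csv',
                         tokenizer_path='vocab.txt',
                         MAX_SEQUENCE_LENGTH=512,
                         batch_size=8)
for batch in test_ds.take(1):
    print({k: v.shape for k, v in batch.items()})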
Example #2
    def load_model(self, gpu_id, vocab_file, gpu_memory_fraction, model_path,
                   max_seq_length):
        # Pin the process to the requested GPU before creating the session.
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id
        self.tokenizer = bert_tokenization.FullTokenizer(vocab_file=vocab_file,
                                                         do_lower_case=True)
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        sess_config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=sess_config)
        # Load the frozen graph and import it into the session's graph.
        with gfile.FastGFile(model_path, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            self.sess.graph.as_default()
            tf.import_graph_def(graph_def, name="")

        self.sess.run(tf.global_variables_initializer())
        # Look up the graph's input and output tensors by name.
        self.is_train = self.sess.graph.get_tensor_by_name("input/is_train:0")
        self.input_ids = self.sess.graph.get_tensor_by_name(
            "input/input_ids:0")
        self.input_mask = self.sess.graph.get_tensor_by_name(
            "input/input_mask:0")
        self.segment_ids = self.sess.graph.get_tensor_by_name(
            "input/segment_ids:0")
        self.predictions = self.sess.graph.get_tensor_by_name(
            "output_layer/predictions:0")
        self.max_seq_length = max_seq_length
Example #3
    def __init__(self, is_training):

        self.is_training = is_training
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

        print(FLAGS.include_unknowns)
        print(FLAGS.max_seq_length)
Example #4
def prepro(config):
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=config.vocab_file)
    trainBunch = build_features(config, config.train_file, "train", tokenizer)
    print("save train bunch")
    with open(config.train_eval, "wb") as f:
        pickle.dump(trainBunch, f)
    devBunch = build_features(config, config.dev_file, "dev", tokenizer)
    print("save dev bunch")
    with open(config.dev_eval, "wb") as f:
        pickle.dump(devBunch, f)
Example #5
def load_model_etc(model_dir):
    with open(_config_path(model_dir)) as f:
        config = json.load(f)
    model = load_model(_model_path(model_dir))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=_vocab_path(model_dir),
        do_lower_case=config['do_lower_case'])
    labels = load_labels(_labels_path(model_dir))
    return model, tokenizer, labels, config
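A minimal usage sketch, assuming a model directory laid out the way the _*_path helpers expect; the directory name is a placeholder.

# Hypothetical usage; 'model_dir/' is a placeholder.
model, tokenizer, labels, config = load_model_etc('model_dir/')
tokens = tokenizer.tokenize('example sentence')
ids = tokenizer.convert_tokens_to_ids(tokens)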
Example #6
    def load_bert_vocab(self):
        with tf.Graph().as_default():
            bert_model = hub.Module(self.bert_url)
            vocab_info = bert_model(signature="tokenization_info",
                                    as_dict=True)
            with tf.Session() as sess:
                vocab_file, do_lower_case = sess.run(
                    [vocab_info["vocab_file"], vocab_info["do_lower_case"]])

        return bert_tokenization.FullTokenizer(vocab_file=vocab_file,
                                               do_lower_case=do_lower_case)
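The returned object is a regular FullTokenizer, so a quick sanity check might look like this, assuming self.bert_url points at a TF Hub BERT module that exposes the tokenization_info signature.

# Hypothetical check; 'encoder' is a placeholder instance of the class above.
tokenizer = encoder.load_bert_vocab()
print(tokenizer.tokenize('embeddings are useful'))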
Example #7
def load_ner_model(ner_model_dir):
    with open(_ner_config_path(ner_model_dir)) as f:
        config = json.load(f)
    model = keras.models.load_model(
        _ner_model_path(ner_model_dir),
        custom_objects=get_custom_objects()
    )
    tokenizer = tokenization.FullTokenizer(
        vocab_file=_ner_vocab_path(ner_model_dir),
        do_lower_case=config['do_lower_case']
    )
    labels = read_labels(_ner_labels_path(ner_model_dir))
    return model, tokenizer, labels, config
Example #8
def load_pretrained(options):
    model = load_trained_model_from_checkpoint(
        options.bert_config_file,
        options.init_checkpoint,
        training=False,
        trainable=True,
        seq_len=options.max_seq_length,
    )
    tokenizer = tokenization.FullTokenizer(
        vocab_file=options.vocab_file,
        do_lower_case=options.do_lower_case
    )
    return model, tokenizer
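A sketch of how this might be invoked; the options namespace and all paths are placeholders mirroring the parameters used above.

# Hypothetical invocation; every path here is a placeholder.
from argparse import Namespace
options = Namespace(bert_config_file='bert_config.json',
                    init_checkpoint='bert_model.ckpt',
                    max_seq_length=128,
                    vocab_file='vocab.txt',
                    do_lower_case=True)
model, tokenizer = load_pretrained(options)
model.summary()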
Example #9
    def get_tokenizer(self):
        bert_layer = self.model.get_layer("bert")
        try:
            vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()  # noqa
            do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
            tokenizer = bert_tokenization.FullTokenizer(vocab_file,
                                                        do_lower_case)
        # ALBERT uses a different tokenizer.
        # AttributeError: '_UserObject' object has no attribute 'vocab_file'
        except AttributeError:
            sp_model_file = bert_layer.resolved_object.sp_model_file.asset_path.numpy()  # noqa
            tokenizer = bert_tokenization.FullSentencePieceTokenizer(
                sp_model_file)
        return tokenizer
Example #10
def construct(root,
              path,
              tokenizer_path,
              MAX_SEQUENCE_LENGTH,
              test=False,
              batch_size=1):
    tokenizer = tokenization.FullTokenizer(tokenizer_path, True)

    data = pd.read_csv(root + path)
    output_categories = list(data.columns[11:])
    input_categories = list(data.columns[[1, 2, 5]])

    outputs = compute_output_arrays(data, output_categories)
    inputs = compute_input_arays(data, input_categories, tokenizer,
                                 MAX_SEQUENCE_LENGTH)
    if not test:

        def generator():
            for in1, in2, in3, out in zip(inputs[0], inputs[1], inputs[2],
                                          outputs):
                yield {
                    'input_word_ids': in1,
                    'input_masks': in2,
                    'input_segments': in3
                }, out

        dataset = tf.data.Dataset.from_generator(
            generator, ({
                'input_word_ids': tf.int32,
                'input_masks': tf.int32,
                'input_segments': tf.int32
            }, tf.float32))
        dataset = dataset.batch(batch_size)
        print('Train dataset constructed successfully with shape =',
              data.shape)
    else:
        # In test mode, return the raw arrays for evaluation instead.
        return (inputs, outputs)
    return dataset
Example #11
def main(argv):
    args = argparser().parse_args(argv[1:])

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file,
        do_lower_case=args.do_lower_case
    )
    label_list = load_labels(args.labels)
    label_map = { l: i for i, l in enumerate(label_list) }

    examples = []
    for x, y in tsv_generator(args.input_file, tokenizer, label_map, args):
        examples.append(Example(x, y))
        if args.max_examples and len(examples) >= args.max_examples:
            break

    write_examples(examples, args.output_file)

    return 0
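Since main() takes argv and returns an exit code, the conventional entry point (an assumption about the rest of the file) would be:

if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))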
Example #12
def preprocess_():
    # Data Preparation
    # ==================================================
    # Load data
    print("Loading data...")
    tf.reset_default_graph()

    if BINARY:
        preprocess.ITALIAN = False
        x_text, y = preprocess.load_data_and_bin_labels(
            "./CrisisLexT26_preprocessed/")
        preprocess.ITALIAN = True
        x_text_italian, y_italian = preprocess.load_data_and_bin_labels(
            "./italian_preprocessed/")
    else:
        preprocess.ITALIAN = False
        x_text, y = preprocess.load_data_and_labels(
            "./CrisisLexT26_preprocessed/")
        preprocess.ITALIAN = True
        x_text_italian, y_italian = preprocess.load_data_and_labels(
            "./italian_preprocessed/")

    x_english_to_italian = preprocess.load_from_file(
        "../Data/EnglishData/CrisisLexT26_english_to_italian_output_related.txt"
    )
    x_italian_to_english = preprocess.load_from_file(
        "../Data/ItalianData/italian_to_english_output_related.txt")

    max_document_length = max([len(x.split(" ")) for x in x_text])
    max_document_length_italian = max(
        [len(x.split(" ")) for x in x_text_italian])
    print("Max Document length:", max_document_length)
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    vocab_processor_italian = learn.preprocessing.VocabularyProcessor(
        max_document_length_italian)

    if PRETRAINEDEMBEDDING and main_pre_trained_embeddings.Embedding in (
            "ELMo", "Bert"):
        x = x_text
        x_italian = x_text_italian
    else:
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        x_italian = np.array(
            list(vocab_processor_italian.fit_transform(x_text_italian)))

    shuffle_indices = np.random.permutation(np.arange(len(y)))
    shuffle_indices_italian = np.random.permutation(np.arange(len(y_italian)))

    if CNN:
        if PRETRAINEDEMBEDDING and main_pre_trained_embeddings.Embedding in (
                "ELMo", "Bert", "GloVe", "fastText"):
            x_shuffled = x
            y_shuffled = y
            x_shuffled_italian = x_italian
            y_shuffled_italian = y_italian
        else:
            x_shuffled = x[shuffle_indices]
            y_shuffled = y[shuffle_indices]
            x_shuffled_italian = x_italian[shuffle_indices_italian]
            y_shuffled_italian = y_italian[shuffle_indices_italian]

    if SVM or NB:
        x_shuffled = x_text
        y_shuffled = y

        x_shuffled_italian = x_text_italian
        y_shuffled_italian = y_italian

    # Split train/test set

    if PRETRAINEDEMBEDDING and main_pre_trained_embeddings.Embedding == "Bert":
        import bert_tokenization
        if ITALIAN:
            tokenizer = bert_tokenization.FullTokenizer(
                vocab_file="../Data/bert_checkpoint_multilingual/vocab.txt",
                do_lower_case=True)
        else:
            tokenizer = bert_tokenization.FullTokenizer(
                vocab_file="../Data/bert_checkpoint/vocab.txt",
                do_lower_case=True)

        tokenized = [tokenizer.tokenize(s) for s in x_shuffled]
        x_t = []
        index = 0
        # Assign a running index to every subtoken of every sentence.
        for sent_tokens in tokenized:
            x_t.append([])
            for _ in sent_tokens:
                x_t[-1].append(index)
                index += 1
        x_shuffled = x_t


    # SPLIT THE DATASET in 1) labeled training 2) unlabeled 3) validation 4) test

    percentage = 0.50
    dev_sample_index = -750
    dev_labeled_index = 3000
    dev_unlabeled_index = 7500

    x_train = x_shuffled[:int(dev_labeled_index * percentage)]
    x_unlabeled = x_shuffled[dev_labeled_index:dev_unlabeled_index +
                             dev_labeled_index]
    x_dev = x_shuffled[dev_sample_index:]
    y_train = y_shuffled[:int(dev_labeled_index * percentage)]
    y_unlabeled = y_shuffled[dev_labeled_index:dev_unlabeled_index +
                             dev_labeled_index]
    y_dev = y_shuffled[dev_sample_index:]

    italian_dev_sample_index = -250
    italian_dev_labeled_index = 1000
    italian_dev_unlabeled_index = 3000

    x_train_italian = x_shuffled_italian[:int(italian_dev_labeled_index *
                                              percentage)]
    x_unlabeled_italian = x_shuffled_italian[
        italian_dev_labeled_index:italian_dev_unlabeled_index +
        italian_dev_labeled_index]
    x_dev_italian = x_shuffled_italian[italian_dev_sample_index:]
    y_train_italian = y_shuffled_italian[:int(italian_dev_labeled_index *
                                              percentage)]
    y_unlabeled_italian = y_shuffled_italian[
        italian_dev_labeled_index:italian_dev_unlabeled_index +
        italian_dev_labeled_index]
    y_dev_italian = y_shuffled_italian[italian_dev_sample_index:]

    x_english_to_italian = x_english_to_italian[
        dev_labeled_index:dev_unlabeled_index + dev_labeled_index]
    x_italian_to_english = x_italian_to_english[
        italian_dev_labeled_index:italian_dev_unlabeled_index +
        italian_dev_labeled_index]
    del x, y, x_shuffled, y_shuffled, x_shuffled_italian, y_shuffled_italian

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Unlabeled/Dev split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_unlabeled), len(y_dev)))
    return x_text, x_train, x_unlabeled, x_dev, vocab_processor, y_train, y_unlabeled, y_dev, x_text_italian, x_train_italian, x_unlabeled_italian, x_dev_italian, vocab_processor_italian, y_train_italian, y_unlabeled_italian, y_dev_italian, x_english_to_italian, x_italian_to_english
Example #13
                i += 1
        word_weights = np.asarray(values[-d:], dtype=np.float32)
        word2idx[word] = index + 1
        weights.append(word_weights)
        if index + 1 == 400:
            break
    embedding_size = len(weights[0])
    weights.insert(0, np.random.randn(embedding_size))
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    weights.append(np.random.randn(embedding_size))
if Embedding == "Bert":

    weights = []
    tokenizer = bert_tokenization.FullTokenizer(
        vocab_file="./bert_checkpoint/vocab.txt", do_lower_case=True)
    tokenized = [tokenizer.tokenize(j) for i, j in enumerate(x_text)]
    for i, j in enumerate(tokenized):
        if len(j) > max_seq_length - 2:
            tokenized[i] = tokenized[i][0:(max_seq_length - 2)]
    bert_config = []
    i = -1
    word2idx = {('PAD', 0): -1}
    j = 0
    with open('CrisisLexT26_english_output2.txt', 'r') as f:
        for line in f:
            record = line.split()
            if record[0] == "[CLS]":
                i += 1
            word2idx[(record[0], i)] = j
            j += 1
            weights.append(record[1:])
Example #14
def get_tokenizer(options):
    tokenizer = tokenization.FullTokenizer(vocab_file=options.vocab_file,
                                           do_lower_case=options.do_lower_case)
    return tokenizer
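A self-contained usage sketch; the SimpleNamespace stands in for parsed command-line options, and vocab.txt is a placeholder path.

# Hypothetical usage with a stand-in for parsed options.
from types import SimpleNamespace
options = SimpleNamespace(vocab_file='vocab.txt', do_lower_case=True)
tokenizer = get_tokenizer(options)
print(tokenizer.tokenize('BERT tokenization'))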
Example #15
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import numpy as np
import pandas as pd
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil

np.set_printoptions(suppress=True)
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
tokenizer = tokenization.FullTokenizer(BERT_PATH + '/assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

df_train = pd.read_csv(PATH + 'train.csv')
df_test = pd.read_csv(PATH + 'test.csv')
df_sub = pd.read_csv(PATH + 'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1, 2, 5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)


def _get_masks(tokens, max_seq_length):
Example #16
import numpy as np

import bert_tokenization
from bert_serving.client import BertClient
from sentence_encoder import SentenceEncoder

BERT_BASE_DIR = 'external/bert/cased_L-24_H-1024_A-16/'
tokenizer = bert_tokenization.FullTokenizer(
    vocab_file=BERT_BASE_DIR + 'vocab.txt', do_lower_case=False)


def bert_embed(bc, sents, merge_subtokens=True, merge_strategy='first'):
    sents_encodings_full = bc.encode(sents)
    sents_tokenized = [tokenizer.tokenize(s) for s in sents]

    sents_encodings = []
    for sent_tokens, sent_vecs in zip(sents_tokenized, sents_encodings_full):
        sent_encodings = []
        sent_vecs = sent_vecs[1:-1]  # ignoring [CLS] and [SEP]
        for token, vec in zip(sent_tokens, sent_vecs):
            layers_vecs = np.split(vec, 4)  # due to -pooling_layer -4 -3 -2 -1
            layers_sum = np.array(layers_vecs, dtype=np.float32).sum(axis=0)
            sent_encodings.append((token, layers_sum))
        sents_encodings.append(sent_encodings)

    if merge_subtokens:
        sents_encodings_merged = []
        for sent, sent_encodings in zip(sents, sents_encodings):

            sent_tokens_vecs = []
Example #17
flags.DEFINE_string(
    "predict_file",
    "/kaggle/input/tensorflow2-question-answering/simplified-nq-test.jsonl",
    "NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean("logtostderr", True, "Logs to stderr")
flags.DEFINE_boolean("undefok", True, "it's okay to be undefined")
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_string('HistoryManager.hist_file', '', 'kernel')

FLAGS = flags.FLAGS
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

tf2baseline.validate_flags_or_throw(bert_config)
tf.io.gfile.makedirs(FLAGS.output_dir)

tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                       do_lower_case=FLAGS.do_lower_case)

tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
Example #18
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

    bert_config = bert_modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)

    tokenizer = bert_tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    examples = read_examples(FLAGS.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=FLAGS.max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(features=features,
                                seq_length=FLAGS.max_seq_length)

    with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
                                                 "w")) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(float(x), 6)
                        for x in layer_output[i:(i + 1)].flat
                    ]
                    all_layers.append(layers)
                features = collections.OrderedDict()
                features["token"] = token
                features["layers"] = all_layers
                all_features.append(features)
            output_json["features"] = all_features
            writer.write(json.dumps(output_json) + "\n")
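Each line written above is one JSON record with linex_index and features keys; a minimal reader (the output file name is a placeholder) could be:

# Hypothetical reader for the JSONL produced above.
import json
with open('extracted_features.jsonl') as f:
    for line in f:
        record = json.loads(line)
        print(record['linex_index'], len(record['features']))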