Example #1
def tokenize_text_pairs_for_bert(text_pairs: List[Tuple[str, str]], bert_tokenizer: FullTokenizer) -> \
        List[Tuple[Sequence[int], int]]:
    res = []
    for left_text, right_text in text_pairs:
        tokenized_left_text = bert_tokenizer.tokenize(left_text)
        if tokenized_left_text[0] != '[CLS]':
            tokenized_left_text = ['[CLS]'] + tokenized_left_text
        if tokenized_left_text[-1] != '[SEP]':
            tokenized_left_text = tokenized_left_text + ['[SEP]']
        tokenized_right_text = bert_tokenizer.tokenize(right_text)
        if tokenized_right_text[0] == '[CLS]':
            tokenized_right_text = tokenized_right_text[1:]
        if tokenized_right_text[-1] == '[SEP]':
            tokenized_right_text = tokenized_right_text[0:-1]
        tokenized_text = tokenized_left_text + tokenized_right_text
        if len(tokenized_text) > MAX_SEQ_LENGTH:
            warnings.warn(
                "The text pair `{0}` - `{1}` contains too many sub-tokens!".
                format(left_text, right_text))
            res.append((array.array("l"), 0))
        else:
            token_IDs = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
            res.append((array.array("l", token_IDs), len(tokenized_left_text)))
        del tokenized_left_text, tokenized_right_text, tokenized_text
    return res
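A minimal usage sketch for the helper above, assuming it lives in a module that also defines MAX_SEQ_LENGTH and imports warnings, array, and the typing names; the vocabulary path and the sample pair are placeholders, not taken from the source:

# Hypothetical driver for tokenize_text_pairs_for_bert; the vocab path is an assumption.
from bert.tokenization.bert_tokenization import FullTokenizer

tokenizer = FullTokenizer(vocab_file="uncased_L-12_H-768_A-12/vocab.txt")
pairs = [("How far is the airport?", "It is about ten kilometres away.")]
for token_ids, left_len in tokenize_text_pairs_for_bert(pairs, tokenizer):
    # left_len is the length of the left segment including its [CLS] and [SEP],
    # i.e. exactly what a segment-id vector would be built from.
    print(len(token_ids), left_len)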
Example #2
class IntentDetection:
    
    def __init__(self):
        self.MAX_SEQ_LEN = 38
        self.modelDir = 'saved_model/1'
        self.vocabDir = 'config/vocab.txt'
        self.classes = ['PlayMusic', 'AddToPlaylist', 'RateBook', 'SearchScreeningEvent', 'BookRestaurant', 'GetWeather', 'SearchCreativeWork']
        self.tokenizer = FullTokenizer(vocab_file=self.vocabDir)
        print("============load model start=============")
        self.model = self.loadModel()
        print("============load model success=============")

    def loadModel(self):
        return tf.keras.models.load_model(self.modelDir) 

    def predict(self, sentence):
        pred_tokens = self.tokenizer.tokenize(sentence)
        pred_tokens = ["[CLS]"] + pred_tokens + ["[SEP]"]
        pred_token_ids = list(self.tokenizer.convert_tokens_to_ids(pred_tokens))
        pred_token_ids = pred_token_ids + [0]*(self.MAX_SEQ_LEN-len(pred_token_ids))
        #pred_token_ids = np.array([pred_token_ids,])
        pred_token_ids = np.array(pred_token_ids)
        pred_token_ids = np.expand_dims(pred_token_ids, axis=0)
        predictions = self.model.predict(pred_token_ids).argmax(axis=-1)
        return self.classes[predictions[0]]
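A short usage sketch for the class above; the sentence is made up, while the model and vocabulary paths are the ones hard-coded in __init__:

# Hypothetical usage of IntentDetection; assumes saved_model/1 and config/vocab.txt exist.
if __name__ == "__main__":
    detector = IntentDetection()
    print(detector.predict("Play some jazz music"))  # expected to print one of the classes, e.g. 'PlayMusic'

Note that predict() pads up to MAX_SEQ_LEN but never truncates, so sentences longer than 38 sub-tokens (including [CLS] and [SEP]) would have to be shortened before calling it.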
Example #3
 def __init__(self):
     self.MAX_SEQ_LEN = 38
     self.modelDir = 'saved_model/1'
     self.vocabDir = 'config/vocab.txt'
     self.classes = ['PlayMusic', 'AddToPlaylist', 'RateBook', 'SearchScreeningEvent', 'BookRestaurant', 'GetWeather', 'SearchCreativeWork']
     self.tokenizer = FullTokenizer(vocab_file=self.vocabDir)
     print("============load model start=============")
     self.model = self.loadModel()
     print("============load model success=============")
Example #4
def construct_flat_datasets(args1, subwords_path):
    global tokenizer_bert, tokenizer_ro, args

    args = args1
    if args.bert:
        tokenizer_bert = FullTokenizer(
            vocab_file=join(args.bert_model_dir, "vocab.vocab"))
        tokenizer_bert.vocab_size = len(tokenizer_bert.vocab)

    samples = get_text_samples(args)

    if os.path.isfile(subwords_path + '.subwords'):
        tokenizer_ro = construct_tokenizer(None, subwords_path, args)
    else:
        tokenizer_ro = construct_tokenizer(list(samples), subwords_path, args)

    sample_train = int(args.total_samples * args.train_dev_split)

    if args.records:

        dataset = tf.data.Dataset.from_generator(
            generator_tensors_ids_and_segs, ((tf.int64, tf.int64), tf.int64),
            ((tf.TensorShape([None]), tf.TensorShape(
                [None])), tf.TensorShape([None])))
        if args.separate:
            train_dataset = dataset
            dev_dataset = tf.data.Dataset.from_generator(
                generator_tensors_ids_and_segs_dev,
                ((tf.int64, tf.int64), tf.int64), ((tf.TensorShape(
                    [None]), tf.TensorShape([None])), tf.TensorShape([None])))
            return train_dataset, dev_dataset
    else:
        gen_dataset = generator_tensors_ids()
        dataset = list(gen_dataset)
        nr_samples = len(dataset)
        sample_train = int(args.train_dev_split * nr_samples)
        # dataset = tf.convert_to_tensor(dataset, dtype=tf.int64)
        dataset = tf.data.Dataset.from_generator(
            generator_tensors_ids, (tf.int64, tf.int64), (tf.TensorShape(
                [2, args.seq_length]), tf.TensorShape([args.seq_length])))
        if args.separate:
            gen_dataset = generator_tensors_ids_dev()
            dev_dataset = list(gen_dataset)
            # dataset = tf.convert_to_tensor(dataset, dtype=tf.int64)
            dev_dataset = tf.data.Dataset.from_generator(
                generator_tensors_ids_dev, (tf.int64, tf.int64),
                (tf.TensorShape([2, args.seq_length
                                 ]), tf.TensorShape([args.seq_length])))
            return dataset, dev_dataset

    train_dataset = dataset.take(sample_train)
    dev_dataset = dataset.skip(sample_train)

    return train_dataset, dev_dataset
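The final train/dev split relies on tf.data's take/skip pair; a self-contained sketch of that pattern on toy data (independent of the generators used above):

# Minimal take/skip split; both pipelines read from the same source dataset.
import tensorflow as tf

dataset = tf.data.Dataset.range(100)
n_train = int(100 * 0.9)
train_dataset = dataset.take(n_train)  # first 90 elements
dev_dataset = dataset.skip(n_train)    # remaining 10 elements
print(len(list(train_dataset)), len(list(dev_dataset)))  # 90 10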
Example #5
 def __init__(self, max_seq_length, trainable, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.bert_layer = hub.KerasLayer(
         "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
         trainable=trainable)
     vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy(
     )
     do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
     self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
     self.max_seq_length = max_seq_length
     """
    def test_direct_keras_to_stock_compare(self):
        from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

        bert_config = BertConfig.from_json_file(self.bert_config_file)
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

        # prepare input
        max_seq_len = 6
        input_str = "Hello, Bert!"
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (max_seq_len -
                                                      len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        s_res = self.predict_on_stock_model(input_ids, input_mask,
                                            token_type_ids)
        k_res = self.predict_on_keras_model(input_ids, input_mask,
                                            token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")
        print("s_res", s_res.shape)
        print("k_res", k_res.shape)

        print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
        print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

        adiff = np.abs(s_res - k_res).flatten()
        print("diff:", np.max(adiff), np.argmax(adiff))
        self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
Example #7
def infer(review):
  bert_abs_path = '/home/shravan/python_programs/bert_leraning/data/'
  bert_model_name = 'multi_cased_L-12_H-768_A-12'

  bert_ckpt_dir = os.path.join(bert_abs_path, bert_model_name)
  bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
  bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")


  tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, 'vocab.txt'))


  model_path = "/home/shravan/tf/tf/bert_tf2/bert_model"
  model = tf.keras.models.load_model(model_path)

  sentences = [review]
  classes   = ['Negative', 'Positive']
  pred_tokens = map(tokenizer.tokenize, sentences)
  pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
  pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

  pred_token_ids = map(lambda tids: tids +[0]*(128-len(tids)),pred_token_ids)
  pred_token_ids = np.array(list(pred_token_ids))

  predictions = model.predict(pred_token_ids).argmax(axis=-1)

  result = dict()
  for text, label in zip(sentences, predictions):
    result['text'] = text
    result['sentiment'] = classes[label]
    print("text:", text, "\nintent:", classes[label])
    print()


  return result
Example #8
def get_tokenizers_ckeckpoint(args1):
    global args
    args = args1
    tokenizer_ro_path = join(args.checkpoint, 'tokenizer_ro')
    tokenizer_ro = tfds.features.text.SubwordTextEncoder.load_from_file(tokenizer_ro_path)
    tf.compat.v1.logging.info('restoring ro tokenizer from {}'.format(tokenizer_ro_path))

    tokenizer_bert = None
    if args.bert:
        tokenizer_bert_path = join(args.checkpoint, 'tokenizer_bert.vocab')
        tokenizer_bert = FullTokenizer(vocab_file=tokenizer_bert_path)
        tokenizer_bert.vocab_size = len(tokenizer_bert.vocab)
        tf.compat.v1.logging.info('restoring bert tokenizer from {}'.format(tokenizer_bert_path))

    tf.compat.v1.logging.info('tokenizers restored')
    return tokenizer_ro, tokenizer_bert 
Example #9
def main(inputDataset, outputDir, modelType):
    """Main method of this module."""
    print(inputDataset, outputDir, 'modelType : ', modelType)
    print('--------------loading model----------------------')
    tokenizer = FullTokenizer(vocab_file=(
        "/home/zenith-kaju/NLP---Hyperpartisan-News-Detection/TensorBert/vocab.txt"
    ))
    model = load_model(
        '/home/zenith-kaju/NLP---Hyperpartisan-News-Detection/TensorBert/models/'
        + modelType,
        custom_objects={'BertModelLayer': BertModelLayer})
    print('------------------Model Loaded---------------------')
    with open(outputDir + "/" + runOutputFileName, 'w') as outFile:
        for file in os.listdir(inputDataset):
            if file.endswith(".xml"):
                tree = ElementTree.parse(inputDataset + "/" + file)
                root = tree.getroot()
                print('Total articles ', len(root))
                for article in tqdm(root.iter('article')):
                    articleID = article.attrib['id']
                    content = element_to_string(article)
                    prediction = predict(content, tokenizer, model)
                    outFile.write(articleID + " " + prediction + "\n")

    print("The predictions have been written to the output folder.")
Example #10
def predict_new(doc, model, max_seq_len=150):
    """
    Predict new document using the trained model. 

    doc: input document in format of a string
    """

    # clean the text
    doc = clean_txt(doc)
    # split the string text into list of subtexts
    doc = get_split(doc)
    # tokenize the subtexts as well as padding
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
    pred_tokens = map(tokenizer.tokenize, doc)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len - len(tids)),
                         pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))

    # create model and load previous weights
    # model = create_model(max_seq_len = data.max_seq_len)
    # model.load_weights()

    # predict the subtexts and average the prediction
    predictions = model.predict(pred_token_ids)
    predictions = predictions[:, 1]
    avg_pred = predictions.mean()
    if avg_pred > 0.5:
        doc_label = 'fake'
    else:
        doc_label = 'Real'

    return doc_label, avg_pred
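clean_txt and get_split are project helpers that are not shown in this snippet; a rough stand-in for get_split, assuming the intention is to cut a long document into overlapping word windows that fit the model's sequence length, could look like the following (window and stride values are guesses):

# Hypothetical stand-in for get_split: overlapping word windows over a long document.
def get_split(text, window=150, stride=100):
    words = text.split()
    chunks = [" ".join(words[start:start + window])
              for start in range(0, max(len(words) - window, 0) + 1, stride)]
    # cover the tail of the document if the strided windows stop short of it
    if len(words) > window and (len(words) - window) % stride != 0:
        chunks.append(" ".join(words[-window:]))
    return chunks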
Example #11
def bert_tokenizer(vocab_path=None):
  """Constructs a BERT tokenizer."""
  # This import is from https://github.com/google-research/bert which is not
  # listed as a dependency in trax.
  # TODO(piotrekp1): using SubwordTextEncoder instead after fixing the
  # differences
  from bert.tokenization.bert_tokenization import FullTokenizer  # pylint: disable=g-import-not-at-top
  if vocab_path is None:
    raise ValueError('vocab_path is required to construct the BERT tokenizer.')
  tokenizer = FullTokenizer(vocab_path, do_lower_case=True)
  return tokenizer
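A usage sketch for the factory above; the vocabulary path is a placeholder, and the sample sentence is reused from the snippets further down:

# Hypothetical call; any BERT WordPiece vocab.txt works here.
tokenizer = bert_tokenizer(vocab_path="uncased_L-12_H-768_A-12/vocab.txt")
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens, ids)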
Example #12
File: bert_config.py  Project: shravanc/tf
class BertConfig(BaseConfig):

    TRAIN_DATA_PATH = '/home/shravan/python_programs/generate_kannada_movie_reviews/modified_data/train'
    TEST_DATA_PATH = '/home/shravan/python_programs/generate_kannada_movie_reviews/modified_data/test'
    CSV_DELIMITER = '#'
    CSV_COLUMNS = ['Reviews', 'Sentiment']

    BERT_BASE_PATH = '/home/shravan/python_programs/bert_leraning/data/'
    BERT_MODEL_NAME = 'multi_cased_L-12_H-768_A-12'

    BERT_CKPT_DIR = os.path.join(BERT_BASE_PATH, BERT_MODEL_NAME)
    BERT_CKPT_FILE = os.path.join(BERT_CKPT_DIR, 'bert_model.ckpt')
    BERT_CONF_FILE = os.path.join(BERT_CKPT_DIR, 'bert_config.json')
    BERT_VOCAB_FILE = os.path.join(BERT_CKPT_DIR, 'vocab.txt')

    DATA_COLUMN = CSV_COLUMNS[0]
    LABEL_COLUMN = CSV_COLUMNS[1]
    CLASSES = [0, 1]

    TOKENIZER = FullTokenizer(vocab_file=BERT_VOCAB_FILE)

    MAX_LEN = 128

    @classmethod
    def classes(cls):
        return cls.CLASSES

    @classmethod
    def max_len(cls):
        return cls.MAX_LEN

    @classmethod
    def csv_columns(cls):
        return cls.CSV_COLUMNS

    @classmethod
    def delimiter(cls):
        return cls.CSV_DELIMITER

    @classmethod
    def train_data_path(cls):
        return cls.TRAIN_DATA_PATH

    @classmethod
    def test_data_path(cls):
        return cls.TEST_DATA_PATH

    @classmethod
    def data_column(cls):
        return cls.DATA_COLUMN

    @classmethod
    def label_column(cls):
        return cls.LABEL_COLUMN
Example #13
    def tokenizer_factory(vocab_file: str) -> FullTokenizer:
        """This method will return a BERT tokenizer initialized
        using the vocabulary file at
        `WoodgateSettings.bert_vocab_path`.

        :return: A BERT tokenizer.
        :rtype: FullTokenizer
        """
        tokenizer: FullTokenizer = FullTokenizer(
            vocab_file=vocab_file
        )
        return tokenizer
Example #14
def initialize_tokenizer(model_dir: str) -> FullTokenizer:
    model_name = os.path.basename(model_dir)
    assert len(model_name) > 0, \
        '`{0}` is not a valid directory name for a BERT model.'.format(model_dir)
    bert_model_ckpt = os.path.join(model_dir, "bert_model.ckpt")
    do_lower_case = not ((model_name.lower().find("cased") == 0) or
                         (model_name.lower().find("_cased") >= 0) or
                         (model_name.lower().find("-cased") >= 0))
    validate_case_matches_checkpoint(do_lower_case, bert_model_ckpt)
    vocab_file = os.path.join(model_dir, "vocab.txt")
    return FullTokenizer(vocab_file, do_lower_case)
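A usage sketch for initialize_tokenizer; the directory names are placeholders, and the lower-casing behaviour follows from the checks above (lower-casing is disabled when the directory name starts with "cased" or contains "_cased" or "-cased"):

# Hypothetical model directories; each must contain vocab.txt and a bert_model.ckpt.
uncased_tok = initialize_tokenizer("/models/uncased_L-12_H-768_A-12")    # do_lower_case=True
cased_tok = initialize_tokenizer("/models/multi_cased_L-12_H-768_A-12")  # do_lower_case=False
print(uncased_tok.tokenize("Bulgaria"))
print(cased_tok.tokenize("Bulgaria"))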
Example #15
 def __init__(self, config, cls_config, vocabs, vocab_file, num_labels=14):
     super(BertForTokenClassification, self).__init__(config)
     self.num_labels = num_labels
     self.tokenizer = FullTokenizer(vocab_file)
     self.max_length = cls_config['max_seq_length']
     self.tag_vocab = {
         i: w
         for i, w in enumerate(vocabs['tag_vocab']['idx_to_token'])
     }
     self.bert = BertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.dense = nn.Linear(config.hidden_size, num_labels)
     self.apply(self.init_bert_weights)
Example #16
def run():

    #load the traing data
    df = pd.read_csv(config.TRAINING_FILE)
    #initial preprocessing
    train, test = DatasetLoader(df).get_input()
    classes = df.cat.unique().tolist()
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(config.BERT_CKPT_DIR, "vocab.txt"))
    #preparing the text for BERT classifier
    data = IntentProcessor(train, test, tokenizer, classes, max_seq_len=128)
    #initiate the BERT model
    model = BERTModel.create_model(config.MAX_LEN, classes,
                                   config.BERT_CKPT_FILE)
    print(model.summary())

    #model training
    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    log_dir = config.OUTPUT_PATH + "/intent_classifier/" + \
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

    history = model.fit(x=data.train_x,
                        y=data.train_y,
                        validation_split=0.1,
                        batch_size=config.BATCH_SIZE,
                        shuffle=True,
                        epochs=config.EPOCHS,
                        callbacks=[tensorboard_callback])

    #saving model checkpoint
    model.save_weights(config.SAVE_WEIGHTS_PATH)

    #plotting accuracy and loss
    plt.title("Accuracy")
    plt.plot(history.history["acc"], label="acc")
    plt.plot(history.history["val_acc"], label="val_acc")
    plt.legend()
    plt.show()

    plt.title("Loss")
    plt.plot(history.history["loss"], label="loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.legend()
    plt.show()
Example #17
    def prepare(self):
        sentence = self.sentence
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(config.BERT_CKPT_DIR, "vocab.txt"))

        pred_tokens = map(tokenizer.tokenize, sentence)
        pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
        pred_token_ids = list(map(tokenizer.convert_tokens_to_ids,
                                  pred_tokens))

        pred_token_ids = map(
            lambda tids: tids + [0] * (config.MAX_LEN - len(tids)),
            pred_token_ids)
        pred_token_ids = np.array(list(pred_token_ids))
        return pred_token_ids
Example #18
def predict_essay_grade(model, data, bert_ckpt_dir, pred_sentences):
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
    pred_tokens = map(tokenizer.tokenize, pred_sentences)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

    pred_token_ids = map(
        lambda tids: tids + [0] * (data.max_seq_len - len(tids)),
        pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))

    print('pred_token_ids', pred_token_ids.shape)

    res = model.predict(pred_token_ids).argmax(axis=-1)
    return res
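A call sketch for predict_essay_grade; `model` and `data` are assumed to come from a training run such as run() in Example #20 below, and the checkpoint path is a placeholder:

# Hypothetical call; `model` and `data` are produced elsewhere (see Example #20 below).
essays = ["The industrial revolution changed how people worked and lived."]
grades = predict_essay_grade(model, data, "/path/to/uncased_L-12_H-768_A-12", essays)
print(grades)  # one predicted grade index per input essay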
Example #19
    def predict_on_stock_model(self, input_ids, input_mask, token_type_ids):
        from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

        tf.compat.v1.reset_default_graph()

        tf_placeholder = tf.compat.v1.placeholder

        max_seq_len = input_ids.shape[-1]
        pl_input_ids = tf.compat.v1.placeholder(tf.int32,
                                                shape=(1, max_seq_len))
        pl_mask = tf.compat.v1.placeholder(tf.int32, shape=(1, max_seq_len))
        pl_token_type_ids = tf.compat.v1.placeholder(tf.int32,
                                                     shape=(1, max_seq_len))

        bert_config = BertConfig.from_json_file(self.bert_config_file)
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

        s_model = BertModel(config=bert_config,
                            is_training=False,
                            input_ids=pl_input_ids,
                            input_mask=pl_mask,
                            token_type_ids=pl_token_type_ids,
                            use_one_hot_embeddings=False)

        tvars = tf.compat.v1.trainable_variables()
        (assignment_map,
         initialized_var_names) = get_assignment_map_from_checkpoint(
             tvars, self.bert_ckpt_file)
        tf.compat.v1.train.init_from_checkpoint(self.bert_ckpt_file,
                                                assignment_map)

        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())

            s_res = sess.run(s_model.get_sequence_output(),
                             feed_dict={
                                 pl_input_ids: input_ids,
                                 pl_token_type_ids: token_type_ids,
                                 pl_mask: input_mask,
                             })
        return s_res
Example #20
def run(data_df, bert_ckpt_dir, bert_config_file, bert_ckpt_file,
        output_model):
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
    data = EssayGraderData(tokenizer,
                           data_df,
                           sample_size=10 * 128 * 2,
                           max_seq_len=512)

    print("            train_x", data.train_x.shape)
    print("train_x_token_types", data.train_x_token_types.shape)
    print("            train_y", data.train_y.shape)
    print("             test_x", data.test_x.shape)
    print("        max_seq_len", data.max_seq_len)

    adapter_size = None  # use None to fine-tune all of BERT
    model = create_model(data.max_seq_len,
                         bert_config_file,
                         bert_ckpt_file,
                         adapter_size=adapter_size)
    model = train_essay_grader(model, data)
    model.save_weights(output_model, overwrite=True)
Example #21
    def predict(self):
        sent = self.sentence
        classes = ['food', 'transport', 'shopping', 'bills']
        init_time = datetime.datetime.now()
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(config.BERT_CKPT_DIR, "vocab.txt"))

        pred_tokens = map(tokenizer.tokenize, sent)
        pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
        pred_token_ids = list(map(tokenizer.convert_tokens_to_ids,
                                  pred_tokens))

        pred_token_ids = map(
            lambda tids: tids + [0] * (config.MAX_LEN - len(tids)),
            pred_token_ids)
        pred_token_ids = np.array(list(pred_token_ids))

        #create the model
        model = BERTModel.create_model(config.MAX_LEN, classes,
                                       config.BERT_CKPT_FILE)
        print(model.summary())

        #model prediction using saved model checkpoints
        model.load_weights(config.SAVE_WEIGHTS_PATH)
        print("model loaded")
        predictions = model.predict(pred_token_ids).argmax(axis=-1)
        print(f"prediction: {predictions}")
        print("result")
        final_time = datetime.datetime.now() - init_time
        results_lst = []
        result = dict()
        for text, label in zip(sent, predictions):
            result['text'] = text
            result['label'] = classes[label]
            result['time_taken'] = final_time.total_seconds()
            results_lst.append(result.copy())

        return results_lst
Example #22
    def __init__(self,
                 vectorizer=None,
                 load_weights=False,
                 use_bert=True,
                 bert_model_name=None):
        self.bert_model_name = bert_model_name

        if use_bert:
            self.vocabulary = set(
                line.strip()
                for line in open("D:/Development/Projects/bert_models/" +
                                 self.bert_model_name + "/vocab.txt",
                                 encoding="utf-8"))
            self.bert_tokenizer = FullTokenizer(vocab_file=os.path.join(
                "D:/Development/Projects/bert_models/" +
                self.bert_model_name, "vocab.txt"))
            self.vectorizer = MultiVectorizer(tokenizer=self.bert_tokenizer,
                                              use_bert=use_bert)
        else:
            self.vectorizer = vectorizer

        self.load_weights = load_weights
        self.max_shape = None

        if self.load_weights:
            self.vectorizer = self.vectorizer.load(
                "data/weights/vectorizer.dat")

        self.METRICS = [
            BinaryAccuracy(name='accuracy'),
            Precision(name='precision'),
            Recall(name='recall'),
            AUC(name='auc')
        ]

        self.checkpoint_path = "data/weights/checkpoints/cp-epoch_{epoch:02d}-accuracy_{accuracy:.3f}_val_precision_{val_precision:.3f}-val_recall_{val_recall:.3f}-val_auc_{val_auc:.3f}.ckpt"
        self.checkpoint_dir = os.path.dirname(self.checkpoint_path)
        print("Checkpoint dir:", self.checkpoint_dir)
Example #23
File: app.py  Project: akanksha-devp/irapi
def predict():
    RANDOM_SEED = 42

    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)
    train = pd.read_csv("HT_train.csv")
    valid = pd.read_csv("HT_valid.csv")
    test = pd.read_csv("HT_test.csv")
    train = train.append(valid).reset_index(drop=True)
    bert_model_name = "intent"

    bert_ckpt_dir = os.path.join("model2/",
                                 bert_model_name)  #checkpoint directory
    bert_ckpt_file = os.path.join(bert_ckpt_dir,
                                  "bert_model.ckpt")  #checkpoint file
    bert_config_file = os.path.join(bert_ckpt_dir,
                                    "bert_config.json")  #configuration file

    class IntentDetectionData:
        DATA_COLUMN = "text"
        LABEL_COLUMN = "intent"

        def __init__(self,
                     train,
                     test,
                     tokenizer: FullTokenizer,
                     classes,
                     max_seq_len=192):
            self.tokenizer = tokenizer
            self.max_seq_len = 0
            self.classes = classes

            train, test = map(
                lambda df: df.reindex(df[IntentDetectionData.DATA_COLUMN].str.
                                      len().sort_values().index),
                [train, test])

            ((self.train_x, self.train_y),
             (self.test_x, self.test_y)) = map(self._prepare, [train, test])

            print("max seq_len", self.max_seq_len)
            self.max_seq_len = min(self.max_seq_len, max_seq_len)
            self.train_x, self.test_x = map(self._pad,
                                            [self.train_x, self.test_x])

        def _prepare(self, df):
            x, y = [], []

            for _, row in tqdm(df.iterrows()):
                text, label = row[IntentDetectionData.DATA_COLUMN], row[
                    IntentDetectionData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(self.classes.index(label))
            return np.array(x), np.array(y)

        def _pad(self, ids):
            x = []
            for input_ids in ids:
                input_ids = input_ids[:min(len(input_ids), self.max_seq_len -
                                           2)]
                input_ids = input_ids + [0] * (self.max_seq_len -
                                               len(input_ids))
                x.append(np.array(input_ids))
            return np.array(x)

    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

    def create_model(max_seq_len, bert_ckpt_file):
        with tf.io.gfile.GFile(bert_config_file, "r") as reader:
            bc = StockBertConfig.from_json_string(reader.read())
            bert_params = map_stock_config_to_params(bc)
            bert_params.adapter_size = None
            bert = BertModelLayer.from_params(bert_params, name="bert")
        input_ids = keras.layers.Input(shape=(max_seq_len, ),
                                       dtype='int32',
                                       name="input_ids")
        bert_output = bert(input_ids)

        print("bert shape", bert_output.shape)

        cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
        cls_out = keras.layers.Dropout(0.5)(cls_out)
        logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
        logits = keras.layers.Dropout(0.5)(logits)
        logits = keras.layers.Dense(units=len(classes),
                                    activation="softmax")(logits)

        model = keras.Model(inputs=input_ids, outputs=logits)
        model.build(input_shape=(None, max_seq_len))

        load_stock_weights(bert, bert_ckpt_file)

        return model

    classes = train.intent.unique().tolist()

    data = IntentDetectionData(train,
                               test,
                               tokenizer,
                               classes,
                               max_seq_len=128)
    model = create_model(data.max_seq_len, bert_ckpt_file)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])
    history = model.fit(
        x=data.train_x,
        y=data.train_y,
        validation_split=0.1,
        batch_size=16,
        shuffle=True,
        epochs=5,
        #   callbacks=[tensorboard_callback]
    )
    model.save("save_model")
    model = keras.models.load_model('save_model')
    # saved_model = pickle.dumps(model)

    # # Load the pickled model
    # knn_from_pickle = pickle.loads(saved_model)

    # Use the loaded pickled model to make predictions
    # knn_from_pickle.predict(X_test)

    # joblib.dump(model, 'intent_model.pkl')
    # # intent_model = open('intent_model.pkl','rb')
    # model20 = joblib.load('intent_model.pkl')
    # with open('model_pickle','wb') as f:
    #     pickle.dump(model,f)
    # with open('model_pickle','rb') as f:
    #     mp=pickle.load(f)

    if request.method == 'POST':
        message = request.form['message']
        inputdata = [message]
        pred_tokens = map(tokenizer.tokenize, inputdata)
        pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
        pred_token_ids = list(map(tokenizer.convert_tokens_to_ids,
                                  pred_tokens))

        pred_token_ids = map(
            lambda tids: tids + [0] * (data.max_seq_len - len(tids)),
            pred_token_ids)
        pred_token_ids = np.array(list(pred_token_ids))

        my_prediction = model.predict(pred_token_ids).argmax(axis=-1)
        if my_prediction == 0:
            x = "Details"
        elif my_prediction == 1:
            x = "Create"
        elif my_prediction == 2:
            x = "Book"
    # return jsonify(inputdata)
    # return jsonify(my_prediction)
    return render_template('result.html', prediction=x)
Example #24
bert_model_dir = "2018_10_18"
bert_model_name = "uncased_L-24_H-1024_A-16"

for fname in [
        "bert_config.json", "vocab.txt", "bert_model.ckpt.meta",
        "bert_model.ckpt.index", "bert_model.ckpt.data-00000-of-00001"
]:
    cmd = f"gsutil cp gs://bert_models/{bert_model_dir}/{bert_model_name}/{fname} .model/{bert_model_name}"
    print(cmd)

bert_ckpt_dir = os.path.join(
    "/Users/donghyoung/PycharmProjects/TensorFlowTest/model/", bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = MovieReviewData(
    tokenizer,
    sample_size=10 * 128 * 2,  #5000,
    max_seq_len=128)

print("            train_x", data.train_x.shape)
print(data.train_x[:2])
print("train_x_token_types", data.train_x_token_types.shape)
print("            train_y", data.train_y.shape)

print("             test_x", data.test_x.shape)

print("        max_seq_len", data.max_seq_len)

Example #25
classes = [
    'Research', 'Coding Guidelines', 'Case Study', 'Financial Reports',
    'CompanyDetails', 'AuditProposals'
]

while True:
    n = input("Enter query")
    if n == "EXIT":
        print("Exiting")
        break

    sentence = [n]

    print(sentence)

    tokenizer = FullTokenizer(
        vocab_file="/content/drive/MyDrive/EY_DATA/vocab.txt")

    pred_tokens = map(tokenizer.tokenize, sentence)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

    max_seq_len = 128

    pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len - len(tids)),
                         pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))

    predictions = new_model.predict(pred_token_ids).argmax(axis=-1)

    intents_we_came_across = []
    for text, label in zip(sentence, predictions):
Example #26
File: main.py  Project: akashdas765/chatbot
      x.append(token_ids)
      y.append(self.classes.index(label))

    return np.array(x), np.array(y)

  def _pad(self, ids):
    x = []
    for input_ids in ids:
      input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)



tokenizer = FullTokenizer(vocab_file=os.path.join(main_path+"uncased_L-12_H-768_A-12/vocab.txt"))


tokenizer.tokenize("I can't wait to visit Bulgaria again!")



classes = train.labels.unique().tolist()

data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128)



responses=pd.read_csv(main_path+'response.csv')

Example #27
class TfHubBert(Model):
    def __init__(self, max_seq_length, trainable, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=trainable)
        vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy(
        )
        do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        self.max_seq_length = max_seq_length
        """
        self.input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                                    name="input_word_ids")
        self.input_mask_ = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                                 name="input_mask")
        self.segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                                 name="segment_ids")
        """

    def prep(self, s, get='id'):
        stokens = self.tokenizer.tokenize(s)
        stokens = ["[CLS]"] + stokens + ["[SEP]"]
        if get == 'id':
            input_ids = get_ids(stokens, self.tokenizer, self.max_seq_length)
            return input_ids
        elif get == 'mask':
            input_masks = get_masks(stokens, self.max_seq_length)
            return input_masks
        else:
            input_segments = get_segments(stokens, self.max_seq_length)
            return input_segments

    def call(self, inputs, training=None, mask=None):
        pooled_output, _ = self.bert_layer([inputs[0], inputs[1], inputs[2]],
                                           training=training)
        return pooled_output

    def text_to_bert_input(self, text):
        stokens1 = self.tokenizer.tokenize(text)

        tokens = list(chunks(stokens1, utils.BERT_SEQ_LENGTH))

        input_ids1 = []
        input_masks1 = []
        input_segments1 = []
        for stok in tokens:
            stokens1 = ["[CLS]"] + stok + ["[SEP]"]
            input_ids1.append(
                get_ids(stokens1, self.tokenizer, self.max_seq_length))
            input_masks1.append(get_masks(stokens1, self.max_seq_length))
            input_segments1.append(get_segments(stokens1, self.max_seq_length))
        return input_ids1, input_masks1, input_segments1

    def dataframe_to_bert_input(self, df):
        input_word_ids = []
        input_mask = []
        segment_ids = []
        ys = []
        for i, row in df.iterrows():
            a, b, c = self.text_to_bert_input(row["opinion"])
            ys = ys + [row["outcome"]] * len(a)
            """
            input_word_ids.append(a)
            input_mask.append(b)
            segment_ids.append(c)
            """
            input_word_ids = input_word_ids + a
            input_mask = input_mask + b
            segment_ids = segment_ids + c

        input_word_ids = np.array(input_word_ids)
        input_mask = np.array(input_mask)
        segment_ids = np.array(segment_ids)
        return [input_word_ids, input_mask, segment_ids], np.array(ys)

    def get_predictor(self):
        def bert_predict(text):
            if isinstance(text, list):
                res = []
                for t in text:
                    a, b, c = self.text_to_bert_input(t)
                    x = [np.array([a]), np.array([b]), np.array([c])]
                    res.append(self.predict(x)[0])
                return np.array(res)
            else:
                a, b, c = self.text_to_bert_input(text)
                x = [np.array([a]), np.array([b]), np.array([c])]
                return self.predict(x)

        return bert_predict

    def compile(self,
                optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5,
                                                   epsilon=1e-08,
                                                   clipnorm=1.0),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                metrics=None,
                loss_weights=None,
                sample_weight_mode=None,
                weighted_metrics=None,
                **kwargs):
        super().compile(
            optimizer=optimizer,
            loss=loss,
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
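get_ids, get_masks, and get_segments are not defined in this snippet; sketches of the helpers the calls above appear to expect, assuming single-segment inputs padded with zeros to a fixed length:

# Hypothetical helpers matching the calls in TfHubBert (single-segment inputs only).
def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids, zero-padded to max_seq_length."""
    ids = tokenizer.convert_tokens_to_ids(tokens)
    return ids + [0] * (max_seq_length - len(ids))

def get_masks(tokens, max_seq_length):
    """1 for real tokens, 0 for padding positions."""
    return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
    """All-zero segment ids for a single-segment input."""
    return [0] * max_seq_length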
Example #28
  # For training, we want a lot of parallel reading and shuffling.
  # For eval, we want no shuffling and parallel reading doesn't matter.
  dataset = tf.data.TFRecordDataset(input_file)
  if is_training:
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size=100)

  dataset = dataset.apply(
      tf.data.experimental.map_and_batch(
          lambda record: _decode_record(record, name_to_features),
          batch_size=BATCH_SIZE,
          drop_remainder=drop_remainder))

  return dataset

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

tokenizer.tokenize("I can't wait to visit Bulgaria again!")

tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokenizer.convert_tokens_to_ids(tokens)


def flatten_layers(root_layer):
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer
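flatten_layers recursively walks the private _layers attribute of a Keras layer; a common use of such a walker, sketched here on an assumed `model` object, is to freeze most of the network before fine-tuning:

# Hypothetical usage: freeze everything except the last few nested layers of `model`.
all_layers = list(flatten_layers(model))
for layer in all_layers[:-4]:
    layer.trainable = False
print([(layer.name, layer.trainable) for layer in all_layers[-4:]])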

Example #29
jwt = JWTManager(app)

punctuate_model_name = 'PT_Punctuator.pcl'
punctuate_model_directory = './punctuate_model/'
punctuate_model_path = punctuate_model_directory + punctuate_model_name
app.config['punctuate_model_path'] = punctuate_model_path
punctuator_model = Punctuator(app.config['punctuate_model_path'])

classifier_model_name = 'saved_model/my_model'
classifier_model_directory = './classifier_model/'
classifier_model_path = classifier_model_directory + classifier_model_name
app.config['classifier_model_path'] = classifier_model_path
classifier_model = tf.keras.models.load_model(
    app.config['classifier_model_path'])
vocab_path = classifier_model_directory + 'vocab.txt'
tokenizer = FullTokenizer(vocab_file=vocab_path)


def punctuateTextFile(file_name):
    with open(file_name, "r") as file:
        text_to_punctuate = file.read()
        text_to_punctuate = text_to_punctuate.lower()
        text_to_punctuate = text_to_punctuate.translate(
            str.maketrans('', '', string.punctuation))
        punctuated_text = punctuator_model.punctuate(text_to_punctuate)
        tokenize_sentences(punctuated_text)


@app.route('/process', methods=['POST'])
def transcribe():
    data = request.get_json()
Example #30
https://www.tensorflow.org/guide/keras/save_and_serialize
https://machinelearningspace.com/sentiment-analysis-tensorflow/
https://keras.io/api/models/model/
"""

from tensorflow.keras.models import load_model
from bert import BertModelLayer
import pandas as pd
from bert.tokenization.bert_tokenization import FullTokenizer
import os
import numpy as np
from tqdm import tqdm

bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join("model/", bert_model_name)
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
print('Loading the model')
model = load_model('saved_models/tensorflow_10000.h5', custom_objects={'BertModelLayer': BertModelLayer})
print('Model is loaded')
print(model.summary())
test = pd.read_csv('articles-validation-bypublisher.csv')
conlist = test['content'].tolist()
idlist = test['id'].tolist()
print('conlist', len(conlist), 'idlist', len(idlist))

resList = []

for id, content in tqdm(zip(idlist, conlist)):
    pred_tokens = tokenizer.tokenize(content)
    pred_tokens = ["[CLS]"] + pred_tokens + ["[SEP]"]
    pred_token_ids = list(tokenizer.convert_tokens_to_ids(pred_tokens))