def main(inputDataset, outputDir, modelType):
    """Main method of this module."""
    print(inputDataset, outputDir, 'modelType : ', modelType)
    print('--------------loading model----------------------')
    tokenizer = FullTokenizer(vocab_file=(
        "/home/zenith-kaju/NLP---Hyperpartisan-News-Detection/TensorBert/vocab.txt"
    ))
    model = load_model(
        '/home/zenith-kaju/NLP---Hyperpartisan-News-Detection/TensorBert/models/'
        + modelType,
        custom_objects={'BertModelLayer': BertModelLayer})
    print('------------------Model Loaded---------------------')
    with open(outputDir + "/" + runOutputFileName, 'w') as outFile:
        for file in os.listdir(inputDataset):
            if file.endswith(".xml"):
                tree = ElementTree.parse(inputDataset + "/" + file)
                root = tree.getroot()
                print('Total articles ', len(root))
                for article in tqdm(root.iter('article')):
                    articleID = article.attrib['id']
                    content = element_to_string(article)
                    prediction = predict(content, tokenizer, model)
                    outFile.write(articleID + " " + prediction + "\n")
    print("The predictions have been written to the output folder.")
def infer(review):
    bert_abs_path = '/home/shravan/python_programs/bert_leraning/data/'
    bert_model_name = 'multi_cased_L-12_H-768_A-12'
    bert_ckpt_dir = os.path.join(bert_abs_path, bert_model_name)
    bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
    bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
    tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, 'vocab.txt'))
    model_path = "/home/shravan/tf/tf/bert_tf2/bert_model"
    model = tf.keras.models.load_model(model_path)
    sentences = [review]
    classes = ['Negative', 'Positive']
    pred_tokens = map(tokenizer.tokenize, sentences)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    pred_token_ids = map(lambda tids: tids + [0] * (128 - len(tids)), pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    predictions = model.predict(pred_token_ids).argmax(axis=-1)
    result = dict()
    for text, label in zip(sentences, predictions):
        result['text'] = text
        result['sentiment'] = classes[label]
        print("text:", text, "\nintent:", classes[label])
        print()
    return result
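# Illustrative usage of infer() above (not part of the original snippet): the
# review text is made up, and the model/vocab paths hard-coded inside infer()
# are assumed to exist on disk.
if __name__ == "__main__":
    sample_review = "The film was slow at first but ended up being wonderful."
    prediction = infer(sample_review)
    print(prediction)  # e.g. {'text': ..., 'sentiment': 'Positive'}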
def predict_new(doc, model, max_seq_len=150):
    """
    Predict new document using the trained model.
    doc: input document in format of a string
    """
    # clean the text
    doc = clean_txt(doc)
    # split the string text into list of subtexts
    doc = get_split(doc)
    # tokenize the subtexts as well as padding
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
    pred_tokens = map(tokenizer.tokenize, doc)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len - len(tids)),
                         pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    # create model and load previous weights
    # model = create_model(max_seq_len = data.max_seq_len)
    # model.load_weights()
    # predict the subtexts and average the prediction
    predictions = model.predict(pred_token_ids)
    predictions = predictions[:, 1]
    avg_pred = predictions.mean()
    if avg_pred > 0.5:
        doc_label = 'fake'
    else:
        doc_label = 'Real'
    return doc_label, avg_pred
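# Hedged usage sketch for predict_new() above: `fine_tuned_model` is a
# hypothetical Keras model with the matching input shape, and clean_txt,
# get_split, and bert_ckpt_dir are assumed to be defined elsewhere in the
# original project.
sample_doc = "Breaking: scientists reveal surprising result in new study."
label, confidence = predict_new(sample_doc, fine_tuned_model, max_seq_len=150)
print(label, round(float(confidence), 3))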
def __init__(self):
    self.MAX_SEQ_LEN = 38
    self.modelDir = 'saved_model/1'
    self.vocabDir = 'config/vocab.txt'
    self.classes = ['PlayMusic', 'AddToPlaylist', 'RateBook',
                    'SearchScreeningEvent', 'BookRestaurant', 'GetWeather',
                    'SearchCreativeWork']
    self.tokenizer = FullTokenizer(vocab_file=self.vocabDir)
    print("============load model start=============")
    self.model = self.loadModel()
    print("============load model success=============")
def __init__(self, max_seq_length, trainable, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=trainable)
    vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
    self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
    self.max_seq_length = max_seq_length
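# Illustrative companion method (an assumption, not part of the original class):
# shows how the tokenizer built in __init__ is typically used to turn raw
# sentences into padded input ids, mirroring the other snippets in this
# collection. Assumes numpy is imported as np.
def encode_sentences(self, sentences):
    token_lists = [["[CLS]"] + self.tokenizer.tokenize(s) + ["[SEP]"]
                   for s in sentences]
    id_lists = [self.tokenizer.convert_tokens_to_ids(t) for t in token_lists]
    # pad (or truncate) every sequence to max_seq_length with zeros
    return np.array([(ids + [0] * self.max_seq_length)[:self.max_seq_length]
                     for ids in id_lists])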
def bert_tokenizer(vocab_path=None):
    """Constructs a BERT tokenizer."""
    # This import is from https://github.com/google-research/bert which is not
    # listed as a dependency in trax.
    # TODO(piotrekp1): using SubwordTextEncoder instead after fixing the
    # differences
    from bert.tokenization.bert_tokenization import FullTokenizer  # pylint: disable=g-import-not-at-top
    if vocab_path is None:
        raise ValueError('vocab_path is required to construct the BERT tokenizer.')
    tokenizer = FullTokenizer(vocab_path, do_lower_case=True)
    return tokenizer
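# Illustrative usage of bert_tokenizer() above; the vocab path is a placeholder
# and the sample sentence is made up.
tok = bert_tokenizer(vocab_path='/path/to/uncased_L-12_H-768_A-12/vocab.txt')
pieces = tok.tokenize("BERT tokenization splits words into wordpieces.")
ids = tok.convert_tokens_to_ids(pieces)
print(pieces, ids)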
class BertConfig(BaseConfig):
    TRAIN_DATA_PATH = '/home/shravan/python_programs/generate_kannada_movie_reviews/modified_data/train'
    TEST_DATA_PATH = '/home/shravan/python_programs/generate_kannada_movie_reviews/modified_data/test'
    CSV_DELIMITER = '#'
    CSV_COLUMNS = ['Reviews', 'Sentiment']
    BERT_BASE_PATH = '/home/shravan/python_programs/bert_leraning/data/'
    BERT_MODEL_NAME = 'multi_cased_L-12_H-768_A-12'
    BERT_CKPT_DIR = os.path.join(BERT_BASE_PATH, BERT_MODEL_NAME)
    BERT_CKPT_FILE = os.path.join(BERT_CKPT_DIR, 'bert_model.ckpt')
    BERT_CONF_FILE = os.path.join(BERT_CKPT_DIR, 'bert_config.json')
    BERT_VOCAB_FILE = os.path.join(BERT_CKPT_DIR, 'vocab.txt')
    DATA_COLUMN = CSV_COLUMNS[0]
    LABEL_COLUMN = CSV_COLUMNS[1]
    CLASSES = [0, 1]
    TOKENIZER = FullTokenizer(vocab_file=BERT_VOCAB_FILE)
    MAX_LEN = 128

    @classmethod
    def classes(cls):
        return cls.CLASSES

    @classmethod
    def max_len(cls):
        return cls.MAX_LEN

    @classmethod
    def csv_columns(cls):
        return cls.CSV_COLUMNS

    @classmethod
    def delimiter(cls):
        return cls.CSV_DELIMITER

    @classmethod
    def train_data_path(cls):
        return cls.TRAIN_DATA_PATH

    @classmethod
    def test_data_path(cls):
        return cls.TEST_DATA_PATH

    @classmethod
    def data_column(cls):
        return cls.DATA_COLUMN

    @classmethod
    def label_column(cls):
        return cls.LABEL_COLUMN
def construct_flat_datasets(args1, subwords_path):
    global tokenizer_bert, tokenizer_ro, args
    args = args1
    if args.bert:
        tokenizer_bert = FullTokenizer(
            vocab_file=join(args.bert_model_dir, "vocab.vocab"))
        tokenizer_bert.vocab_size = len(tokenizer_bert.vocab)

    samples = get_text_samples(args)
    if os.path.isfile(subwords_path + '.subwords'):
        tokenizer_ro = construct_tokenizer(None, subwords_path, args)
    else:
        tokenizer_ro = construct_tokenizer(list(samples), subwords_path, args)

    sample_train = int(args.total_samples * args.train_dev_split)
    if args.records:
        dataset = tf.data.Dataset.from_generator(
            generator_tensors_ids_and_segs,
            ((tf.int64, tf.int64), tf.int64),
            ((tf.TensorShape([None]), tf.TensorShape([None])),
             tf.TensorShape([None])))
        if args.separate:
            train_dataset = dataset
            dev_dataset = tf.data.Dataset.from_generator(
                generator_tensors_ids_and_segs_dev,
                ((tf.int64, tf.int64), tf.int64),
                ((tf.TensorShape([None]), tf.TensorShape([None])),
                 tf.TensorShape([None])))
            return train_dataset, dev_dataset
    else:
        gen_dataset = generator_tensors_ids()
        dataset = list(gen_dataset)
        nr_samples = len(dataset)
        sample_train = int(args.train_dev_split * nr_samples)
        # dataset = tf.convert_to_tensor(dataset, dtype=tf.int64)
        dataset = tf.data.Dataset.from_generator(
            generator_tensors_ids,
            (tf.int64, tf.int64),
            (tf.TensorShape([2, args.seq_length]),
             tf.TensorShape([args.seq_length])))
        if args.separate:
            gen_dataset = generator_tensors_ids_dev()
            dev_dataset = list(gen_dataset)
            # dataset = tf.convert_to_tensor(dataset, dtype=tf.int64)
            dev_dataset = tf.data.Dataset.from_generator(
                generator_tensors_ids_dev,
                (tf.int64, tf.int64),
                (tf.TensorShape([2, args.seq_length]),
                 tf.TensorShape([args.seq_length])))
            return dataset, dev_dataset

    train_dataset = dataset.take(sample_train)
    dev_dataset = dataset.skip(sample_train)
    return train_dataset, dev_dataset
def tokenizer_factory(vocab_file: str) -> FullTokenizer:
    """Return a BERT tokenizer initialized from the vocabulary file given by
    `vocab_file` (typically `WoodgateSettings.bert_vocab_path`).

    :param vocab_file: Path to the BERT vocabulary file.
    :type vocab_file: str
    :return: A BERT tokenizer.
    :rtype: FullTokenizer
    """
    tokenizer: FullTokenizer = FullTokenizer(
        vocab_file=vocab_file
    )
    return tokenizer
def initialize_tokenizer(model_dir: str) -> FullTokenizer:
    model_name = os.path.basename(model_dir)
    assert len(model_name) > 0, \
        '`{0}` is wrong directory name for a BERT model.'.format(model_dir)
    bert_model_ckpt = os.path.join(model_dir, "bert_model.ckpt")
    do_lower_case = not ((model_name.lower().find("cased") == 0) or
                         (model_name.lower().find("_cased") >= 0) or
                         (model_name.lower().find("-cased") >= 0))
    validate_case_matches_checkpoint(do_lower_case, bert_model_ckpt)
    vocab_file = os.path.join(model_dir, "vocab.txt")
    return FullTokenizer(vocab_file, do_lower_case)
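# Hedged usage sketch for initialize_tokenizer() above; the model directories
# are hypothetical. The directory name drives the cased/uncased decision: an
# "uncased_*" folder yields do_lower_case=True, while "cased_*" or
# "multi_cased_*" folders yield do_lower_case=False.
uncased_tok = initialize_tokenizer("/models/uncased_L-12_H-768_A-12")
cased_tok = initialize_tokenizer("/models/cased_L-12_H-768_A-12")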
def __init__(self, config, cls_config, vocabs, vocab_file, num_labels=14):
    super(BertForTokenClassification, self).__init__(config)
    self.num_labels = num_labels
    self.tokenizer = FullTokenizer(vocab_file)
    self.max_length = cls_config['max_seq_length']
    self.tag_vocab = {
        i: w for i, w in enumerate(vocabs['tag_vocab']['idx_to_token'])
    }
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.dense = nn.Linear(config.hidden_size, num_labels)
    self.apply(self.init_bert_weights)
def run():
    # load the training data
    df = pd.read_csv(config.TRAINING_FILE)
    # initial preprocessing
    train, test = DatasetLoader(df).get_input()
    classes = df.cat.unique().tolist()
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(config.BERT_CKPT_DIR, "vocab.txt"))
    # preparing the text for the BERT classifier
    data = IntentProcessor(train, test, tokenizer, classes, max_seq_len=128)
    # initiate the BERT model
    model = BERTModel.create_model(config.MAX_LEN, classes, config.BERT_CKPT_FILE)
    print(model.summary())
    # model training
    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])
    log_dir = config.OUTPUT_PATH + "/intent_classifier/" + \
        datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
    history = model.fit(x=data.train_x,
                        y=data.train_y,
                        validation_split=0.1,
                        batch_size=config.BATCH_SIZE,
                        shuffle=True,
                        epochs=config.EPOCHS,
                        callbacks=[tensorboard_callback])
    # saving model checkpoint
    model.save_weights(config.SAVE_WEIGHTS_PATH)
    # plotting accuracy and loss
    plt.title("Accuracy")
    plt.plot(history.history["acc"], label="acc")
    plt.plot(history.history["val_acc"], label="val_acc")
    plt.legend()
    plt.show()
    plt.title("Loss")
    plt.plot(history.history["loss"], label="loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.legend()
    plt.show()
def prepare(self):
    sentence = self.sentence
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(config.BERT_CKPT_DIR, "vocab.txt"))
    pred_tokens = map(tokenizer.tokenize, sentence)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    pred_token_ids = map(
        lambda tids: tids + [0] * (config.MAX_LEN - len(tids)),
        pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    return pred_token_ids
def test_direct_keras_to_stock_compare(self):
    from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

    bert_config = BertConfig.from_json_file(self.bert_config_file)
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

    # prepare input
    max_seq_len = 6
    input_str = "Hello, Bert!"
    input_tokens = tokenizer.tokenize(input_str)
    input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
    input_mask = [1] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))
    token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))

    input_ids = np.array([input_ids], dtype=np.int32)
    input_mask = np.array([input_mask], dtype=np.int32)
    token_type_ids = np.array([token_type_ids], dtype=np.int32)

    print(" tokens:", input_tokens)
    print("input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len, input_ids),
          input_ids.shape, token_type_ids)

    s_res = self.predict_on_stock_model(input_ids, input_mask, token_type_ids)
    k_res = self.predict_on_keras_model(input_ids, input_mask, token_type_ids)

    np.set_printoptions(precision=9, threshold=20, linewidth=200,
                        sign="+", floatmode="fixed")
    print("s_res", s_res.shape)
    print("k_res", k_res.shape)
    print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
    print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

    adiff = np.abs(s_res - k_res).flatten()
    print("diff:", np.max(adiff), np.argmax(adiff))
    self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
def predict_essay_grade(model, data, bert_ckpt_dir, pred_sentences):
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
    pred_tokens = map(tokenizer.tokenize, pred_sentences)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    pred_token_ids = map(
        lambda tids: tids + [0] * (data.max_seq_len - len(tids)),
        pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    print('pred_token_ids', pred_token_ids.shape)
    res = model.predict(pred_token_ids).argmax(axis=-1)
    return res
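# Hedged usage sketch for predict_essay_grade() above: `grader_model` and
# `data` (which supplies max_seq_len) are hypothetical objects produced
# elsewhere by the original training code, and the essay text and checkpoint
# directory are made up.
essays = ["The industrial revolution changed how people worked and lived."]
grades = predict_essay_grade(grader_model, data,
                             "/models/uncased_L-12_H-768_A-12", essays)
print(grades)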
def get_tokenizers_ckeckpoint(args1):
    global args
    args = args1

    tokenizer_ro_path = join(args.checkpoint, 'tokenizer_ro')
    tokenizer_ro = tfds.features.text.SubwordTextEncoder.load_from_file(tokenizer_ro_path)
    tf.compat.v1.logging.info('restoring ro tokenizer from {}'.format(tokenizer_ro_path))

    tokenizer_bert = None
    if args.bert:
        tokenizer_bert_path = join(args.checkpoint, 'tokenizer_bert.vocab')
        tokenizer_bert = FullTokenizer(vocab_file=tokenizer_bert_path)
        tokenizer_bert.vocab_size = len(tokenizer_bert.vocab)
        tf.compat.v1.logging.info('restoring bert tokenizer from {}'.format(tokenizer_bert_path))

    tf.compat.v1.logging.info('tokenizers restored')
    return tokenizer_ro, tokenizer_bert
def predict_on_stock_model(self, input_ids, input_mask, token_type_ids):
    from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

    tf.compat.v1.reset_default_graph()
    tf_placeholder = tf.compat.v1.placeholder

    max_seq_len = input_ids.shape[-1]
    pl_input_ids = tf.compat.v1.placeholder(tf.int32, shape=(1, max_seq_len))
    pl_mask = tf.compat.v1.placeholder(tf.int32, shape=(1, max_seq_len))
    pl_token_type_ids = tf.compat.v1.placeholder(tf.int32, shape=(1, max_seq_len))

    bert_config = BertConfig.from_json_file(self.bert_config_file)
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

    s_model = BertModel(config=bert_config,
                        is_training=False,
                        input_ids=pl_input_ids,
                        input_mask=pl_mask,
                        token_type_ids=pl_token_type_ids,
                        use_one_hot_embeddings=False)

    tvars = tf.compat.v1.trainable_variables()
    (assignment_map, initialized_var_names) = get_assignment_map_from_checkpoint(
        tvars, self.bert_ckpt_file)
    tf.compat.v1.train.init_from_checkpoint(self.bert_ckpt_file, assignment_map)

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        s_res = sess.run(s_model.get_sequence_output(),
                         feed_dict={
                             pl_input_ids: input_ids,
                             pl_token_type_ids: token_type_ids,
                             pl_mask: input_mask,
                         })
    return s_res
def run(data_df, bert_ckpt_dir, bert_config_file, bert_ckpt_file, output_model):
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
    data = EssayGraderData(tokenizer,
                           data_df,
                           sample_size=10 * 128 * 2,
                           max_seq_len=512)

    print(" train_x", data.train_x.shape)
    print("train_x_token_types", data.train_x_token_types.shape)
    print(" train_y", data.train_y.shape)
    print(" test_x", data.test_x.shape)
    print(" max_seq_len", data.max_seq_len)

    adapter_size = None  # use None to fine-tune all of BERT
    model = create_model(data.max_seq_len, bert_config_file, bert_ckpt_file,
                         adapter_size=adapter_size)
    model = train_essay_grader(model, data)
    model.save_weights(output_model, overwrite=True)
def predict(self):
    sent = self.sentence
    classes = ['food', 'transport', 'shopping', 'bills']
    init_time = datetime.datetime.now()
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(config.BERT_CKPT_DIR, "vocab.txt"))
    pred_tokens = map(tokenizer.tokenize, sent)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    pred_token_ids = map(
        lambda tids: tids + [0] * (config.MAX_LEN - len(tids)),
        pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    # create the model
    model = BERTModel.create_model(config.MAX_LEN, classes, config.BERT_CKPT_FILE)
    print(model.summary())
    # model prediction using saved model checkpoints
    model.load_weights(config.SAVE_WEIGHTS_PATH)
    print("model loaded")
    predictions = model.predict(pred_token_ids).argmax(axis=-1)
    print(f"prediction: {predictions}")
    print("result")
    final_time = datetime.datetime.now() - init_time
    results_lst = []
    result = dict()
    for text, label in zip(sent, predictions):
        result['text'] = text
        result['label'] = classes[label]
        result['time_taken'] = final_time.total_seconds()
        results_lst.append(result.copy())
    return results_lst
def __init__(self, vectorizer=None, load_weights=False, use_bert=True,
             bert_model_name=None):
    self.bert_model_name = bert_model_name
    if use_bert:
        self.vocabulary = set(
            line.strip()
            for line in open("D:/Development/Projects/bert_models/" +
                             self.bert_model_name + "/vocab.txt",
                             encoding="utf-8"))
        self.bert_tokenizer = FullTokenizer(vocab_file=os.path.join(
            "D:/Development/Projects/bert_models/" + self.bert_model_name,
            "vocab.txt"))
        self.vectorizer = MultiVectorizer(tokenizer=self.bert_tokenizer,
                                          use_bert=use_bert)
    else:
        self.vectorizer = vectorizer
    self.load_weights = load_weights
    self.max_shape = None
    if self.load_weights:
        self.vectorizer = self.vectorizer.load("data/weights/vectorizer.dat")
    self.METRICS = [
        BinaryAccuracy(name='accuracy'),
        Precision(name='precision'),
        Recall(name='recall'),
        AUC(name='auc')
    ]
    self.checkpoint_path = "data/weights/checkpoints/cp-epoch_{epoch:02d}-accuracy_{accuracy:.3f}_val_precision_{val_precision:.3f}-val_recall_{val_recall:.3f}-val_auc_{val_auc:.3f}.ckpt"
    self.checkpoint_dir = os.path.dirname(self.checkpoint_path)
    print("Checkpoint dir:", self.checkpoint_dir)
    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(bert)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    model.summary()
    return model


if __name__ == "__main__":
    tokenizer = FullTokenizer(vocab_file=CHECKPOINT_VOCAB, do_lower_case=True)
    data = MovieReviewData(tokenizer,
                           sample_size=10 * 128 * 19531,
                           max_seq_len=128)

    print(" train_x", data.train_x.shape)
    print("train_x_token_types", data.train_x_token_types.shape)
    print(" train_y", data.train_y.shape)
    print(" test_x", data.test_x.shape)
    print(" max_seq_len", data.max_seq_len)

    adapter_size = None  # use None to fine-tune all of BERT or 32, 64
    model = create_model(data.max_seq_len, adapter_size=adapter_size)
bert_model_dir = "2018_10_18"
bert_model_name = "uncased_L-24_H-1024_A-16"

for fname in [
        "bert_config.json", "vocab.txt", "bert_model.ckpt.meta",
        "bert_model.ckpt.index", "bert_model.ckpt.data-00000-of-00001"
]:
    cmd = f"gsutil cp gs://bert_models/{bert_model_dir}/{bert_model_name}/{fname} .model/{bert_model_name}"
    print(cmd)

bert_ckpt_dir = os.path.join(
    "/Users/donghyoung/PycharmProjects/TensorFlowTest/model/", bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

data = MovieReviewData(
    tokenizer,
    sample_size=10 * 128 * 2,  # 5000
    max_seq_len=128)

print(" train_x", data.train_x.shape)
print(data.train_x[:2])
print("train_x_token_types", data.train_x_token_types.shape)
print(" train_y", data.train_y.shape)
print(" test_x", data.test_x.shape)
print(" max_seq_len", data.max_seq_len)
CHECKPOINT = './bert/checkpoints/bert_base'
CHECKPOINT_VOCAB = os.path.join(CHECKPOINT, 'vocab.txt')

model = create_model(23, adapter_size=None)
model.load_weights("twitter_initial.h5")

pred_sentences = [
    "<user> yay ! ! #lifecompleted . tweet / facebook me to let me",
    "workin hard or hardly workin rt <user> at hardee's with my future",
    "<user> check my tweet pic out . that was the outfit before . this is",
    "wish i could be out all night tonight ! <user>",
    "<user> i got kicked out the wgm",
    "rt <user> <user> <user> yes she is ! u tell it ! my lips are"
]

tokenizer = FullTokenizer(vocab_file=CHECKPOINT_VOCAB)
pred_tokens = map(tokenizer.tokenize, pred_sentences)
pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
pred_token_ids = map(lambda tids: tids + [0] * (23 - len(tids)), pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))

print('pred_token_ids', pred_token_ids.shape)

res = model.predict(pred_token_ids).argmax(axis=-1)

for text, sentiment in zip(pred_sentences, res):
    print(" text:", text)
    print(" res:", ["negative", "positive"][sentiment])
            x.append(token_ids)
            y.append(self.classes.index(label))
        return np.array(x), np.array(y)

    def _pad(self, ids):
        x = []
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
        return np.array(x)


tokenizer = FullTokenizer(
    vocab_file=os.path.join(main_path + "uncased_L-12_H-768_A-12/vocab.txt"))
tokenizer.tokenize("I can't wait to visit Bulgaria again!")

classes = train.labels.unique().tolist()
data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128)

responses = pd.read_csv(main_path + 'response.csv')
parser.add_argument('--val_split_ratio', help='Validation size ratio',
                    default=0.2, type=float)
parser.add_argument('--bert_folder_path', help='Folder path of model BERT',
                    default=os.path.join('/home/vahidsanei_google_com/',
                                         'data', 'uncased_L-12_H-768_A-12'))
parser.add_argument('--bert_embedding_size', help='BERT output embedding size',
                    default=768, type=int)
parser.add_argument('--keep_prob', help='Kept rate of dropout layers',
                    default=0.6, type=float)
parser.add_argument('--max_content_length',
                    help='Maximum content length from each leaf of DOM',
                    default=256, type=int)
parser.add_argument('--n_hidden_layers', help='Number of hidden layers',
                    default=2, type=int)
parser.add_argument('--url', help='URL link of business website', default=None)
parser.add_argument('--best_weight_path', help='Path of the best model weights',
                    default=None)
parser.add_argument('--chrome_path',
                    help='The path to chrome engine for Python package selenium',
                    default=CHROME_PATH)

st_args = utils.style()
print(f'{st_args.BOLD}{st_args.RED}', end='')
args = parser.parse_args()
print(st_args.RESET, end='')

tokenizer = FullTokenizer(vocab_file=os.path.join(args.bert_folder_path, 'vocab.txt'))
bert_ckpt_file = os.path.join(args.bert_folder_path, 'bert_model.ckpt')
bert_config_file = os.path.join(args.bert_folder_path, 'bert_config.json')

if args.tasktype == 'C':
    is_multicase = True
    label_colname = 'categories'
    class_names = ['housework', 'health', 'financial', 'fitness', 'entertainment',
                   'car', 'law', 'food', 'beauty', 'education']
    class_names.sort()
else:
    is_multicase = False
    label_colname = 'stars'
    class_names = ['stars']

if args.url is not None:
    if args.best_weight_path is None:
classes = [
    'Research', 'Coding Guidelines', 'Case Study', 'Financial Reports',
    'CompanyDetails', 'AuditProposals'
]

while True:
    n = input("Enter query")
    if n == "EXIT":
        print("Exiting")
        break
    sentence = [n]
    print(sentence)
    tokenizer = FullTokenizer(
        vocab_file="/content/drive/MyDrive/EY_DATA/vocab.txt")
    pred_tokens = map(tokenizer.tokenize, sentence)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    max_seq_len = 128
    pred_token_ids = map(lambda tids: tids + [0] * (max_seq_len - len(tids)),
                         pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    predictions = new_model.predict(pred_token_ids).argmax(axis=-1)
    intents_we_came_across = []
    for text, label in zip(sentence, predictions):
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model
import tensorflow_hub as hub
from bert.tokenization.bert_tokenization import FullTokenizer

from settings import *
from preprocessing import build_dataset

# Collect pretrained BERT layer. Note BERT takes 3 inputs: tokens, masks, segments
bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1',
    trainable=False)

# Pull out data for BERT and build tokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

# Build datasets
train_dataset = build_dataset(
    'dataset/training.1600000.processed.noemoticon.csv',
    tokenizer,
    MAX_LENGTH,
    header=None,
    encoding='latin1')
test_dataset = build_dataset('dataset/testdata.manual.2009.06.14.csv',
                             tokenizer,
                             MAX_LENGTH,
                             header=None,
                             encoding='latin1')

# Shuffle and batch datasets
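# Illustrative continuation only (an assumption, not from the original file):
# the snippet above ends at the "Shuffle and batch datasets" comment. Assuming
# train_dataset/test_dataset are tf.data.Dataset objects and BATCH_SIZE comes
# from settings, the step that typically follows looks like this (the 10000
# shuffle buffer is a placeholder).
train_dataset = train_dataset.shuffle(10000).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)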
def load_tokenizer():
    print("\n loading tokenizer")
    current_dir = dirname(realpath(__file__))
    # model_path = join(current_dir, model_folder, model_name)
    vocab_file = join(current_dir, model_dir, "vocab.txt")
    return FullTokenizer(vocab_file, do_lower_case=True)
def predict():
    RANDOM_SEED = 42
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    train = pd.read_csv("HT_train.csv")
    valid = pd.read_csv("HT_valid.csv")
    test = pd.read_csv("HT_test.csv")
    train = train.append(valid).reset_index(drop=True)

    bert_model_name = "intent"
    bert_ckpt_dir = os.path.join("model2/", bert_model_name)  # checkpoint directory
    bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")  # checkpoint file
    bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")  # configuration file

    class IntentDetectionData:
        DATA_COLUMN = "text"
        LABEL_COLUMN = "intent"

        def __init__(self, train, test, tokenizer: FullTokenizer, classes,
                     max_seq_len=192):
            self.tokenizer = tokenizer
            self.max_seq_len = 0
            self.classes = classes
            train, test = map(
                lambda df: df.reindex(df[IntentDetectionData.DATA_COLUMN].str.
                                      len().sort_values().index),
                [train, test])
            ((self.train_x, self.train_y),
             (self.test_x, self.test_y)) = map(self._prepare, [train, test])
            print("max seq_len", self.max_seq_len)
            self.max_seq_len = min(self.max_seq_len, max_seq_len)
            self.train_x, self.test_x = map(self._pad,
                                            [self.train_x, self.test_x])

        def _prepare(self, df):
            x, y = [], []
            for _, row in tqdm(df.iterrows()):
                text, label = row[IntentDetectionData.DATA_COLUMN], row[
                    IntentDetectionData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(self.classes.index(label))
            return np.array(x), np.array(y)

        def _pad(self, ids):
            x = []
            for input_ids in ids:
                input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
                input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
                x.append(np.array(input_ids))
            return np.array(x)

    tokenizer = FullTokenizer(
        vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

    def create_model(max_seq_len, bert_ckpt_file):
        with tf.io.gfile.GFile(bert_config_file, "r") as reader:
            bc = StockBertConfig.from_json_string(reader.read())
            bert_params = map_stock_config_to_params(bc)
            bert_params.adapter_size = None
            bert = BertModelLayer.from_params(bert_params, name="bert")

        input_ids = keras.layers.Input(shape=(max_seq_len, ),
                                       dtype='int32',
                                       name="input_ids")
        bert_output = bert(input_ids)
        print("bert shape", bert_output.shape)

        cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
        cls_out = keras.layers.Dropout(0.5)(cls_out)
        logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
        logits = keras.layers.Dropout(0.5)(logits)
        logits = keras.layers.Dense(units=len(classes),
                                    activation="softmax")(logits)

        model = keras.Model(inputs=input_ids, outputs=logits)
        model.build(input_shape=(None, max_seq_len))
        load_stock_weights(bert, bert_ckpt_file)
        return model

    classes = train.intent.unique().tolist()
    data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128)
    model = create_model(data.max_seq_len, bert_ckpt_file)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])
    history = model.fit(
        x=data.train_x,
        y=data.train_y,
        validation_split=0.1,
        batch_size=16,
        shuffle=True,
        epochs=5,
        # callbacks=[tensorboard_callback]
    )
    model.save("save_model")
    model = keras.models.load_model('save_model')
    # saved_model = pickle.dumps(model)
    # # Load the pickled model
    # knn_from_pickle = pickle.loads(saved_model)
    # Use the loaded pickled model to make predictions
    # knn_from_pickle.predict(X_test)
    # joblib.dump(model, 'intent_model.pkl')
    #
    # intent_model = open('intent_model.pkl','rb')
    # model20 = joblib.load('intent_model.pkl')
    # with open('model_pickle','wb') as f:
    #     pickle.dump(model,f)
    # with open('model_pickle','rb') as f:
    #     mp=pickle.load(f)

    if request.method == 'POST':
        message = request.form['message']
        inputdata = [message]
        pred_tokens = map(tokenizer.tokenize, inputdata)
        pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
        pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
        pred_token_ids = map(
            lambda tids: tids + [0] * (data.max_seq_len - len(tids)),
            pred_token_ids)
        pred_token_ids = np.array(list(pred_token_ids))
        my_prediction = model.predict(pred_token_ids).argmax(axis=-1)
        if my_prediction == 0:
            x = "Details"
        elif my_prediction == 1:
            x = "Create"
        elif my_prediction == 2:
            x = "Book"
        # return jsonify(inputdata)
        # return jsonify(my_prediction)
        return render_template('result.html', prediction=x)
jwt = JWTManager(app)

punctuate_model_name = 'PT_Punctuator.pcl'
punctuate_model_directory = './punctuate_model/'
punctuate_model_path = punctuate_model_directory + punctuate_model_name
app.config['punctuate_model_path'] = punctuate_model_path
punctuator_model = Punctuator(app.config['punctuate_model_path'])

classifier_model_name = 'saved_model/my_model'
classifier_model_directory = './classifier_model/'
classifier_model_path = classifier_model_directory + classifier_model_name
app.config['classifier_model_path'] = classifier_model_path
classifier_model = tf.keras.models.load_model(
    app.config['classifier_model_path'])

vocab_path = classifier_model_directory + 'vocab.txt'
tokenizer = FullTokenizer(vocab_file=vocab_path)


def punctuateTextFile(file_name):
    with open(file_name, "r") as file:
        text_to_punctuate = file.read()
    text_to_punctuate = text_to_punctuate.lower()
    text_to_punctuate = text_to_punctuate.translate(
        str.maketrans('', '', string.punctuation))
    punctuated_text = punctuator_model.punctuate(text_to_punctuate)
    tokenize_sentences(punctuated_text)


@app.route('/process', methods=['POST'])
def transcribe():
    data = request.get_json()