def process_line(line):
    """Convert one '|'-delimited training line into parallel label lists.

    Each non-empty token between '|' separators is expanded into its
    character-level input labels and output (boundary) labels via
    ``ThaiWordSegmentLabeller``.

    Parameters:
        line: a training-corpus line with words separated by '|'.

    Returns:
        (inputs, outputs) — two flat lists of equal length, one label pair
        per character of every non-empty token.
    """
    inputs = []
    outputs = []
    for token in line.split('|'):
        # Skip empty tokens produced by leading/trailing/doubled separators.
        if not token:
            continue
        inputs.extend(ThaiWordSegmentLabeller.get_input_labels(token))
        outputs.extend(ThaiWordSegmentLabeller.get_output_labels(token))
    return inputs, outputs
def predict(df, core):
    """Segment the Thai text of every row in *df* with the saved TF1 model.

    Parameters:
        df: pandas-like object with a 'text' column (iterated via .iterrows()).
        core: identifier used only in the progress log line below.

    Returns:
        list with one cleaned, segmented result per row of *df*.

    NOTE(review): relies on names defined elsewhere in this module —
    ``sys``, ``model_path``, ``clean``, ``clean_n_sub``, ``split``,
    ``nonzero`` — confirm they are in scope. The session is never closed.
    """
    print(f'loading tensorflow to core {core}')
    import tensorflow as tf
    sys.path.append('./thai-word-segmentation')
    from thainlplib import ThaiWordSegmentLabeller as tlabel
    # Pin this worker to CPU only: no GPU devices are visible to the session.
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    model = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], model_path)
    graph = tf.get_default_graph()
    # Tensor names are hard-coded from the exported graph instead of being
    # resolved through the signature def — brittle if the model is re-exported.
    g_inputs = graph.get_tensor_by_name('IteratorGetNext:1')
    g_lengths = graph.get_tensor_by_name('IteratorGetNext:0')
    g_training = graph.get_tensor_by_name('Placeholder_1:0')
    g_outputs = graph.get_tensor_by_name('boolean_mask_1/Gather:0')
    results = []
    for idx, row in df.iterrows():
        # Pre-clean the raw text before feeding it to the network.
        test_input = clean.fixing(row['text'])
        inputs = [tlabel.get_input_labels(test_input)]
        len_input = [len(test_input)]
        # One forward pass per row; `result` marks the word-boundary positions.
        result = sess.run(g_outputs, feed_dict={
            g_inputs: inputs,
            g_lengths: len_input,
            g_training: False
        })
        # Cut the text at the non-zero boundary indices, then post-process.
        cut_word = split(test_input, nonzero(result))
        cut_word = clean_n_sub(cut_word)
        results.append(cut_word)
    return results
def sertis_tokenizer(text, saved_model_path):
    """Tokenize *text* into Thai words using the saved Sertis segmentation model.

    Parameters:
        text: raw input string; surrounding whitespace is stripped first.
        saved_model_path: directory of the TF1 SavedModel to load.

    Returns:
        list of word strings; [''] when the stripped input is empty.

    NOTE(review): depends on module-level names ``tf``,
    ``ThaiWordSegmentLabeller``, ``split`` and ``nonzero``.
    """
    text = text.strip()
    # The model cannot run on an empty sequence — short-circuit instead.
    if not text:
        return ['']
    inputs = [ThaiWordSegmentLabeller.get_input_labels(text)]
    lengths = [len(text)]
    with tf.Session() as session:
        model = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        # Resolve tensor names through the serving signature rather than
        # hard-coding graph node names.
        signature = model.signature_def[tf.saved_model.signature_constants.
                                        DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph = tf.get_default_graph()
        g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
        g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
        g_training = graph.get_tensor_by_name(
            signature.inputs['training'].name)
        g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)
        y = session.run(g_outputs, feed_dict={
            g_inputs: inputs,
            g_lengths: lengths,
            g_training: False
        })
        # Non-zero outputs mark word-start indices; cut the text there.
        return split(text, nonzero(y))
def sertis_tokenizer(text, saved_model_path):
    """Debug/experimental variant: segments each element of *text* separately.

    Iterates over *text* (presumably a list of strings — TODO confirm against
    the caller), runs the model once per element, and prints each element's
    segmentation.

    NOTE(review): the final return pairs the FULL *text* with only the LAST
    iteration's ``y``, and raises NameError when *text* is empty (``y`` is
    never bound). This looks like leftover work-in-progress code.
    """
    # inputs = [ThaiWordSegmentLabeller.get_input_labels(text)]
    inputs = [[ThaiWordSegmentLabeller.get_input_labels(i)] for i in text]
    print(inputs)
    lengths = [[len(i)] for i in text]
    print(lengths)
    with tf.Session() as session:
        model = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        signature = model.signature_def[tf.saved_model.signature_constants.
                                        DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph = tf.get_default_graph()
        g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
        g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
        g_training = graph.get_tensor_by_name(
            signature.inputs['training'].name)
        g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)
        # y = session.run(g_outputs, feed_dict = {g_inputs: inputs, g_lengths: lengths, g_training: False})
        # y = session.run(g_outputs, feed_dict = {g_inputs: inputs, g_lengths: lengths, g_training: False})
        # One model invocation per input element, with debug prints.
        for i, j in enumerate(inputs):
            print(i)
            print(j)
            print(lengths[i])
            y = session.run(g_outputs, feed_dict={
                g_inputs: j,
                g_lengths: lengths[i],
                g_training: False
            })
            print(split(text[i], nonzero(y)))
            #print(y)
        # NOTE(review): see docstring — this uses only the last loop's ``y``.
        return [split(text, nonzero(y))]
from thainlplib import ThaiWordSegmentLabeller, ThaiWordSegmentationModel # Training and validation data configuration training_data_file = '/tmp/training.tf_record' validation_data_file = '/tmp/validation.tf_record' vocabulary_size = ThaiWordSegmentLabeller.get_input_vocabulary_size() num_output_labels = ThaiWordSegmentLabeller.get_output_vocabulary_size() # Model hyperparameters dropout = 0.50 state_size = 128 learning_rate = 0.001 # Other configuration buffer_size = 150000 # Read all data to CPU memory batch_size = 112 # Lower/increase this depending on your GPU memory size validate_every_n_iterations = 100 checkpoint_path = 'checkpoints' model = ThaiWordSegmentationModel(training_data_file, validation_data_file, buffer_size, batch_size, vocabulary_size, num_output_labels, state_size, dropout) model.train(learning_rate, validate_every_n_iterations, checkpoint_path, restore_checkpoint=False)
# Inference script prologue: load a text file and prepare model inputs.
from thainlplib import ThaiWordSegmentLabeller
import numpy as np
import tensorflow as tf

# Pretrained model weights location
saved_model_path = 'saved_model'

# Input text — the whole file is treated as one long sequence.
# NOTE(review): hard-coded absolute user path; the file handle is never closed.
thai_txt = open("/Users/korn/Desktop/thai-word-data.txt", "r")
text = thai_txt.read()

# Convert text to labels
inputs = [ThaiWordSegmentLabeller.get_input_labels(text)]
lengths = [len(text)]


def nonzero(a):
    """Return the indices of all non-zero elements of *a* (word-start positions)."""
    return [i for i, e in enumerate(a) if e != 0]


def split(s, indices):
    """Slice *s* at each index in *indices*; the final slice runs to the end."""
    return [s[i:j] for i, j in zip(indices, indices[1:] + [None])]


with tf.Session() as session:
    # Read model weights
    model = tf.saved_model.loader.load(session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
    # Get model input variables
    # NOTE(review): the script appears truncated here — no tensors are
    # fetched and no inference is run in the visible code.
def sertis_tokenizer(text, saved_model_path):
    """Segment a '||'-delimited string and assign alternating 0/1 labels.

    *text* is split on '||' into segments; each segment is tokenized with the
    saved model. Words from even-indexed segments get label '0' and words from
    odd-indexed segments get label '1' — presumably encoding an annotation
    scheme carried by the '||' markers (TODO confirm with the caller).

    Returns:
        (words, labels, n): words and labels re-joined with '||', plus the
        label count.

    NOTE(review): prints debug output ('label'/'not label', counts, and a
    loud 'WTF' consistency dump) on every call.
    """
    #print(text)
    text = text.strip().split('||')
    #print(len(text))
    #print(text)
    # Empty segments would break the model; replace them with a single space.
    for i in range(len(text)):
        if text[i] == '':
            text[i] = ' '
    inputs = [[ThaiWordSegmentLabeller.get_input_labels(i)] for i in text]
    #print(len(inputs))
    #print(inputs)
    lengths = [[len(i)] for i in text]
    #print(lengths)
    with tf.Session() as session:
        model = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        # Resolve tensor names through the serving signature.
        signature = model.signature_def[tf.saved_model.signature_constants.
                                        DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph = tf.get_default_graph()
        g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
        g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
        g_training = graph.get_tensor_by_name(
            signature.inputs['training'].name)
        g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)
        label = []
        all_words = []
        # One forward pass per segment; accumulate words and their labels.
        for i, j in enumerate(inputs):
            # if j == [ThaiWordSegmentLabeller.get_input_labels('')]:
            # print('YES')
            #print(j)
            #print(lengths[i])
            y = session.run(g_outputs, feed_dict={
                g_inputs: j,
                g_lengths: lengths[i],
                g_training: False
            })
            # Cut the segment at boundary indices and drop whitespace-only words.
            words = split(text[i], nonzero(y))
            words = [word.strip() for word in words if word.strip() != '']
            #print(i)
            # Odd-indexed segments are the "labelled" spans ('1'), even are '0'.
            if i % 2:
                print('label')
                label = label + ['1'] * len(words)
            else:
                print('not label')
                label = label + ['0'] * len(words)
            all_words = all_words + words
        print(len(all_words))
        #print(all_words)
        # Sanity check: one label per word. By construction this should never
        # fire — labels are appended len(words) at a time above.
        if len(all_words) != len(label):
            print('-------------------------------------------------')
            print('-------------------------------------------------')
            print('WTF')
            print(all_words)
            print(label)
            print('-------------------------------------------------')
            print('-------------------------------------------------')
        #print(all_words)
        #print(label)
        #print('finished!!!')
        return ('||'.join(all_words), '||'.join(label), len(label))