Code Example #1
File: preprocess.py Project: KornCode/Sen_to_Word
def process_line(line):
    # A training line is a sequence of words delimited by '|'. Build the
    # parallel character-level input and output label sequences.
    inputs = []
    outputs = []
    for token in line.split('|'):
        if len(token) == 0:
            continue
        inputs += ThaiWordSegmentLabeller.get_input_labels(token)
        outputs += ThaiWordSegmentLabeller.get_output_labels(token)
    return inputs, outputs
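A minimal usage sketch, not taken from the source project: it assumes training lines use '|' between Thai words, as the split above implies.

# Hypothetical training line; the sample words are illustrative only.
line = 'สวัสดี|ครับ'
inputs, outputs = process_line(line)
# inputs and outputs are parallel character-level label lists covering
# every character of every word in the line.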
Code Example #2
File: data2token.py Project: LXZE/DL_SA
def predict(df, core):
    # TensorFlow is imported lazily so each worker process binds to its
    # own core.
    print(f'loading tensorflow to core {core}')
    import sys
    import tensorflow as tf
    sys.path.append('./thai-word-segmentation')
    from thainlplib import ThaiWordSegmentLabeller as tlabel

    # Force CPU execution; loading the SavedModel restores the trained
    # weights into the session. model_path is a module-level variable.
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    tf.saved_model.loader.load(sess,
                               [tf.saved_model.tag_constants.SERVING],
                               model_path)

    # Tensor names are hardcoded from the exported graph rather than
    # resolved through the serving signature.
    graph = tf.get_default_graph()
    g_inputs = graph.get_tensor_by_name('IteratorGetNext:1')
    g_lengths = graph.get_tensor_by_name('IteratorGetNext:0')
    g_training = graph.get_tensor_by_name('Placeholder_1:0')
    g_outputs = graph.get_tensor_by_name('boolean_mask_1/Gather:0')

    results = []
    for _, row in df.iterrows():
        # clean.fixing, split, nonzero and clean_n_sub are helpers
        # defined elsewhere in the project.
        test_input = clean.fixing(row['text'])
        inputs = [tlabel.get_input_labels(test_input)]
        len_input = [len(test_input)]
        result = sess.run(g_outputs,
                          feed_dict={
                              g_inputs: inputs,
                              g_lengths: len_input,
                              g_training: False
                          })
        # Nonzero predictions mark word boundaries; cut the text there.
        cut_word = split(test_input, nonzero(result))
        cut_word = clean_n_sub(cut_word)
        results.append(cut_word)
    return results
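A hedged usage sketch for predict(); the DataFrame contents are illustrative, and model_path plus the helpers clean, split, nonzero and clean_n_sub must already be defined at module level, as in the source project.

import pandas as pd

# Hypothetical one-row frame; predict() reads the 'text' column.
df = pd.DataFrame({'text': ['ทดสอบการตัดคำภาษาไทย']})
segments = predict(df, core=0)
print(segments)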
Code Example #3
def sertis_tokenizer(text, saved_model_path):
    text = text.strip()
    if text == '':
        return ['']
    # One character-level input label per character, plus the sequence
    # length, each wrapped as a batch of one.
    inputs = [ThaiWordSegmentLabeller.get_input_labels(text)]
    lengths = [len(text)]
    with tf.Session() as session:
        # Loading the SavedModel restores the weights into the session.
        model = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        # Resolve the graph tensors through the serving signature instead
        # of hardcoding tensor names.
        signature = model.signature_def[tf.saved_model.signature_constants.
                                        DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph = tf.get_default_graph()
        g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
        g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
        g_training = graph.get_tensor_by_name(
            signature.inputs['training'].name)
        g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)
        y = session.run(g_outputs,
                        feed_dict={
                            g_inputs: inputs,
                            g_lengths: lengths,
                            g_training: False
                        })
    # Nonzero predictions mark word-start positions; split the text there.
    return split(text, nonzero(y))
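A possible call, with an assumed SavedModel directory and an illustrative sentence; split and nonzero (defined in Code Example #6) must be in scope.

# Hypothetical path and text, for illustration only.
tokens = sertis_tokenizer('สวัสดีครับ', 'saved_model')
print(tokens)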
Code Example #4
def sertis_tokenizer(text, saved_model_path):
    # text is expected to be a sequence of sentences; each one gets its
    # own single-item batch of input labels and lengths.
    inputs = [[ThaiWordSegmentLabeller.get_input_labels(i)] for i in text]
    lengths = [[len(i)] for i in text]
    results = []
    with tf.Session() as session:
        model = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        # Resolve the graph tensors through the serving signature.
        signature = model.signature_def[tf.saved_model.signature_constants.
                                        DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph = tf.get_default_graph()
        g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
        g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
        g_training = graph.get_tensor_by_name(
            signature.inputs['training'].name)
        g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)
        # Run the model on each sentence and collect its segmentation.
        for i, j in enumerate(inputs):
            y = session.run(g_outputs,
                            feed_dict={
                                g_inputs: j,
                                g_lengths: lengths[i],
                                g_training: False
                            })
            results.append(split(text[i], nonzero(y)))
    return results
Code Example #5
from thainlplib import ThaiWordSegmentLabeller, ThaiWordSegmentationModel

# Training and validation data configuration
training_data_file = '/tmp/training.tf_record'
validation_data_file = '/tmp/validation.tf_record'
vocabulary_size = ThaiWordSegmentLabeller.get_input_vocabulary_size()
num_output_labels = ThaiWordSegmentLabeller.get_output_vocabulary_size()

# Model hyperparameters
dropout = 0.50
state_size = 128
learning_rate = 0.001

# Other configuration
buffer_size = 150000  # Read all data to CPU memory
batch_size = 112  # Decrease or increase depending on your GPU memory size
validate_every_n_iterations = 100
checkpoint_path = 'checkpoints'

model = ThaiWordSegmentationModel(training_data_file, validation_data_file,
                                  buffer_size, batch_size, vocabulary_size,
                                  num_output_labels, state_size, dropout)
model.train(learning_rate,
            validate_every_n_iterations,
            checkpoint_path,
            restore_checkpoint=False)
Code Example #6
from thainlplib import ThaiWordSegmentLabeller
import numpy as np
import tensorflow as tf

# Pretrained model weights location
saved_model_path = 'saved_model'

# Input text
with open("/Users/korn/Desktop/thai-word-data.txt", "r") as thai_txt:
    text = thai_txt.read()

# Convert text to labels
inputs = [ThaiWordSegmentLabeller.get_input_labels(text)]
lengths = [len(text)]


def nonzero(a):
    return [i for i, e in enumerate(a) if e != 0]


def split(s, indices):
    return [s[i:j] for i, j in zip(indices, indices[1:] + [None])]


with tf.Session() as session:
    # Read model weights
    model = tf.saved_model.loader.load(session,
                                       [tf.saved_model.tag_constants.SERVING],
                                       saved_model_path)

    # Get model input and output tensors through the serving signature.
    # The example was truncated here; the continuation below follows the
    # same signature lookup used in the other examples on this page.
    signature = model.signature_def[tf.saved_model.signature_constants.
                                    DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    graph = tf.get_default_graph()
    g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
    g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
    g_training = graph.get_tensor_by_name(signature.inputs['training'].name)
    g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)

    # Run inference and split the text at the predicted word boundaries
    y = session.run(g_outputs,
                    feed_dict={
                        g_inputs: inputs,
                        g_lengths: lengths,
                        g_training: False
                    })
    print(split(text, nonzero(y)))
Code Example #7
def sertis_tokenizer(text, saved_model_path):
    # Segments are delimited by '||'; segments at odd positions carry
    # label '1' and the rest label '0'.
    text = text.strip().split('||')
    # The model cannot segment an empty string, so map empty segments
    # to a single space.
    for i in range(len(text)):
        if text[i] == '':
            text[i] = ' '
    inputs = [[ThaiWordSegmentLabeller.get_input_labels(i)] for i in text]
    lengths = [[len(i)] for i in text]
    with tf.Session() as session:
        model = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        signature = model.signature_def[tf.saved_model.signature_constants.
                                        DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph = tf.get_default_graph()
        g_inputs = graph.get_tensor_by_name(signature.inputs['inputs'].name)
        g_lengths = graph.get_tensor_by_name(signature.inputs['lengths'].name)
        g_training = graph.get_tensor_by_name(
            signature.inputs['training'].name)
        g_outputs = graph.get_tensor_by_name(signature.outputs['outputs'].name)
        label = []
        all_words = []
        for i, j in enumerate(inputs):
            y = session.run(g_outputs,
                            feed_dict={
                                g_inputs: j,
                                g_lengths: lengths[i],
                                g_training: False
                            })
            # Split the segment at the predicted boundaries and drop
            # empty tokens.
            words = split(text[i], nonzero(y))
            words = [word.strip() for word in words if word.strip() != '']
            # Every word inherits its segment's label.
            label = label + (['1'] if i % 2 else ['0']) * len(words)
            all_words = all_words + words
    # Words and labels must stay aligned one-to-one.
    assert len(all_words) == len(label)
    return ('||'.join(all_words), '||'.join(label), len(label))
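A hedged usage sketch for this variant; the '||'-delimited input and the SavedModel path are assumptions.

# Segments at odd positions ('คำสำคัญ' here) receive label '1'.
words, labels, n = sertis_tokenizer('ข้อความ||คำสำคัญ||ต่อท้าย', 'saved_model')
print(words, labels, n)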