Example #3
def create_vocab(input_iter, min_frequency):
    """
  用tf自带的tensorflow.contrib.learn.python.learn.preprocessing类处理句子
  """
    vocab_processor = VocabularyProcessor(
        config.max_seq_len,
        min_frequency=min_frequency,
        tokenizer_fn=tokenizer_fn,
        vocabulary=CategoricalVocabularyMy(),  # extended vocabulary; the first 4 entries are preset by default
    )
    vocab_processor.fit(input_iter)
    return vocab_processor
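
A hedged usage sketch for create_vocab, not taken from the original repo; it assumes the module-level config.max_seq_len and tokenizer_fn behave like the tf.contrib.learn defaults for plain space-separated strings:

# Hypothetical driver: build the vocabulary from an iterable of raw sentences
# and turn sentences into fixed-length id vectors.
sentences = ["how are you", "fine thanks"]
vocab = create_vocab(iter(sentences), min_frequency=0)
print("vocabulary size:", len(vocab.vocabulary_))   # includes the preset tokens
ids = list(vocab.transform(sentences))              # one padded id array per sentence
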
Example #4
def process(file_dir, max_length=70, sep=':'):
    """
    Vectorize the poems in a corpus file.
    :param file_dir: path to the corpus file
    :param max_length: maximum accepted poem length (characters + punctuation)
    :param sep: separator; kept configurable so both datasets work: use sep=':'
        when training on poems.txt and sep=' ' (a space) when training on poetry.txt
    :return:

    Example:
    the input must contain one poem per line

    寒随穷律变,春逐鸟声开。
    初风飘带柳,晚雪间花梅。

    which are vectorized as:
    [  1 235 297 ... 303 304 305]
    [  1 321 350 ... 470 263 471]

    """
    print("Preprocessing data...")
    from tensorflow.contrib.learn.python.learn.preprocessing import VocabularyProcessor
    poems = []
    with open(file_dir, encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip('\n')
            line = line.split(sep=sep)[-1]
            line = line.replace(',', D_token)
            line = line.replace('。', J_token)
            line = line.replace('?', W_token)
            content = line.replace('!', G_token)
            if len(content) > max_length or '(' in content:  # skip poems longer than max_length or containing '('
                continue
            content = start_token + content + end_token
            poems.append(" ".join(content))
    # print(poems)
    vocab_processor = VocabularyProcessor(max_document_length=max_length, min_frequency=5)
    x = np.array(list(vocab_processor.fit_transform(poems)))
    dictionary = vocab_processor.vocabulary_.__dict__.copy()
    fre = dictionary['_freq']
    # print(sorted(fre.items(), key=lambda x: x[1], reverse=True))
    word_to_int = dictionary['_mapping']  # e.g. {'<UNK>': 0, 'D': 1, 'J': 2, 'B': 3, 'E': 4, '不': 5, '人': 6}
    int_to_word = dictionary['_reverse_mapping']  # e.g. ['<UNK>', 'D', 'J', 'B', 'E', '不', '人']
    np.random.seed(50)
    shuffle_index = np.random.permutation(x.shape[0])
    shuffle_x = x[shuffle_index]
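    # Targets are the inputs shifted left by one position: each character's label
    # is the character that follows it (next-character language modelling).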
    shuffle_y = np.copy(shuffle_x)
    shuffle_y[:, :-1] = shuffle_x[:, 1:]
    # print(len(word_to_int))
    return shuffle_x, shuffle_y, word_to_int, int_to_word
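
A hedged usage sketch for process (the path is hypothetical): each row of x is a padded poem and the matching row of y holds the next-character targets.

# Hypothetical call; poems.txt uses ':' to separate the title from the body.
x, y, word_to_int, int_to_word = process('data/poems.txt', max_length=70, sep=':')
print(x.shape, y.shape)                                   # (num_poems, 70) each
print(int_to_word[x[0][1]], '->', int_to_word[y[0][1]])   # a character and its successor
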
def fitData(fileName='../data/train.csv', max_len=40, batch_size=512):
    questions1, questions2, y = readData(fileName)
    vocab_processor = VocabularyProcessor(max_len)
    vocab_processor.fit(questions1 + questions2)
    X_q1 = np.array(list(vocab_processor.transform(questions1)))
    X_q2 = np.array(list(vocab_processor.transform(questions2)))

    vocab_dict = vocab_processor.vocabulary_._mapping

    glove_matrix = read_embeddings(vocab_dict)

    print(type(vocab_dict))

    all_data = list(zip(X_q1, X_q2))

    X_train, X_val, y_train, y_val = train_test_split(all_data,
                                                      y,
                                                      test_size=0.30,
                                                      random_state=42)

    X_val, X_test, y_val, y_test = train_test_split(X_val,
                                                    y_val,
                                                    test_size=0.50,
                                                    random_state=42)

    X_train, y_train = generate_rsample(X_train, y_train, batch_size)
    X_val, y_val = generate_rsample(X_val, y_val, batch_size)
    X_test, y_test = generate_rsample(X_test, y_test, batch_size)

    X_train_q1, X_train_q2 = zip(*X_train)
    X_val_q1, X_val_q2 = zip(*X_val)
    X_test_q1, X_test_q2 = zip(*X_test)

    print('len(X_train_q1): ', len(X_train_q1))
    print('len(X_train_q2): ', len(X_train_q2))
    print('len(X_test_q1): ', len(X_test_q1))
    print('len(X_test_q2): ', len(X_test_q2))

    return X_train_q1, X_train_q2, X_val_q1, X_val_q2, X_test_q1, X_test_q2, y_train, y_val, y_test, vocab_dict, glove_matrix
Example #6
def fitData(fileName='../data/train.csv', max_len=40, batch_size=512):
    questions1, questions2, y = readData(fileName)
    _, test_q1, test_q2 = read_test_data('../data/test.csv')

    global vocab_processor
    vocab_processor = VocabularyProcessor(max_len)
    vocab_processor.fit(questions1 + questions2 + test_q1 + test_q2)

    X_q1 = np.array(list(vocab_processor.transform(questions1)))
    X_q2 = np.array(list(vocab_processor.transform(questions2)))

    vocab_dict = vocab_processor.vocabulary_._mapping
    glove_matrix = read_embeddings(vocab_dict)
    print('Embedding matrix created!')
    print(type(vocab_dict))

    all_data = list(zip(X_q1, X_q2))

    X_train, X_dev, y_train, y_dev = train_test_split(all_data, y, test_size=0.2, random_state=42)

    X_train, y_train = generate_rsample(X_train, y_train, batch_size)
    X_dev, y_dev = generate_rsample(X_dev, y_dev, batch_size)

    X_train_q1, X_train_q2 = zip(*X_train)
    X_dev_q1, X_dev_q2 = zip(*X_dev)

    print('len(X_train_q1): ', len(X_train_q1))
    print('len(X_train_q2): ', len(X_train_q2))
    print('len(X_dev_q2): ', len(X_dev_q2))

    return X_train_q1, X_train_q2, y_train, X_dev_q1, X_dev_q2, y_dev, vocab_dict, glove_matrix
Example #7

if __name__ == "__main__":
    if not os.path.exists(config.vocabulary_path):
        print("创建词库...")
        input_iter = create_csv_iter(config.TRAIN_PATH)
        input_iter = (x[0] + " " + x[1] for x in input_iter)
        vocab = create_vocab(input_iter,
                             min_frequency=config.min_word_frequency)
        print("词库大小: {}".format(len(vocab.vocabulary_)))
        # Create vocabulary.txt file
        write_vocabulary(vocab, config.vocabulary_path)
        # Save the vocabulary so it can be restored directly later
        vocab.save(config.vocabulary_path_bin)
    else:
        vocab = VocabularyProcessor.restore(config.vocabulary_path_bin)

    # Create validation.tfrecords
    create_tfrecords_file(input_filename=VALIDATION_PATH,
                          output_filename=os.path.join(FLAGS.output_dir,
                                                       "validation.tfrecords"),
                          example_fn=functools.partial(create_example_test,
                                                       vocab=vocab))

    # Create test.tfrecords
    create_tfrecords_file(input_filename=TEST_PATH,
                          output_filename=os.path.join(FLAGS.output_dir,
                                                       "test.tfrecords"),
                          example_fn=functools.partial(create_example_test,
                                                       vocab=vocab))
Example #8
import numpy as np
from tensorflow.contrib.learn.python.learn.preprocessing import VocabularyProcessor

x_text = ['This is a cat', 'This must be boy', 'This is a a dog']
max_document_length = max([len(x.split(" ")) for x in x_text])

## Create the VocabularyProcessor object, setting the max length of the documents.
vocab_processor = VocabularyProcessor(max_document_length)

## Transform the documents using the vocabulary.
x = np.array(list(vocab_processor.fit_transform(x_text)))
print(x)

## Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping
print(vocab_dict)
## Sort the vocabulary dictionary on the basis of values(id).
## Both statements perform same task.
#sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
sorted_vocab = sorted(vocab_dict.items(), key=lambda x: x[1])

## Treat the ids as indexes into a list and create a list of words in ascending order of id;
## the word with id i goes at index i of the list.
vocabulary = list(list(zip(*sorted_vocab))[0])
print("Vocabulary : ")
print(vocabulary)
print("Transformed documents : ")
print(x)
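
As a quick follow-up, VocabularyProcessor.reverse() maps id sequences back to space-joined tokens, which is handy for sanity-checking the transform (padding ids come back as '<UNK>'):

## Map the id matrix back to tokens to verify the round trip.
for doc in vocab_processor.reverse(x):
    print(doc)
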
Example #9
    def __init__(self, slot_vocab, data_path: str = None):
        if DataSet.__word_vocab is None:

            def space_tokenizer_fn(iterator):
                yield iterator

            DataSet.__word_vocab = VocabularyProcessor(
                max_document_length=DataSet.MAX_SENTENCE_LENGTH,
                tokenizer_fn=space_tokenizer_fn)

        if type(slot_vocab) is str:
            self.__slot_vocab = DataSet.__load_slot_vocab(slot_vocab)
        elif type(slot_vocab) is dict:
            self.__slot_vocab = slot_vocab
        else:
            raise ValueError('slot_vocab error.')

        self.__epoch = 1
        self.__last_idx = 0

        if data_path is None:
            self.__inputs = []
            self.__lengths = []
            self.__masks = []
            self.__labels = []
            self.__size = 0
        else:
            data = []
            target = []
            with open(data_path, 'r') as file:
                for line in file:
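                    # Expand each IOB_REGEX match into per-token IOB tags:
                    # the first token in the span gets 'b-<tag>', the rest get 'i-<tag>'.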
                    for match in DataSet.IOB_REGEX.finditer(line):
                        tokens = match.group(1).split(' ')
                        iob = ' '.join([
                            '{}/{}-{}'.format(tokens[i],
                                              ('b' if i == 0 else 'i'),
                                              match.group(2))
                            for i in range(len(tokens))
                        ]).strip()
                        line = line.replace(match.group(0), iob)

                    words = []
                    tags = []
                    tokens = line.strip().lower().split(' ')
                    for token in tokens:
                        if '/' in token:
                            part = token.partition('/')
                            words.append(part[0])
                            tags.append(part[2])
                        else:
                            words.append(token)
                            tags.append('o')

                    if len(words) > DataSet.MAX_SENTENCE_LENGTH:
                        raise OverflowError('size:%d, %s' % (len(words), line))

                    data.append(words)
                    target.append(tags)

            result = self.__parse_data(data, target)
            self.__inputs = result['inputs']
            self.__lengths = result['lengths']
            self.__masks = result['masks']
            self.__labels = result['labels']
            self.__size = len(data)
class NeuralPosTagger:
    MAX_SENTENCE_LENGTH = 100

    __CHAR_PROCESSOR_NAME = 'char_processor.pkl'
    __TAG_PROCESSOR_NAME = 'tag_processor.pkl'

    def __init__(self, model_dir):
        self.__params = {
            'batch_size': 1000,
            'epoch_size': 2,
            'cell_size': 300,
            'char_embedding_size': 300,
            'learning_rate': 0.001,
        }

        self.__model_path = model_dir
        self.__char_processor_path = os.path.join(self.__model_path,
                                                  self.__CHAR_PROCESSOR_NAME)
        self.__tag_processor_path = os.path.join(self.__model_path,
                                                 self.__TAG_PROCESSOR_NAME)

        self.__char_processor = None
        self.__tag_processor = None
        self.__estimator = None

        if os.path.exists(self.__model_path):
            self.__char_processor = VocabularyProcessor.restore(
                self.__char_processor_path)
            self.__tag_processor = VocabularyProcessor.restore(
                self.__tag_processor_path)
            self.__estimator = self.__create_estimator()

    def __create_estimator(self):
        params = dict(self.__params)
        params.update({
            'output_size': len(self.__tag_processor.vocabulary_),
            'vocab_size': len(self.__char_processor.vocabulary_),
        })

        return tf.estimator.Estimator(
            model_fn=self.__model_fn,
            model_dir=self.__model_path,
            config=tf.estimator.RunConfig(
                save_summary_steps=10,
                save_checkpoints_steps=10,
            ),
            params=params,
        )

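    # Tokenizers for the VocabularyProcessors: each wraps its input in a single-element
    # list so one raw string is treated as one document, tokenized into characters
    # (for the text) or space-separated tags (for the labels).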
    @staticmethod
    def char_tokenizer_fn(raw):
        return [[ch for ch in raw]]

    @staticmethod
    def tag_tokenizer_fn(raw):
        return [raw.split(' ')]

    def __input_fn(self, inputs, epoch=1, shuffle=False):
        batch_size = (self.__params['batch_size']
                      if self.__params['batch_size'] > 0 else len(inputs))
        max_length = self.MAX_SENTENCE_LENGTH

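        # Each record becomes a (features, label) pair: padded character ids, the true
        # length capped at max_length, and a 0/1 mask over the valid positions.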
        def gen(records: list):
            for record in records:
                yield {
                    'ids': record['x'],
                    'length': record['length'] if record['length'] < max_length else max_length,
                    'mask': [1 if n < record['length'] else 0
                             for n in range(max_length)],
                }, record['y']

        dataset = tf.data.Dataset.from_generator(
            lambda: gen(inputs), ({
                'ids': tf.int32,
                'length': tf.int32,
                'mask': tf.int32
            }, tf.int32), ({
                'ids': tf.TensorShape([max_length]),
                'length': tf.TensorShape([]),
                'mask': tf.TensorShape([max_length])
            }, tf.TensorShape([max_length])))

        if shuffle:
            dataset = dataset.shuffle(batch_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.repeat(epoch)

        iterator = dataset.make_one_shot_iterator()
        features, label = iterator.get_next()

        return features, label

    @staticmethod
    def __model_fn(features, labels, mode, params):
        cell_size = params['cell_size']
        output_size = params['output_size']
        vocab_size = params['vocab_size']
        embedding_size = params['char_embedding_size']
        learning_rate = params['learning_rate']
        keep_prob = 1.0 if mode != tf.estimator.ModeKeys.TRAIN else 0.5

        ids = features['ids']
        length = features['length']
        mask = features['mask']

        char_embeddings = tf.get_variable(
            name='char_embeddings',
            shape=[vocab_size, embedding_size],
            initializer=tf.random_uniform_initializer(-1, 1))

        inputs = tf.nn.embedding_lookup(char_embeddings, ids)

        def rnn_cell(cell_size):
            cell = tf.contrib.rnn.GRUCell(cell_size)
            cell = tf.contrib.rnn.DropoutWrapper(cell,
                                                 output_keep_prob=keep_prob)
            return cell

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=rnn_cell(cell_size),
            cell_bw=rnn_cell(cell_size),
            inputs=inputs,
            sequence_length=length,
            dtype=tf.float32)

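        # Merge the bidirectional RNN outputs by element-wise sum of the forward and backward sequences.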
        outputs = outputs[0] + outputs[1]

        outputs = tf.layers.dense(
            inputs=outputs,
            units=output_size,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        predictions = tf.argmax(outputs, 2)

        loss = None
        if mode != tf.estimator.ModeKeys.PREDICT:
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=tf.one_hot(labels, output_size, dtype=tf.float32),
                logits=outputs,
                weights=mask)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            weights = []
            precisions = []
            recalls = []
            for label in range(output_size):
                y_true = tf.equal(labels, label)
                y_pred = tf.equal(predictions, label)
                weights.append(tf.metrics.mean(y_true, mask))
                precisions.append(tf.metrics.precision(y_true, y_pred, mask))
                recalls.append(tf.metrics.recall(y_true, y_pred, mask))

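            # tf.metrics ops return (value, update_op) pairs, so the class-frequency
            # weighted combination is applied to both elements of each pair.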
            def compute_mean(values, weights):
                return (
                    tf.reduce_sum([
                        tf.multiply(v[0], w[0])
                        for v, w in zip(values, weights)
                    ]),
                    tf.reduce_sum([
                        tf.multiply(v[1], w[1])
                        for v, w in zip(values, weights)
                    ]),
                )

            precision = compute_mean(precisions, weights)
            recall = compute_mean(recalls, weights)

            def compute_f1(precision, recall):
                return (
                    tf.multiply(2.0, tf.div(tf.multiply(precision[0], recall[0]),
                                            tf.add(precision[0], recall[0]))),
                    tf.multiply(2.0, tf.div(tf.multiply(precision[1], recall[1]),
                                            tf.add(precision[1], recall[1]))),
                )

            eval_metric_ops = {
                'accuracy': tf.metrics.accuracy(labels, predictions, mask),
                'precision': precision,
                'recall': recall,
                'f1': compute_f1(precision, recall)
            }

        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            learning_rate = tf.train.exponential_decay(
                learning_rate=learning_rate,
                global_step=tf.train.get_global_step(),
                decay_steps=10,
                decay_rate=0.96)

            train_op = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(
                    loss=loss,
                    global_step=tf.train.get_global_step(),
                )

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)

    def train(self, corpus_path):
        self.__char_processor = VocabularyProcessor(
            max_document_length=self.MAX_SENTENCE_LENGTH,
            tokenizer_fn=NeuralPosTagger.char_tokenizer_fn)

        self.__tag_processor = VocabularyProcessor(
            max_document_length=self.MAX_SENTENCE_LENGTH,
            tokenizer_fn=NeuralPosTagger.tag_tokenizer_fn)

        training_corpus = Corpus(corpus_path)
        items = [{
            'x': list(self.__char_processor.transform(item['text']))[0],
            'y': list(self.__tag_processor.transform(item['tag']))[0],
            'length': item['length']
        } for item in training_corpus.items()]

        self.__char_processor.fit('')
        self.__tag_processor.fit('')

        if os.path.exists(self.__model_path):
            shutil.rmtree(self.__model_path)

        os.makedirs(self.__model_path)
        self.__char_processor.save(self.__char_processor_path)
        self.__tag_processor.save(self.__tag_processor_path)
        self.__estimator = self.__create_estimator()

        print('Training: %d' % len(training_corpus))
        print('Character: %d, Tag: %d' %
              (len(self.__char_processor.vocabulary_),
               len(self.__tag_processor.vocabulary_)))

        random.shuffle(items)
        pivot = int(len(items) * 0.8)
        train_set = items[:pivot]
        dev_set = items[pivot:]

        class ValidationHook(tf.train.SessionRunHook):
            def __init__(self, estimator, input_fn, dataset):
                self.__every_n_steps = 100
                self.__estimator = estimator
                self.__input_fn = input_fn
                self.__dataset = dataset

            def before_run(self, run_context):
                graph = run_context.session.graph
                return tf.train.SessionRunArgs(tf.train.get_global_step(graph))

            def after_run(self, run_context, run_values):
                if run_values.results % self.__every_n_steps == 0:
                    result = self.__estimator.evaluate(
                        input_fn=lambda: self.__input_fn(self.__dataset), )
                    print('#%d %s' % (run_values.results, result))

        self.__estimator.train(
            input_fn=lambda: self.__input_fn(
                train_set, epoch=self.__params['epoch_size'], shuffle=True),
            hooks=[ValidationHook(self.__estimator, self.__input_fn, dev_set)],
        )
        print('Training completed.')

    def evaluate(self, corpus_path):
        test_corpus = Corpus(corpus_path)
        test_set = [{
            'x': list(self.__char_processor.transform(item['text']))[0],
            'y': list(self.__tag_processor.transform(item['tag']))[0],
            'length': item['length']
        } for item in test_corpus.items()]

        result = self.__estimator.evaluate(
            input_fn=lambda: self.__input_fn(test_set), )

        print('Test: %d' % len(test_corpus))
        print(result)

    def predict(self, characters: list):
        data_set = [{
            'x': list(self.__char_processor.transform(characters))[0],
            'y': [0 for _ in range(self.MAX_SENTENCE_LENGTH)],
            'length': len(characters)
        }]

        result = list(
            self.__estimator.predict(
                input_fn=lambda: self.__input_fn(data_set),
            ))[0][:len(characters)]

        result = list(self.__tag_processor.reverse([result]))[0]
        result = result.split(' ')

        return result
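
A hedged end-to-end sketch for NeuralPosTagger; the paths are hypothetical, and Corpus is assumed to yield items with 'text', 'tag' and 'length' fields as the class expects:

# Hypothetical driver: train, evaluate, then tag one sentence character by character.
tagger = NeuralPosTagger('models/pos_tagger')
tagger.train('data/train_corpus.txt')
tagger.evaluate('data/test_corpus.txt')
print(tagger.predict(list('an example sentence')))   # one tag per input character
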
Example #11
    def word_identify(self, dataframe):
        contents = dataframe["content"].values.tolist()
        vocab_processor = VocabularyProcessor(self.max_document_length)
        word_ids = np.array(list(vocab_processor.fit_transform(contents)))
        self.vocabulary_size = np.max(word_ids)
        return word_ids