def create_vocab(input_iter, min_frequency):
    """Build a vocabulary from sentences with TF's built-in
    tensorflow.contrib.learn.python.learn.preprocessing.VocabularyProcessor.
    """
    vocab_processor = VocabularyProcessor(
        config.max_seq_len,
        min_frequency=min_frequency,
        tokenizer_fn=tokenizer_fn,
        # Extended vocabulary class; the first 4 ids are reserved tokens.
        vocabulary=CategoricalVocabularyMy(),
    )
    vocab_processor.fit(input_iter)
    return vocab_processor
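# A minimal usage sketch of create_vocab's fit/transform flow, with the stock
# VocabularyProcessor standing in for the project-specific pieces
# (config.max_seq_len, tokenizer_fn and CategoricalVocabularyMy are defined
# elsewhere in the repo):
import numpy as np
from tensorflow.contrib.learn.python.learn.preprocessing import VocabularyProcessor

sentences = ['the cat sat', 'the dog sat', 'the cat ran']
processor = VocabularyProcessor(max_document_length=5, min_frequency=1)
processor.fit(sentences)
ids = np.array(list(processor.transform(sentences)))
# (3, 5) array; with min_frequency=1 the singleton words ('dog', 'ran')
# should fall below the cutoff and map to id 0 (<UNK>); short rows are
# zero-padded to max_document_length.
print(ids)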
def process(file_dir, max_length=70, sep=':'):
    """Vectorize the poems.

    :param file_dir: path to the corpus file
    :param max_length: maximum accepted poem length (characters + punctuation)
    :param sep: field separator; configurable so both data sets work:
        sep=':' when training on poems.txt, sep=' ' (space) when training on poetry.txt
    :return:
        Example -- the input must contain one poem per line:
            寒随穷律变,春逐鸟声开。 初风飘带柳,晚雪间花梅。
        which is mapped to rows such as:
            [  1 235 297 ... 303 304 305]
            [  1 321 350 ... 470 263 471]
    """
    print("Preprocessing data...")
    from tensorflow.contrib.learn.python.learn.preprocessing import VocabularyProcessor

    poems = []
    with open(file_dir, encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip('\n')
            line = line.split(sep=sep)[-1]
            # Replace punctuation with single-character placeholder tokens.
            line = line.replace(',', D_token)
            line = line.replace('。', J_token)
            line = line.replace('?', W_token)
            content = line.replace('!', G_token)
            if len(content) > max_length or '(' in content:
                # Skip poems above the maximum accepted length (characters + punctuation).
                continue
            content = start_token + content + end_token
            # Space-separate the characters so the default tokenizer splits on them.
            poems.append(" ".join(content))
    # print(poems)
    vocab_processor = VocabularyProcessor(max_document_length=max_length, min_frequency=5)
    x = np.array(list(vocab_processor.fit_transform(poems)))
    dictionary = vocab_processor.vocabulary_.__dict__.copy()
    fre = dictionary['_freq']
    # print(sorted(fre.items(), key=lambda x: x[1], reverse=True))
    word_to_int = dictionary['_mapping']  # e.g. {'<UNK>': 0, 'D': 1, 'J': 2, 'B': 3, 'E': 4, '不': 5, '人': 6}
    int_to_word = dictionary['_reverse_mapping']  # e.g. ['<UNK>', 'D', 'J', 'B', 'E', '不', '人']
    np.random.seed(50)
    shuffle_index = np.random.permutation(x.shape[0])
    shuffle_x = x[shuffle_index]
    # The target sequence is the input shifted one step to the left.
    shuffle_y = np.copy(shuffle_x)
    shuffle_y[:, :-1] = shuffle_x[:, 1:]
    # print(len(word_to_int))
    return shuffle_x, shuffle_y, word_to_int, int_to_word
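# A small standalone illustration (hypothetical data) of the target shift at
# the end of process(): y is x moved one position left, so each position is
# labelled with the next token of the poem.
import numpy as np

x = np.array([[1, 235, 297, 303, 304, 305]])
y = np.copy(x)
y[:, :-1] = x[:, 1:]
print(y)  # [[235 297 303 304 305 305]] -- the last position keeps its own token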
def fitData(fileName='../data/train.csv', max_len=40, batch_size=512):
    questions1, questions2, y = readData(fileName)
    vocab_processor = VocabularyProcessor(max_len)
    vocab_processor.fit(questions1 + questions2)
    X_q1 = np.array(list(vocab_processor.transform(questions1)))
    X_q2 = np.array(list(vocab_processor.transform(questions2)))
    vocab_dict = vocab_processor.vocabulary_._mapping
    glove_matrix = read_embeddings(vocab_dict)
    print(type(vocab_dict))
    # Materialize the pairs so train_test_split can index them (zip() alone
    # is a one-shot iterator in Python 3).
    all_data = list(zip(X_q1, X_q2))
    X_train, X_val, y_train, y_val = train_test_split(all_data, y, test_size=0.30, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.50, random_state=42)
    X_train, y_train = generate_rsample(X_train, y_train, batch_size)
    X_val, y_val = generate_rsample(X_val, y_val, batch_size)
    X_test, y_test = generate_rsample(X_test, y_test, batch_size)
    X_train_q1, X_train_q2 = zip(*X_train)
    X_val_q1, X_val_q2 = zip(*X_val)
    X_test_q1, X_test_q2 = zip(*X_test)
    print('len(X_train_q1): ', len(X_train_q1))
    print('len(X_train_q2): ', len(X_train_q2))
    print('len(X_test_q1): ', len(X_test_q1))
    print('len(X_test_q2): ', len(X_test_q2))
    return (X_train_q1, X_train_q2, X_val_q1, X_val_q2, X_test_q1, X_test_q2,
            y_train, y_val, y_test, vocab_dict, glove_matrix)
def fitData(fileName='../data/train.csv', max_len=40, batch_size=512):
    questions1, questions2, y = readData(fileName)
    _, test_q1, test_q2 = read_test_data('../data/test.csv')
    global vocab_processor
    # Fit the vocabulary on the test questions as well, so inference-time
    # words are not mapped to <UNK>.
    vocab_processor = VocabularyProcessor(max_len)
    vocab_processor.fit(questions1 + questions2 + test_q1 + test_q2)
    X_q1 = np.array(list(vocab_processor.transform(questions1)))
    X_q2 = np.array(list(vocab_processor.transform(questions2)))
    vocab_dict = vocab_processor.vocabulary_._mapping
    glove_matrix = read_embeddings(vocab_dict)
    print('Embedding matrix created!')
    print(type(vocab_dict))
    all_data = list(zip(X_q1, X_q2))
    X_train, X_dev, y_train, y_dev = train_test_split(all_data, y, test_size=0.2, random_state=42)
    X_train, y_train = generate_rsample(X_train, y_train, batch_size)
    X_dev, y_dev = generate_rsample(X_dev, y_dev, batch_size)
    X_train_q1, X_train_q2 = zip(*X_train)
    X_dev_q1, X_dev_q2 = zip(*X_dev)
    print('len(X_train_q1): ', len(X_train_q1))
    print('len(X_train_q2): ', len(X_train_q2))
    print('len(X_dev_q2): ', len(X_dev_q2))
    return X_train_q1, X_train_q2, y_train, X_dev_q1, X_dev_q2, y_dev, vocab_dict, glove_matrix
if __name__ == "__main__": if not os.path.exists(config.vocabulary_path): print("创建词库...") input_iter = create_csv_iter(config.TRAIN_PATH) input_iter = (x[0] + " " + x[1] for x in input_iter) vocab = create_vocab(input_iter, min_frequency=config.min_word_frequency) print("词库大小: {}".format(len(vocab.vocabulary_))) # Create vocabulary.txt file write_vocabulary(vocab, config.vocabulary_path) # 保存词汇库,后面直接restore vocab.save(config.vocabulary_path_bin) else: vocab = VocabularyProcessor.restore(config.vocabulary_path_bin) # Create validation.tfrecords create_tfrecords_file(input_filename=VALIDATION_PATH, output_filename=os.path.join(FLAGS.output_dir, "validation.tfrecords"), example_fn=functools.partial(create_example_test, vocab=vocab)) # Create test.tfrecords create_tfrecords_file(input_filename=TEST_PATH, output_filename=os.path.join(FLAGS.output_dir, "test.tfrecords"), example_fn=functools.partial(create_example_test, vocab=vocab))
import numpy as np
from tensorflow.contrib.learn.python.learn.preprocessing import VocabularyProcessor

x_text = ['This is a cat', 'This must be boy', 'This is a a dog']
max_document_length = max([len(x.split(" ")) for x in x_text])

## Create the VocabularyProcessor object, setting the max length of the documents.
vocab_processor = VocabularyProcessor(max_document_length)

## Transform the documents using the vocabulary.
x = np.array(list(vocab_processor.fit_transform(x_text)))
print(x)

## Extract the word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping
print(vocab_dict)

## Sort the vocabulary dictionary on the basis of values (ids).
## Both statements perform the same task.
# sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
sorted_vocab = sorted(vocab_dict.items(), key=lambda x: x[1])

## Treat the ids as indices into a list and create a list of words in
## ascending order of id: the word with id i goes at index i of the list.
vocabulary = list(list(zip(*sorted_vocab))[0])
print("Vocabulary : ")
print(vocabulary)
print("Transformed documents : ")
print(x)
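## A quick follow-up on the arrays above: VocabularyProcessor can also map id
## sequences back to text via reverse(); padding ids come back as <UNK>.
for text in vocab_processor.reverse(x):
    print(text)  # e.g. 'This is a cat <UNK>' for the first, zero-padded row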
def __init__(self, slot_vocab, data_path: str = None):
    if DataSet.__word_vocab is None:
        def space_tokenizer_fn(iterator):
            # Yield the whole word list once, so each call to transform()
            # treats its argument as a single pre-tokenized sentence.
            yield iterator

        DataSet.__word_vocab = VocabularyProcessor(
            max_document_length=DataSet.MAX_SENTENCE_LENGTH,
            tokenizer_fn=space_tokenizer_fn)

    if type(slot_vocab) is str:
        self.__slot_vocab = DataSet.__load_slot_vocab(slot_vocab)
    elif type(slot_vocab) is dict:
        self.__slot_vocab = slot_vocab
    else:
        raise ValueError('slot_vocab error.')

    self.__epoch = 1
    self.__last_idx = 0

    if data_path is None:
        self.__inputs = []
        self.__lengths = []
        self.__masks = []
        self.__labels = []
        self.__size = 0
    else:
        data = []
        target = []
        with open(data_path, 'r') as file:
            for line in file:
                # Expand multi-word annotations (IOB_REGEX groups: phrase,
                # slot) into word/b-slot word/i-slot pairs.
                for match in DataSet.IOB_REGEX.finditer(line):
                    tokens = match.group(1).split(' ')
                    iob = ' '.join([
                        '{}/{}-{}'.format(tokens[i],
                                          (i == 0 and 'b' or 'i'),
                                          match.group(2))
                        for i in range(len(tokens))
                    ]).strip()
                    line = line.replace(match.group(0), iob)

                words = []
                tags = []
                tokens = line.strip().lower().split(' ')
                for token in tokens:
                    if '/' in token:
                        part = token.partition('/')
                        words.append(part[0])
                        tags.append(part[2])
                    else:
                        words.append(token)
                        tags.append('o')

                if len(words) > DataSet.MAX_SENTENCE_LENGTH:
                    raise OverflowError('size:%d, %s' % (len(words), line))

                data.append(words)
                target.append(tags)

        result = self.__parse_data(data, target)
        self.__inputs = result['inputs']
        self.__lengths = result['lengths']
        self.__masks = result['masks']
        self.__labels = result['labels']
        self.__size = len(data)
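# A hypothetical mini-example of the word/tag split performed above, for a
# line that is already word/tag annotated (the project's IOB_REGEX expansion
# step is skipped here):
line = 'book/o a/o flight/b-toloc to/o denver/b-city'
words, tags = [], []
for token in line.strip().lower().split(' '):
    head, sep, tail = token.partition('/')
    words.append(head)
    tags.append(tail if sep else 'o')  # untagged tokens default to 'o'
print(words)  # ['book', 'a', 'flight', 'to', 'denver']
print(tags)   # ['o', 'o', 'b-toloc', 'o', 'b-city']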
class NeuralPosTagger:

    MAX_SENTENCE_LENGTH = 100

    __CHAR_PROCESSOR_NAME = 'char_processor.pkl'
    __TAG_PROCESSOR_NAME = 'tag_processor.pkl'

    def __init__(self, model_dir):
        self.__params = {
            'batch_size': 1000,
            'epoch_size': 2,
            'cell_size': 300,
            'char_embedding_size': 300,
            'learning_rate': 0.001,
        }

        self.__model_path = model_dir
        self.__char_processor_path = os.path.join(self.__model_path,
                                                  self.__CHAR_PROCESSOR_NAME)
        self.__tag_processor_path = os.path.join(self.__model_path,
                                                 self.__TAG_PROCESSOR_NAME)

        self.__char_processor = None
        self.__tag_processor = None
        self.__estimator = None

        # Restore the saved processors and estimator if a trained model exists.
        if os.path.exists(self.__model_path):
            self.__char_processor = VocabularyProcessor.restore(
                self.__char_processor_path)
            self.__tag_processor = VocabularyProcessor.restore(
                self.__tag_processor_path)
            self.__estimator = self.__create_estimator()

    def __create_estimator(self):
        params = dict(self.__params)
        params.update({
            'output_size': len(self.__tag_processor.vocabulary_),
            'vocab_size': len(self.__char_processor.vocabulary_),
        })

        return tf.estimator.Estimator(
            model_fn=self.__model_fn,
            model_dir=self.__model_path,
            config=tf.estimator.RunConfig(
                save_summary_steps=10,
                save_checkpoints_steps=10,
            ),
            params=params,
        )

    @staticmethod
    def char_tokenizer_fn(raw):
        # One document = one sentence split into characters.
        return [[ch for ch in raw]]

    @staticmethod
    def tag_tokenizer_fn(raw):
        # One document = one space-separated tag sequence.
        return [raw.split(' ')]

    def __input_fn(self, inputs, epoch=1, shuffle=False):
        batch_size = self.__params['batch_size'] \
            if self.__params['batch_size'] > 0 else len(inputs)
        max_length = self.MAX_SENTENCE_LENGTH

        def gen(records: list):
            for record in records:
                yield {
                    'ids': record['x'],
                    'length': record['length']
                    if record['length'] < max_length else max_length,
                    'mask': [
                        1 if n < record['length'] else 0
                        for n in range(max_length)
                    ],
                }, record['y']

        dataset = tf.data.Dataset.from_generator(
            lambda: gen(inputs),
            ({'ids': tf.int32, 'length': tf.int32, 'mask': tf.int32},
             tf.int32),
            ({'ids': tf.TensorShape([max_length]),
              'length': tf.TensorShape([]),
              'mask': tf.TensorShape([max_length])},
             tf.TensorShape([max_length])))

        if shuffle:
            dataset = dataset.shuffle(batch_size)

        dataset = dataset.batch(batch_size)
        dataset = dataset.repeat(epoch)

        iterator = dataset.make_one_shot_iterator()
        features, label = iterator.get_next()
        return features, label

    @staticmethod
    def __model_fn(features, labels, mode, params):
        cell_size = params['cell_size']
        output_size = params['output_size']
        vocab_size = params['vocab_size']
        embedding_size = params['char_embedding_size']
        learning_rate = params['learning_rate']
        keep_prob = 1.0 if mode != tf.estimator.ModeKeys.TRAIN else 0.5

        ids = features['ids']
        length = features['length']
        mask = features['mask']

        char_embeddings = tf.get_variable(
            name='char_embeddings',
            shape=[vocab_size, embedding_size],
            initializer=tf.random_uniform_initializer(-1, 1))
        inputs = tf.nn.embedding_lookup(char_embeddings, ids)

        def rnn_cell(cell_size):
            cell = tf.contrib.rnn.GRUCell(cell_size)
            cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=keep_prob)
            return cell

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=rnn_cell(cell_size),
            cell_bw=rnn_cell(cell_size),
            inputs=inputs,
            sequence_length=length,
            dtype=tf.float32)

        # Merge the forward and backward passes, then project to tag space.
        outputs = outputs[0] + outputs[1]
        outputs = tf.layers.dense(
            inputs=outputs,
            units=output_size,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        predictions = tf.argmax(outputs, 2)

        loss = None
        if mode != tf.estimator.ModeKeys.PREDICT:
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=tf.one_hot(labels, output_size,
                                         dtype=tf.float32),
                logits=outputs,
                weights=mask)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            weights = []
            precisions = []
            recalls = []
            for label in range(output_size):
                y_true = tf.equal(labels, label)
                y_pred = tf.equal(predictions, label)
                weights.append(tf.metrics.mean(y_true, mask))
                precisions.append(tf.metrics.precision(y_true, y_pred, mask))
                recalls.append(tf.metrics.recall(y_true, y_pred, mask))

            def compute_mean(values, weights):
                # Weighted average over the per-tag metrics; each entry is a
                # (value, update_op) pair, so both members are combined.
                return (
                    tf.reduce_sum([
                        tf.multiply(v[0], w[0])
                        for v, w in zip(values, weights)
                    ]),
                    tf.reduce_sum([
                        tf.multiply(v[1], w[1])
                        for v, w in zip(values, weights)
                    ]),
                )

            precision = compute_mean(precisions, weights)
            recall = compute_mean(recalls, weights)

            def compute_f1(precision, recall):
                # Harmonic mean: f1 = 2 * p * r / (p + r).
                return (tf.multiply(
                    2.0,
                    tf.div(tf.multiply(precision[0], recall[0]),
                           tf.add(precision[0], recall[0]))),
                        tf.multiply(
                            2.0,
                            tf.div(tf.multiply(precision[1], recall[1]),
                                   tf.add(precision[1], recall[1]))))

            eval_metric_ops = {
                'accuracy': tf.metrics.accuracy(labels, predictions, mask),
                'precision': precision,
                'recall': recall,
                'f1': compute_f1(precision, recall),
            }

        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            learning_rate = tf.train.exponential_decay(
                learning_rate=learning_rate,
                global_step=tf.train.get_global_step(),
                decay_steps=10,
                decay_rate=0.96)

            train_op = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(
                    loss=loss,
                    global_step=tf.train.get_global_step(),
                )

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)

    def train(self, corpus_path):
        self.__char_processor = VocabularyProcessor(
            max_document_length=self.MAX_SENTENCE_LENGTH,
            tokenizer_fn=NeuralPosTagger.char_tokenizer_fn)
        self.__tag_processor = VocabularyProcessor(
            max_document_length=self.MAX_SENTENCE_LENGTH,
            tokenizer_fn=NeuralPosTagger.tag_tokenizer_fn)

        training_corpus = Corpus(corpus_path)
        items = [{
            'x': list(self.__char_processor.transform(item['text']))[0],
            'y': list(self.__tag_processor.transform(item['tag']))[0],
            'length': item['length']
        } for item in training_corpus.items()]

        # transform() above grows the still-unfrozen vocabularies; fitting on
        # an empty string freezes them before they are saved.
        self.__char_processor.fit('')
        self.__tag_processor.fit('')

        if os.path.exists(self.__model_path):
            shutil.rmtree(self.__model_path)
        os.makedirs(self.__model_path)

        self.__char_processor.save(self.__char_processor_path)
        self.__tag_processor.save(self.__tag_processor_path)
        self.__estimator = self.__create_estimator()

        print('Training: %d' % len(training_corpus))
        print('Character: %d, Tag: %d' %
              (len(self.__char_processor.vocabulary_),
               len(self.__tag_processor.vocabulary_)))

        # Hold out 20% of the shuffled items as a dev set.
        random.shuffle(items)
        pivot = int(len(items) * 0.8)
        train_set = items[:pivot]
        dev_set = items[pivot:]

        class ValidationHook(tf.train.SessionRunHook):

            def __init__(self, estimator, input_fn, dataset):
                self.__every_n_steps = 100
                self.__estimator = estimator
                self.__input_fn = input_fn
                self.__dataset = dataset

            def before_run(self, run_context):
                graph = run_context.session.graph
                return tf.train.SessionRunArgs(tf.train.get_global_step(graph))

            def after_run(self, run_context, run_values):
                if run_values.results % self.__every_n_steps == 0:
                    result = self.__estimator.evaluate(
                        input_fn=lambda: self.__input_fn(self.__dataset),
                    )
                    print('#%d %s' % (run_values.results, result))

        self.__estimator.train(
            input_fn=lambda: self.__input_fn(
                train_set, epoch=self.__params['epoch_size'], shuffle=True),
            hooks=[ValidationHook(self.__estimator, self.__input_fn, dev_set)],
        )
        print('Training completed.')

    def evaluate(self, corpus_path):
        test_corpus = Corpus(corpus_path)
        test_set = [{
            'x':
                list(self.__char_processor.transform(item['text']))[0],
            'y': list(self.__tag_processor.transform(item['tag']))[0],
            'length': item['length']
        } for item in test_corpus.items()]

        result = self.__estimator.evaluate(
            input_fn=lambda: self.__input_fn(test_set),
        )
        print('Test: %d' % len(test_corpus))
        print(result)

    def predict(self, characters: list):
        data_set = [{
            'x': list(self.__char_processor.transform(characters))[0],
            'y': [0 for _ in range(self.MAX_SENTENCE_LENGTH)],
            'length': len(characters)
        }]

        result = list(
            self.__estimator.predict(
                input_fn=lambda: self.__input_fn(data_set),
            ))[0][:len(characters)]
        result = list(self.__tag_processor.reverse([result]))[0]
        result = result.split(' ')
        return result
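# A hypothetical end-to-end usage sketch for NeuralPosTagger; the model
# directory and corpus paths are placeholders, and Corpus is the project's
# own reader:
tagger = NeuralPosTagger('model/')
tagger.train('corpus/train.txt')    # fits both processors and trains the estimator
tagger.evaluate('corpus/test.txt')  # prints masked accuracy/precision/recall/f1
print(tagger.predict(list('time flies')))  # one predicted tag per character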
def word_identify(self, dataframe):
    contents = dataframe["content"].values.tolist()
    vocab_processor = VocabularyProcessor(self.max_document_length)
    word_ids = np.array(list(vocab_processor.fit_transform(contents)))
    # Ids start at 0 (<UNK>), so the table needs max_id + 1 rows.
    self.vocabulary_size = np.max(word_ids) + 1
    return word_ids
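# Hypothetical usage, assuming the enclosing class (here called
# `preprocessor`) defines max_document_length; the DataFrame only needs a
# 'content' column:
import pandas as pd

frame = pd.DataFrame({'content': ['the cat sat', 'the dog ran away']})
word_ids = preprocessor.word_identify(frame)  # shape (2, max_document_length)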