def __init__(self, conf):
    self.task_type = 'classify'
    self.conf = conf
    for attr in conf:
        setattr(self, attr, conf[attr])
    self.pre = Preprocess()
    self.model_loaded = False
    self.zdy = {}
    csv = pd.read_csv(self.ori_path, header=0, sep=",", error_bad_lines=False)
    self.text_list = list(csv['text'])
    self.label_list = list(csv['target'])
    self.num_class = len(set(self.label_list))
    self.num_output = self.num_class
    logging.info(f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
    for idx, text in enumerate(self.text_list):
        self.text_list[idx] = self.pre.get_dl_input_by_text(text)
        if len(self.text_list[idx]) == 0:
            logging.error(f"found blank line at index {idx}")
    self.conf.update({
        "maxlen": self.maxlen,
        "maxlen1": self.maxlen,
        "maxlen2": self.maxlen,
        "num_class": self.num_class,
        "embedding_size": self.embedding_size,
        "batch_size": self.batch_size,
        "num_output": self.num_output,
        "keep_prob": 1,
        "is_training": False,
    })
    self.encoder = encoder[self.encoder_type](**self.conf)
def __init__(self, conf):
    self.conf = conf
    self.task_type = 'classify'
    for attr in conf:
        setattr(self, attr, conf[attr])
    self.is_training = tf.placeholder(tf.bool, [], name="is_training")
    self.global_step = tf.Variable(0, trainable=False)
    self.keep_prob = tf.where(self.is_training, 0.5, 1.0)
    self.pre = Preprocess()
    self.text_list, self.label_list = load_classify_data(self.train_path)
    self.text_list = [self.pre.get_dl_input_by_text(text) for text in self.text_list]
    if not self.use_language_model:
        # build vocabulary map using training data
        self.vocab_dict = embedding[self.embedding_type].build_dict(
            dict_path=self.dict_path,
            text_list=self.text_list)
        # define embedding object by embedding_type
        self.embedding = embedding[self.embedding_type](
            text_list=self.text_list,
            vocab_dict=self.vocab_dict,
            dict_path=self.dict_path,
            random=self.rand_embedding,
            batch_size=self.batch_size,
            maxlen=self.maxlen,
            embedding_size=self.embedding_size,
            conf=self.conf)
        self.embed = self.embedding(name='x')
    self.y = tf.placeholder(tf.int32, [None], name="y")
    # model params
    params = conf
    params.update({
        "maxlen": self.maxlen,
        "embedding_size": self.embedding_size,
        "keep_prob": self.keep_prob,
        "batch_size": self.batch_size,
        "num_output": self.num_class,
        "is_training": self.is_training
    })
    self.encoder = encoder[self.encoder_type](**params)
    if not self.use_language_model:
        self.out = self.encoder(self.embed)
    else:
        self.out = self.encoder()
    self.output_nodes = self.out.name.split(':')[0]
    self.loss(self.out)
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    if self.use_language_model:
        tvars = tf.trainable_variables()
        init_checkpoint = conf['init_checkpoint_path']
        (assignment_map, initialized_variable_names) = \
            get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
def main(_):
    for map_name in env_names:
        if rl_algo == 'ddpg':
            from agent.ddpg import DDPGAgent
            from networks.acnetwork_q_seperated import ActorNet, CriticNet
            from utils.memory import SequentialMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = SequentialMemory(limit=arglist.DDPG.memory_limit)
            learner = DDPGAgent(actor, critic, memory)
        elif rl_algo == 'ppo':
            from agent.ppo import PPOAgent
            from networks.acnetwork_v_seperated import ActorNet, CriticNet
            from utils.memory import EpisodeMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = EpisodeMemory(limit=arglist.PPO.memory_limit,
                                   action_shape=arglist.action_shape,
                                   observation_shape=arglist.observation_shape)
            learner = PPOAgent(actor, critic, memory)
        else:
            raise NotImplementedError()

        preprocess = Preprocess()
        game = MiniGame(map_name, learner, preprocess, nb_episodes=10000)
        # the game loop entry point is named run_ddpg() and is used for both algorithms
        game.run_ddpg()
    return 0
def read_data(self):
    self.pre = Preprocess()
    encode_list, decode_list, target_list = load_chat_data(self.ori_path)
    self.text_list = encode_list + decode_list
    self.label_list = target_list
    self.data_type = 'translation'
class Classify(TaskBase):
    def __init__(self, conf):
        super(Classify, self).__init__(conf)
        self.task_type = 'classify'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" % self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path, header=0, sep="\t", error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error("found blank line at index %s" % idx)
        self.data_type = 'column_2'

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            loss = get_loss(type=self.loss_type, logits=pred, labels=labels,
                            labels_sparse=True, **conf)
            return loss

        def model_fn(features, labels, mode, params):
            # model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            ############# encoder #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                self.embed_query = self.embedding(features=features, name='x_query')
                out = self.encoder(self.embed_query, name='x_query', features=features)
            else:
                out = self.encoder(features=features)
            #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class))
            pred = tf.nn.softmax(out)
            pred_labels = tf.argmax(pred, axis=-1)
            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            ############### loss ##################
            loss = cal_loss(pred, labels, self.batch_size, self.conf)
            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step, params)
            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {
                    "accuracy": tf.metrics.accuracy(labels=labels,
                                                    predictions=pred_labels)
                }
                return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                                  eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                "num_classes_per_batch is %s > %s" % (num_classes_per_batch,
                                                      self.num_class)
            num_sentences_per_class = self.batch_size // num_classes_per_batch
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path, item)
                             for item in os.listdir(self.tfrecords_path)
                             if item.startswith('train')]
            assert size == len(filenames), "each file represents one class"
            logging.info("tfrecords train class num: {}".format(len(filenames)))
            logging.info("tfrecords num_sentences_per_class: {}".format(
                num_sentences_per_class))
            logging.info("tfrecords num_classes_per_batch: {}".format(
                num_classes_per_batch))
            datasets = [tf.data.TFRecordDataset(filename) for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]
            #assert self.batch_size == num_sentences_per_class * num_classes_per_batch

            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features, label = sess.run([features, label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                            features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith(mode)]
            assert self.num_class == len(filenames), \
                "unexpected number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.7}
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]" % mode)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query': tf.placeholder(dtype=tf.int64,
                                          shape=[None, self.maxlen],
                                          name='x_query'),
                'x_query_length': tf.placeholder(dtype=tf.int64,
                                                 shape=[None],
                                                 name='x_query_length'),
                'label': tf.placeholder(dtype=tf.int64,
                                        shape=[None],
                                        name='label')
            }
            features.update(self.encoder.features)
            return features

        self.save_model(self.create_model_fn(), params, get_features)
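The train_input_fn above builds class-balanced batches by keeping one TFRecord file per class and interleaving them with a generator-driven choice dataset. A minimal, self-contained sketch of the same technique under TF 1.x, with toy datasets standing in for the per-class TFRecord files (sizes and class counts are illustrative):

import numpy as np
import tensorflow as tf

num_classes = 4
classes_per_batch = 2
examples_per_class = 3
batch_size = classes_per_batch * examples_per_class

# one toy dataset per class, repeated so sampling never exhausts them
datasets = [
    tf.data.Dataset.from_tensor_slices(np.full(100, c, dtype=np.int64)).repeat()
    for c in range(num_classes)
]

def generator():
    # pick a few distinct classes, then emit each class index several times
    while True:
        labels = np.random.choice(num_classes, classes_per_batch, replace=False)
        for label in labels:
            for _ in range(examples_per_class):
                yield label

choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
# tf.data.experimental.choose_from_datasets is the newer alias of this call
dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
dataset = dataset.batch(batch_size)

iterator = dataset.make_one_shot_iterator()
batch = iterator.get_next()
with tf.Session() as sess:
    print(sess.run(batch))  # e.g. [2 2 2 0 0 0] -- each batch covers exactly 2 classes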
import sys
sys.path.append(".")

import pickle
from utils.preprocess import Preprocess

f = open("./train/chatbot_bin.bin", "rb")
word_index = pickle.load(f)
f.close()

sent = "갑자기 짜장면 먹고 싶네 ㅋㅋ"  # "suddenly craving jajangmyeon haha"

p = Preprocess("./train/chatbot_bin.bin")
pos = p.pos(sent)
keywords = p.get_keywords(pos, without_tag=True)
print(p.word_index)
print(p.get_wordidx_sequence(keywords))
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        print(word, word_index["OOV"])
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        if self.maxlen == -1:
            self.maxlen = max([len(text.split()) for text in self.text_list])
        # model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })
        #params['num_output'] = 128
        #self.encoder_base = encoder['transformer'](**params)
        #params['num_output'] = self.num_class
        self.encoder = encoder[self.encoder_type](**params)

    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text, self.use_generalization)
                          for text in self.text_list]
        self.num_class = self.num_output = \
            len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            if mode == tf.estimator.ModeKeys.TRAIN:
                self.encoder.keep_prob = 0.5
                self.encoder.is_training = True
            else:
                self.encoder.keep_prob = 1
                self.encoder.is_training = False
            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()
            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features=features, name='x_query')
                out = self.encoder(embed, 'x_query', features=features,
                                   middle_flag=True)
                #out = self.encoder_base(embed, 'x_query', features=features, middle_flag=True)
                #out = self.encoder(out, 'x_query', features=features, middle_flag=True)
            else:
                out = self.encoder(features=features)
            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])
            batch_size = get_placeholder_batch_size(logits)
            # augment the tag set with a synthetic <start> tag: prepend a step
            # whose only feasible tag is the extra one, so the CRF learns
            # transitions out of a well-defined start state; real timesteps get
            # a very negative logit for the extra tag and can never take it
            small = -1000
            start_logits = tf.concat([
                small * tf.ones(shape=[batch_size, 1, self.num_class]),
                tf.zeros(shape=[batch_size, 1, 1])], axis=-1)
            pad_logits = tf.cast(small * tf.ones(shape=[batch_size, self.maxlen, 1]),
                                 tf.float32)
            logits = tf.concat([logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            seq_len += 1
            transition_params = tf.get_variable(
                'crf', [self.num_class + 1, self.num_class + 1], dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params,
                                                    seq_len)
            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            else:
                ############### loss ####################
                labels = tf.concat([
                    tf.cast(self.num_class * tf.ones(shape=[batch_size, 1]),
                            tf.int64),
                    labels
                ], axis=-1)
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                    logits, labels, seq_len, transition_params)
                loss = -tf.reduce_mean(log_likelihood)
                if mode == tf.estimator.ModeKeys.TRAIN:
                    return self.train_estimator_spec(mode, loss, global_step,
                                                     params)
                if mode == tf.estimator.ModeKeys.EVAL:
                    weights = tf.sequence_mask(seq_len, self.maxlen + 1)
                    metrics = {'acc': tf.metrics.accuracy(labels, pred_ids,
                                                          weights)}
                    return tf.estimator.EstimatorSpec(mode, loss=loss,
                                                      eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path, item)
                             for item in os.listdir(self.tfrecords_path)
                             if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.shuffle(buffer_size=100 * self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            dataset = dataset.batch(self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!" % mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        def get_features():
            features = {
                'x_query': tf.placeholder(dtype=tf.int64,
                                          shape=[None, self.maxlen],
                                          name='x_query'),
                'x_query_length': tf.placeholder(dtype=tf.int64,
                                                 shape=[None],
                                                 name='x_query_length'),
            }
            features.update(self.encoder.get_features())
            return features

        self.save_model(self.create_model_fn(), None, get_features)

    def train(self):
        estimator = self.get_train_estimator(self.create_model_fn(), None)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]" % mode)

    def train_and_evaluate(self):
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path,
                                        save_checkpoints_steps=self.save_interval,
                                        keep_checkpoint_max=5)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config)
        early_stop = tf.estimator.experimental.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name="loss",
            max_steps_without_decrease=estimator.config.save_checkpoints_steps * 2,
            run_every_secs=None,
            run_every_steps=estimator.config.save_checkpoints_steps,
        )
        train_spec = tf.estimator.TrainSpec(
            input_fn=self.create_input_fn("train"),
            max_steps=self.max_steps,
            hooks=[early_stop])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=self.create_input_fn("dev"),
            steps=None,
            start_delay_secs=1,  # start evaluating after N seconds
            throttle_secs=10,    # evaluate every N seconds
        )
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        self.save()
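Both NER variants delegate sequence labelling to tf.contrib.crf. A minimal standalone sketch of that API on dummy logits, assuming TF 1.x; the shapes and tag count are illustrative:

import numpy as np
import tensorflow as tf

batch, maxlen, num_tags = 2, 5, 3
logits = tf.constant(np.random.randn(batch, maxlen, num_tags), dtype=tf.float32)
labels = tf.constant(np.random.randint(num_tags, size=(batch, maxlen)),
                     dtype=tf.int32)
seq_len = tf.constant([5, 3], dtype=tf.int32)  # true (unpadded) lengths

transition_params = tf.get_variable('crf', [num_tags, num_tags], dtype=tf.float32)

# training loss: negative mean log-likelihood of the gold tag sequences
log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
    logits, labels, seq_len, transition_params)
loss = -tf.reduce_mean(log_likelihood)

# inference: Viterbi decode under the learned transition matrix
pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([loss, pred_ids]))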
class Match(object):
    def __init__(self, conf):
        self.task_type = 'match'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.graph = tf.get_default_graph()
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path, header=0, sep=",", error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        logging.info(f">>>>>>>>>>>>>>class num:{self.num_class}")
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def init_embedding(self):
        self.vocab_dict = embedding[self.embedding_type].build_dict(
            dict_path=self.dict_path,
            text_list=self.text_list,
            mode=self.mode)
        self.embedding = embedding[self.embedding_type](
            text_list=self.text_list,
            vocab_dict=self.vocab_dict,
            dict_path=self.dict_path,
            random=self.rand_embedding,
            maxlen=self.maxlen,
            batch_size=self.batch_size,
            embedding_size=self.embedding_size,
            conf=self.conf)

    def prepare(self):
        self.init_embedding()
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list,
                        self.embedding.text2id,
                        self.encoder.encoder_fun,
                        self.vocab_dict,
                        self.tfrecords_path,
                        self.label_path)

    def cal_loss(self, pred, labels, pos_target, neg_target, batch_size, conf):
        if self.loss_type == 'hinge_loss':
            if self.sub_loss_type == 'all':
                loss = batch_all_triplet_loss(labels, pred, conf['margin'])
            else:
                loss = batch_hard_triplet_loss(labels, pred, conf['margin'])
        else:
            loss = get_loss(type=self.loss_type, logits=pred, labels=labels, **conf)
        return loss

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            if not self.use_language_model:
                self.init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                else:
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                    self.embed_sample = self.embedding(features=features,
                                                       name='x_sample')
            else:
                self.embedding = None
            # model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if self.sim_mode == 'cross':
                if not self.use_language_model:
                    pred = self.encoder(x_query=self.embed_query,
                                        x_sample=self.embed_sample,
                                        features=features)
                else:
                    pred = self.encoder(features=features)
            elif self.sim_mode == 'represent':
                if not self.use_language_model:
                    #features['x_query_length'] = features['length']
                    pred = self.encoder(self.embed_query, name='x_query',
                                        features=features)
                else:
                    pred = self.encoder(features=features)
            else:
                raise ValueError('unknown sim mode')
            pos_target = tf.ones(shape=[int(self.batch_size / 2)],
                                 dtype=tf.float32)
            neg_target = tf.zeros(shape=[int(self.batch_size / 2)],
                                  dtype=tf.float32)
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'pred': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            loss = self.cal_loss(pred, labels, pos_target, neg_target,
                                 self.batch_size, self.conf)
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(
                        global_step=global_step,
                        learning_rate=self.learning_rate,
                        mode=self.clr_mode)
                optimizer = get_train_op(global_step, self.optimizer_type, loss,
                                         self.learning_rate, clip_grad=5)
                return tf.estimator.EstimatorSpec(mode, loss=loss,
                                                  train_op=optimizer)
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                                  eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            if self.tfrecords_mode == 'pair':
                size = self.num_pair
                num_classes_per_batch = 2
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            else:
                size = self.num_class
                num_classes_per_batch = 16
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path, i)
                         for i in range(size)]
            logging.info("tfrecords train class num: {}".format(len(filenames)))
            datasets = [tf.data.TFRecordDataset(filename)
                        for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]
            #assert self.batch_size == num_sentences_per_class * num_classes_per_batch

            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features, label = sess.run([features, label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                            features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path, mode, i)
                         for i in range(self.num_class)]
            assert self.num_class == len(filenames), \
                "unexpected number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'label':
            return lambda: test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.5}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config, params=params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config, params=params)

        def serving_input_receiver_fn():
            x_query = tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen],
                                     name='x_query')
            length = tf.placeholder(dtype=tf.int64, shape=[None],
                                    name='x_query_length')
            label = tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            receiver_tensors = {'x_query': x_query, 'x_query_length': length,
                                'label': label}
            features = {'x_query': x_query, 'x_query_length': length,
                        'label': label}
            return tf.estimator.export.ServingInputReceiver(receiver_tensors,
                                                            features)

        estimator.export_savedmodel(
            self.export_dir_path,       # export directory
            serving_input_receiver_fn,  # function returning a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config, params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)
        predictions_vec = [item['pred'] for item in predictions]
        predictions_label = [item['label'] for item in predictions]
        if self.tfrecords_mode == 'class':
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers)
            refers_vec = [item['pred'] for item in refers]
            refers_label = [item['label'] for item in refers]
            right = 0
            thre_right = 0
            sum = 0
            scores = cosine_similarity(predictions_vec, refers_vec)
            max_id = np.argmax(scores, axis=-1)
            #max_id = self.knn(scores, predictions_label, refers_label)
            for idx, item in enumerate(max_id):
                if refers_label[item] == predictions_label[idx]:
                    if scores[idx][item] > self.score_thre:
                        thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right) / sum))
            print("ThreAcc:{}".format(float(thre_right) / sum))
        else:
            #TODO: evaluation for pair mode
            pdb.set_trace()

    def knn(self, scores, predictions_label, refers_label, k=4):
        sorted_id = np.argsort(-scores, axis=-1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp, key=mp.get))
        return max_id

    def test_unit(self, text):
        ####################### init #########################
        if not self.model_loaded:
            # add samples that do not take part in training
            if os.path.exists(self.no_train_path):
                csv = pd.read_csv(self.no_train_path, header=0, sep=",",
                                  error_bad_lines=False)
                self.text_list += list(csv['text'])
                self.label_list += list(csv['target'])
            subdirs = [x for x in Path(self.export_dir_path).iterdir()
                       if x.is_dir() and 'temp' not in str(x)]
            latest = str(sorted(subdirs)[-1])
            self.predict_fn = predictor.from_saved_model(latest)
            self.init_embedding()
            self.model_loaded = True
            self.vec_list = self._get_vecs(self.predict_fn, self.text_list)
            #self.set_zdy_labels(['睡觉','我回家了','晚安','娃娃了','周杰伦','自然语言处理'],
            #                    ['打开情景模式','打开情景模式','打开情景模式',
            #                     '打开情景模式','打开情景模式','打开情景模式'])
        text_list = self.text_list
        vec_list = self.vec_list
        label_list = self.label_list
        # support user-defined queries (user-defined entries take priority)
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list],
                                      axis=0)
            label_list = self.zdy['label_list'] + label_list
        vec = self._get_vecs(self.predict_fn, [text], need_preprocess=True)
        scores = cosine_similarity(vec, vec_list)[0]
        max_id = np.argmax(scores)
        max_score = scores[max_id]
        max_similar = text_list[max_id]
        logging.info("test result: {}, {}, {}".format(label_list[max_id],
                                                      max_score, max_similar))
        return label_list[max_id], max_score, max_id

    def set_zdy_labels(self, text_list, label_list):
        if len(text_list) == 0 or len(label_list) == 0:
            self.zdy = {}
            return
        self.zdy['text_list'] = text_list
        self.zdy['vec_list'] = self._get_vecs(self.predict_fn, text_list,
                                              need_preprocess=True)
        self.zdy['label_list'] = label_list

    def _get_vecs(self, predict_fn, text_list, need_preprocess=False):
        # generate vectors from batched data
        text_list_pred, x_query, x_query_length = self.embedding.text2id(
            text_list, self.vocab_dict, need_preprocess)
        label = [0 for _ in range(len(text_list))]
        predictions = predict_fn({'x_query': x_query,
                                  'x_query_length': x_query_length,
                                  'label': label})
        return predictions['pred']
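test_unit above reloads the most recent export with tf.contrib.predictor and feeds the serving placeholders directly. A minimal sketch of that loading pattern, assuming a SavedModel produced by save() above; the export path and the maxlen of 5 implied by the dummy input are placeholders:

from pathlib import Path
from tensorflow.contrib import predictor

export_dir = './export'  # placeholder for export_dir_path used by save()
subdirs = [x for x in Path(export_dir).iterdir()
           if x.is_dir() and 'temp' not in str(x)]
predict_fn = predictor.from_saved_model(str(sorted(subdirs)[-1]))

# feed the placeholders declared in serving_input_receiver_fn
out = predict_fn({'x_query': [[1, 2, 3, 0, 0]],  # dummy ids padded to maxlen
                  'x_query_length': [3],
                  'label': [0]})
print(out['pred'])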
import sys
sys.path.append('..')

import pickle
from utils.preprocess import Preprocess

f = open('../train_tools/dict/chatbot_dict.bin', 'rb')
word_index = pickle.load(f)
f.close()

sentence = "오늘 오후 5시 30분에 닭고기를 먹고 싶어 ㅎㅎㅎ"  # "I want to eat chicken at 5:30 this afternoon haha"

# create the preprocessing object
p = Preprocess(userdic='../utils/user_dic.tsv')

# run the morphological analyzer
pos = p.pos(sentence)

# print keywords together with their POS tags
keywords = p.get_keywords(pos, without_tag=True)
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        # words missing from the dictionary map to the OOV index
        print(word, word_index['OOV'])
import pickle

from tensorflow.keras import preprocessing
from utils.preprocess import Preprocess

# read the corpus data
def read_corpus_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]  # drop the header
    return data

# load the corpus data
corpus_data = read_corpus_data('./corpus.txt')

# extract keywords from the corpus to build the dictionary list
p = Preprocess()
dict = []
for c in corpus_data:
    pos = p.pos(c[1])
    for k in pos:
        dict.append(k[0])

# build the word2index mapping used as the dictionary;
# the first index is reserved for OOV (out-of-vocabulary) words
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(dict)
word_index = tokenizer.word_index

# write the dictionary file
f = open("chatbot_dict.bin", 'wb')
try:
    pickle.dump(word_index, f)
finally:
    f.close()
class Match(TaskBase):
    def __init__(self, conf):
        super(Match, self).__init__(conf)
        self.task_type = 'match'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" % self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path, header=0, sep="\t",
                          error_bad_lines=False)
        if 'text' in csv.keys() and 'target' in csv.keys():
            # format: text \t target
            # for this format, each class should have more than 2 samples
            self.text_list = list(csv['text'])
            self.label_list = list(csv['target'])
            self.data_type = 'column_2'
        elif 'text_a' in csv.keys() and 'text_b' in csv.keys() \
                and 'target' in csv.keys():
            # format: text_a \t text_b \t target
            # for this format, the target value can only be chosen from 0 or 1
            self.text_a_list = list(csv['text_a'])
            self.text_b_list = list(csv['text_b'])
            self.text_list = self.text_a_list + self.text_b_list
            self.label_list = list(csv['target'])
            self.data_type = 'column_3'
        else:
            raise ValueError('wrong format for train file')
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            if self.tfrecords_mode == 'class':
                pos_scores, neg_scores = batch_hard_triplet_scores(
                    labels, pred, is_distance=self.is_distance)  # pos/neg scores
                pos_scores = tf.squeeze(pos_scores, -1)
                neg_scores = tf.squeeze(neg_scores, -1)
                # in represent mode, pred is a batch of vectors, so we can use
                # triplet (hinge) loss or contrastive loss: hinge loss needs no
                # labels; other losses (e.g. contrastive) need pos/neg targets
                # defined beforehand
                if self.loss_type in ['hinge_loss', 'improved_triplet_loss']:
                    # pairwise
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    **conf)
                else:
                    # pointwise
                    pos_target = tf.ones(shape=[int(self.batch_size)],
                                         dtype=tf.float32)
                    neg_target = tf.zeros(shape=[int(self.batch_size)],
                                          dtype=tf.float32)
                    pos_loss = get_loss(type=self.loss_type, logits=pos_scores,
                                        labels=pos_target, **conf)
                    neg_loss = get_loss(type=self.loss_type, logits=neg_scores,
                                        labels=neg_target, **conf)
                    loss = pos_loss + neg_loss
            elif self.tfrecords_mode in ['pair', 'point']:
                if self.loss_type in ['hinge_loss', 'improved_triplet_loss']:
                    assert self.tfrecords_mode == 'pair', \
                        "only pair mode can provide <query, pos, neg> format data"
                    # pairwise
                    if self.num_output == 1:
                        pred = tf.nn.sigmoid(pred)
                    elif self.num_output == 2:
                        pred = tf.nn.softmax(pred)[:, 0]
                        pred = tf.expand_dims(pred, -1)
                    else:
                        raise ValueError(
                            'unsupported num_output, 1(sigmoid) or 2(softmax)?')
                    pos_scores = tf.strided_slice(pred, [0], [batch_size], [2])
                    neg_scores = tf.strided_slice(pred, [1], [batch_size], [2])
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    **conf)
                elif self.loss_type in ['sigmoid_loss']:
                    # pointwise
                    labels = tf.expand_dims(labels, axis=-1)
                    loss = get_loss(type=self.loss_type, logits=pred,
                                    labels=labels, **conf)
                else:
                    raise ValueError('unsupported loss for pair/point match')
            else:
                raise ValueError('unknown tfrecords_mode?')
            return loss

        def model_fn(features, labels, mode, params):
            # model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            ############# encode #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                    output = self.encoder(self.embed_query, name='x_query',
                                          features=features)
                    output = tf.nn.l2_normalize(output, -1)
                elif self.tfrecords_mode in ['pair', 'point']:
                    if self.sim_mode == 'cross':
                        self.embed_query = self.embedding(features=features,
                                                          name='x_query')
                        self.embed_sample = self.embedding(features=features,
                                                           name='x_sample')
                        output = self.encoder(x_query=self.embed_query,
                                              x_sample=self.embed_sample,
                                              features=features)
                    elif self.sim_mode == 'represent':
                        self.embed_query = self.embedding(features=features,
                                                          name='x_query')
                        self.embed_sample = self.embedding(features=features,
                                                           name='x_sample')
                        query_encode = self.encoder(self.embed_query,
                                                    name='x_query',
                                                    features=features)
                        sample_encode = self.encoder(self.embed_sample,
                                                     name='x_sample',
                                                     features=features)
                        output = self.concat(query_encode, sample_encode)
                        output = tf.layers.dense(
                            output, 1,
                            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001),
                            name='fc')
                    else:
                        raise ValueError('unknown sim_mode, represent or cross')
            else:
                output = self.encoder(features=features)
            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                #pdb.set_trace()
                predictions = {
                    'encode': output,
                    'pred': tf.cast(tf.greater(tf.nn.softmax(output)[:, 0], 0.5),
                                    tf.int32)
                            if self.num_output == 2
                            else tf.cast(tf.greater(tf.nn.sigmoid(output), 0.5),
                                         tf.int32),
                    'score': tf.nn.softmax(output)[:, 0]
                             if self.num_output == 2
                             else tf.nn.sigmoid(output),
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            ############### loss ##################
            loss = cal_loss(output, labels, self.batch_size, self.conf)
            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step, params)
            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                                  eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            if self.tfrecords_mode == 'class':
                #size = self.num_class
                num_classes_per_batch = 32
                assert num_classes_per_batch < self.num_class
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            elif self.tfrecords_mode == 'pair':
                # data order: query, pos, query, neg
                num_sentences_per_class = 4
                num_classes_per_batch = self.batch_size // num_sentences_per_class
            elif self.tfrecords_mode == 'point':
                # data order: query, sample (pos or neg)
                num_classes_per_batch = 2
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            else:
                raise ValueError('unknown tfrecords_mode')
            #filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path, i)
            #             for i in range(size)]
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path, item)
                             for item in os.listdir(self.tfrecords_path)
                             if item.startswith('train')]
            size = len(filenames)
            logging.info("tfrecords train class num: {}".format(size))
            datasets = [tf.data.TFRecordDataset(filename)
                        for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]
            #datasets = [dataset.shuffle(buffer_size=1000) for dataset in datasets]

            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            ##test
            #pdb.set_trace()
            #sess = tf.Session()
            #features1, label1 = sess.run([features, label])
            #features1['x_query_pred'] = [item.decode('utf-8') for item in features1['x_query_pred'][1]]
            #features1['x_sample_pred'] = [item.decode('utf-8') for item in features1['x_sample_pred'][1]]
            return features, label

        def test_input_fn(mode):
            #filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path, mode, i)
            #             for i in range(self.num_class * self.dev_size)]
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith(mode)]
            assert self.num_class == len(filenames), \
                "unexpected number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        elif mode == 'label':
            return lambda: test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.7}
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query': tf.placeholder(dtype=tf.int64,
                                          shape=[None, self.maxlen],
                                          name='x_query'),
                'x_query_length': tf.placeholder(dtype=tf.int64,
                                                 shape=[None],
                                                 name='x_query_length'),
                'label': tf.placeholder(dtype=tf.int64,
                                        shape=[None],
                                        name='label')
            }
            if self.tfrecords_mode in ['pair', 'point']:
                features.update({
                    'x_sample': tf.placeholder(dtype=tf.int64,
                                               shape=[None, self.maxlen],
                                               name='x_sample'),
                    'x_sample_length': tf.placeholder(dtype=tf.int64,
                                                      shape=[None],
                                                      name='x_sample_length')
                })
            features.update(self.encoder.get_features())
            return features

        self.save_model(self.create_model_fn(), params, get_features)

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config, params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn(mode))
        predictions = list(predictions)
        if self.tfrecords_mode == 'class':
            predictions_vec = [item['encode'] for item in predictions]
            predictions_label = [item['label'] for item in predictions]
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers)
            refers_vec = [item['encode'] for item in refers]
            refers_label = [item['label'] for item in refers]
            right = 0
            thre_right = 0
            sum = 0
            if self.is_distance:
                scores = euclidean_distances(predictions_vec, refers_vec)
                selected_ids = np.argmin(scores, axis=-1)
            else:
                scores = cosine_similarity(predictions_vec, refers_vec)
                selected_ids = np.argmax(scores, axis=-1)
            for idx, item in enumerate(selected_ids):
                if refers_label[item] == predictions_label[idx]:
                    if self.is_distance:
                        if 1 - scores[idx][item] > self.score_thre:
                            thre_right += 1
                    else:
                        if scores[idx][item] > self.score_thre:
                            thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right) / sum))
            print("ThreAcc:{}".format(float(thre_right) / sum))
        elif self.tfrecords_mode == 'pair':
            # evaluation for pair mode
            scores = [item['score'] for item in predictions]
            labels = [item['label'] for item in predictions]
            #pdb.set_trace()
            # predictions
            scores = np.reshape(scores, [self.num_class * self.dev_size, -1])
            pred_max_ids = np.argmax(scores, axis=-1)
            # labels
            labels = np.reshape(labels, [self.num_class, -1])
            right = 0
            for idx, max_id in enumerate(pred_max_ids):
                if labels[idx][max_id] == 1:
                    right += 1
            sum = len(pred_max_ids)
            print("Acc:{}".format(float(right) / sum))
        elif self.tfrecords_mode == 'point':
            scores = [item['score'] for item in predictions]
            scores = np.reshape(scores, -1)
            scores = [0 if item < self.score_thre else 1 for item in scores]
            #pred = [item['pred'] for item in predictions]
            labels = [item['label'] for item in predictions]
            res = metrics(labels=labels, logits=np.array(scores))
            print("precision:{} recall:{} f1:{}".format(res[3], res[4], res[5]))

    def concat(self, a, b):
        tmp = tf.concat([a, b], axis=-1)
        #return tmp
        res1 = a * b
        res2 = a + b
        res3 = a - b
        return tf.concat([tmp, res1, res2, res3], axis=-1)

    def knn(self, scores, predictions_label, refers_label, k=4):
        sorted_id = np.argsort(-scores, axis=-1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp, key=mp.get))
        return max_id
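In 'class' mode, test() above scores each query encoding against a bank of labelled reference encodings and takes the nearest label, optionally gated by a score threshold. A standalone numpy/sklearn sketch of that nearest-neighbour evaluation with toy vectors (all values illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# toy encodings: 3 queries, 4 labelled references
queries = np.random.randn(3, 8)
refers = np.random.randn(4, 8)
refers_label = [0, 0, 1, 2]
query_label = [0, 1, 2]
score_thre = 0.5

scores = cosine_similarity(queries, refers)  # shape [3, 4]
nearest = np.argmax(scores, axis=-1)         # index of the best reference

right = thre_right = 0
for idx, ref_id in enumerate(nearest):
    if refers_label[ref_id] == query_label[idx]:
        right += 1
        if scores[idx][ref_id] > score_thre:  # also require a confident match
            thre_right += 1
print("Acc:", right / len(nearest), "ThreAcc:", thre_right / len(nearest))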
import sys
sys.path.append('..')

from utils.preprocess import Preprocess

sentence = "내일 저녁 8시에 닭튀김을 주문하고 싶어"  # "I want to order fried chicken at 8 pm tomorrow"

# create the preprocessing object
# tsv: tab-separated values; a user dictionary for food names and time expressions
p = Preprocess(userdic='../utils/user_dic.tsv')

# run the morphological analyzer
pos = p.pos(sentence)

# print keywords together with their POS tags
ret = p.get_keywords(pos, without_tag=False)
print(ret)
def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = []
            elif l[0] == '$' and lines[idx - 1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents

p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

corpus = read_file('ner_train.txt')

sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])
    sentences.append(sentence)
    tags.append(bio_tag)
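The sentences/tags pairs gathered here are typically indexed and padded before feeding an NER model. A hedged sketch of that usual next step with Keras utilities; the toy data, the tag set, and the maxlen of 40 are assumptions, not part of the snippet above:

from tensorflow.keras import preprocessing

# toy data standing in for `sentences`/`tags` built above
tags = [['O', 'B_DT', 'O'], ['B_DT', 'O']]

tag_tokenizer = preprocessing.text.Tokenizer(lower=False)  # keep tag case
tag_tokenizer.fit_on_texts(tags)
y = tag_tokenizer.texts_to_sequences(tags)
y = preprocessing.sequence.pad_sequences(y, maxlen=40, padding='post')
print(y.shape)  # (2, 40)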
class XGB(TaskBase):
    def __init__(self, conf):
        super(XGB, self).__init__(conf)
        self.preprocess = Preprocess()
        self.vectorizer = TfidfVectorizer()
        self.thre = 0.5
        self.read_data()

    def output_label(self):
        with open(self.dict_path, 'w') as f:
            for item in self.labels:
                f.write('{}\t{}\n'.format(item, self.labels[item]))

    def read_data(self):
        # load train data
        csv = pd.read_csv(self.ori_path, header=0, sep="\t",
                          error_bad_lines=False)
        train_list = self.preprocess.process(csv['text'])
        self.labels = {item: idx for idx, item in enumerate(set(csv['target']))}
        self.labels_rev = {self.labels[item]: item for item in self.labels}
        self.labels_rev[-1] = 'unknown'
        self.output_label()
        print("class_num:", len(self.labels))
        # train data weights
        X = self.vectorizer.fit_transform([' '.join(item) for item in train_list])
        y = [self.labels[item] for item in csv['target']]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=0)
        self.data = {}
        self.data['x_train'] = X_train
        self.data['y_train'] = y_train
        self.data['x_test'] = X_test
        self.data['y_test'] = y_test

    def train(self):
        ### fit model on the train data
        self.model = XGBClassifier(
            learning_rate=0.3,
            n_estimators=100,           # number of trees
            max_depth=6,                # tree depth
            min_child_weight=1,         # minimum leaf weight
            gamma=0.,                   # penalty on the number of leaves
            subsample=0.8,              # sample 80% of rows per tree
            colsample_btree=0.8,        # sample 80% of features per tree
            objective='multi:softmax',  # loss function
            scale_pos_weight=1,         # handle class imbalance
            random_state=27             # random seed
        )
        print("Start training!")
        self.model.fit(self.data['x_train'],
                       self.data['y_train'],
                       eval_set=[(self.data['x_test'], self.data['y_test'])],
                       eval_metric="mlogloss",
                       early_stopping_rounds=10,
                       verbose=True)
        ### model evaluation
        predictions = self.model.predict_proba(self.data['x_test'])
        y_pred = np.argmax(predictions, 1)
        scores = [predictions[idx][y_pred[idx]] for idx in range(len(y_pred))]
        #for idx in range(len(y_pred)):
        #    if scores[idx] < self.thre:
        #        y_pred[idx] = -1
        accuracy = accuracy_score(self.data['y_test'], y_pred)
        print("accuracy: %.2f%%" % (accuracy * 100.0))
        #dt = pd.DataFrame({'text': self.data['raw_test_list'],
        #                   'feature': self.data['test_list'],
        #                   'target': [self.labels_rev[item] for item in
        #                              self.data['y_test']],
        #                   'pred': [self.labels_rev[item] for item in y_pred],
        #                   'score': scores})
        #dt.to_csv(self.result_path, index=False, sep=',')

    def test(self, file):
        # classify every line of the given file and write the results next to it
        assert os.path.exists(file), "file [%s] not existed!" % file
        lines = [line.strip() for line in open(file).readlines()]
        test_list = self.preprocess.process(lines)
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        predictions = self.model.predict_proba(test_weight)
        pred = np.argmax(predictions, 1)
        with open(file + '.res', 'w') as f_w:
            for idx, line in enumerate(lines):
                f_w.write("{}\t{}\t{}\n".format(line,
                                                self.labels_rev[pred[idx]],
                                                predictions[idx][pred[idx]]))
class ClassifyM():
    def __init__(self):
        self.preprocess = Preprocess()
        self.thre = 0.5

    def load(self, train_path, test_path):
        # load train data
        csv = pd.read_csv(train_path)
        train_list = self.preprocess.process(csv['text'])
        self.labels = {item: idx for idx, item in enumerate(set(csv['intent']))}
        self.output_label()
        self.labels_rev = {self.labels[item]: item for item in self.labels}
        self.labels_rev[-1] = 'unknown'
        print("class_num:", len(self.labels))
        self.labels_num = len(self.labels)
        y_train = [self.labels[item] for item in csv['intent']]
        # train data weights
        self.vectorizer = TfidfVectorizer()
        train_weight = self.vectorizer.fit_transform(
            [' '.join(item) for item in train_list])
        # load test data
        self.result_path = test_path + ".result.csv"
        csv = pd.read_csv(test_path)
        test_list = self.preprocess.process(csv['text'])
        y_test = [self.labels[item] for item in csv['intent']]  # int labels
        # test data weights
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        self.data = {}
        self.data['x_train'] = train_weight
        self.data['y_train'] = y_train
        self.data['x_test'] = test_weight
        self.data['y_test'] = y_test
        self.data['raw_test_list'] = csv['text']
        self.data['test_list'] = test_list

    def output_label(self):
        with open('data/label.txt', 'w') as f:
            for item in self.labels:
                f.write('{}\t{}\n'.format(item, self.labels[item]))

    def train(self):
        ### fit model on the train data
        self.model = XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,          # number of trees
            max_depth=6,                # tree depth
            min_child_weight=1,         # minimum leaf weight
            gamma=0.,                   # penalty on the number of leaves
            subsample=0.8,              # sample 80% of rows per tree
            colsample_btree=0.8,        # sample 80% of features per tree
            objective='multi:softmax',  # loss function
            scale_pos_weight=1,         # handle class imbalance
            random_state=27             # random seed
        )
        print("Start training!")
        self.model.fit(self.data['x_train'],
                       self.data['y_train'],
                       eval_set=[(self.data['x_test'], self.data['y_test'])],
                       eval_metric="mlogloss",
                       early_stopping_rounds=10,
                       verbose=True)
        ### model evaluation
        predictions = self.model.predict_proba(self.data['x_test'])
        y_pred = np.argmax(predictions, 1)
        scores = [predictions[idx][y_pred[idx]] for idx in range(len(y_pred))]
        for idx in range(len(y_pred)):
            if scores[idx] < self.thre:
                y_pred[idx] = -1
        accuracy = accuracy_score(self.data['y_test'], y_pred)
        print("accuracy: %.2f%%" % (accuracy * 100.0))
        dt = pd.DataFrame({
            'text': self.data['raw_test_list'],
            'feature': self.data['test_list'],
            'target': [self.labels_rev[item] for item in self.data['y_test']],
            'pred': [self.labels_rev[item] for item in y_pred],
            'score': scores
        })
        dt.to_csv(self.result_path, index=False, sep=',')

    def test(self, text):
        test_list = self.preprocess.process([text])
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        predictions = self.model.predict_proba(test_weight)
        pred = np.argmax(predictions, 1)
        print(self.labels_rev[pred[0]], predictions[0][pred[0]])

    def test_file(self, file):
        # batch variant: classify every line of a file and write the results
        lines = [line.strip() for line in open(file).readlines()]
        test_list = self.preprocess.process(lines)
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        predictions = self.model.predict_proba(test_weight)
        pred = np.argmax(predictions, 1)
        with open(file + '.res', 'w') as f_w:
            for idx, line in enumerate(lines):
                f_w.write("{}\t{}\t{}\n".format(line,
                                                self.labels_rev[pred[idx]],
                                                predictions[idx][pred[idx]]))

    def process(self, train_path, test_path):
        self.load(train_path, test_path)
        self.train()
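Both XGB and ClassifyM follow the same TF-IDF features → XGBClassifier recipe. A compact end-to-end sketch on toy data, assuming xgboost and scikit-learn are installed; the texts and labels are illustrative:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

texts = ["turn on the light", "turn off the light",
         "play some music", "stop the music",
         "what is the weather", "will it rain today"]
labels = [0, 0, 1, 1, 2, 2]  # 3 toy intent classes

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)  # sparse tf-idf features

model = XGBClassifier(max_depth=3, n_estimators=20, random_state=27)
model.fit(X, labels)

# predict the class and its probability for a new query
probs = model.predict_proba(vectorizer.transform(["music please"]))
pred = np.argmax(probs, 1)
print(pred[0], probs[0][pred[0]])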
def __init__(self, config, root_dir='/data/music/chord_recognition',
             dataset_names=('ce200', ), featuretype=FeatureTypes.cqt,
             num_workers=20, train=False, preprocessing=False, resize=None,
             kfold=4):
    super(AudioDataset, self).__init__()

    self.config = config
    self.root_dir = root_dir
    self.dataset_names = dataset_names
    self.preprocessor = Preprocess(config, featuretype, dataset_names,
                                   self.root_dir)
    self.resize = resize
    self.train = train
    self.ratio = config.experiment['data_ratio']

    # preprocessing hyperparameters:
    # song_hz, n_bins, bins_per_octave, hop_length
    mp3_config = config.mp3
    feature_config = config.feature
    self.mp3_string = "%d_%.1f_%.1f" % \
        (mp3_config['song_hz'], mp3_config['inst_len'],
         mp3_config['skip_interval'])
    self.feature_string = "%s_%d_%d_%d" % \
        (featuretype.value, feature_config['n_bins'],
         feature_config['bins_per_octave'], feature_config['hop_length'])

    if feature_config['large_voca']:
        # check whether preprocessed features already exist on disk
        is_preprocessed = os.path.exists(
            os.path.join(root_dir, 'result', dataset_names[0] + '_voca',
                         self.mp3_string, self.feature_string))
        if (not is_preprocessed) or preprocessing:
            midi_paths = self.preprocessor.get_all_files()
            print(' --------- need preprocessed -----------')
            if num_workers > 1:
                num_path_per_process = math.ceil(len(midi_paths) / num_workers)
                args = [midi_paths[i * num_path_per_process:
                                   (i + 1) * num_path_per_process]
                        for i in range(num_workers)]
                # start worker processes
                p = Pool(processes=num_workers)
                p.map(self.preprocessor.generate_labels_features_voca, args)
                p.close()
            else:
                self.preprocessor.generate_labels_features_voca(midi_paths)
        # kfold is the 5-fold index (0, 1, 2, 3, 4)
        self.song_names, self.paths = self.get_paths_voca(kfold=kfold)
    else:
        # check whether preprocessed features already exist on disk
        is_preprocessed = os.path.exists(
            os.path.join(root_dir, 'result', dataset_names[0],
                         self.mp3_string, self.feature_string))
        if (not is_preprocessed) or preprocessing:
            midi_paths = self.preprocessor.get_all_files()
            if num_workers > 1:
                num_path_per_process = math.ceil(len(midi_paths) / num_workers)
                args = [midi_paths[i * num_path_per_process:
                                   (i + 1) * num_path_per_process]
                        for i in range(num_workers)]
                # start worker processes
                p = Pool(processes=num_workers)
                p.map(self.preprocessor.generate_labels_features_new, args)
                p.close()
            else:
                self.preprocessor.generate_labels_features_new(midi_paths)
        # kfold is the 5-fold index (0, 1, 2, 3, 4)
        self.song_names, self.paths = self.get_paths(kfold=kfold)
import sys
sys.path.append(".")

import pandas as pd
import tensorflow as tf
from keras import preprocessing
from keras.models import Model
from keras.layers import (Input, Embedding, Dense, Dropout, Conv1D,
                          GlobalMaxPool1D, concatenate)

train_file_path = "./models/intent/total_train_data.csv"
data = pd.read_csv(train_file_path)
queries = data["query"].tolist()
intents = data["intent"].tolist()

from utils.preprocess import Preprocess
p = Preprocess(word2idx_dic="./train/chatbot_bin.bin")

sequences = []
for sent in queries:
    pos = p.pos(sent)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

from config.GlobalParams import MAX_SEQ_LEN
from sklearn.model_selection import train_test_split

padded_seqs = preprocessing.sequence.pad_sequences(sequences,
                                                   maxlen=MAX_SEQ_LEN,
                                                   padding="post")
data_size = len(padded_seqs)
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents)).shuffle(data_size)
# engine: design and train the model before building the intent classification module
# read the training file, then build and train the intent classification model
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, Dense, Dropout, Conv1D,
                                     GlobalMaxPool1D, concatenate)

train_file = 'total_train_data.csv'
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin')

sequences = []
for sentence in queries:
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

from config.globalparams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences,
                                                   maxlen=MAX_SEQ_LEN,
                                                   padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents))
ds = ds.shuffle(len(queries))

train_size = int(len(padded_seqs) * 0.7)
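The imports above set up the usual multi-kernel CNN intent classifier, but the snippet stops before the model definition. A hedged sketch of how it typically continues, assuming the padded sequences and preprocessor `p` built above; the class count of 5, the kernel sizes, and the other hyperparameters are illustrative assumptions:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, Dense, Dropout, Conv1D,
                                     GlobalMaxPool1D, concatenate)

VOCAB_SIZE = len(p.word_index) + 1  # +1 for the padding index (assumption)
EMB_SIZE = 128
dropout_prob = 0.5

input_layer = Input(shape=(MAX_SEQ_LEN,))
emb = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
emb = Dropout(dropout_prob)(emb)

# one Conv1D branch per kernel size, each followed by max-over-time pooling
pools = []
for size in (3, 4, 5):
    conv = Conv1D(128, size, padding='valid', activation='relu')(emb)
    pools.append(GlobalMaxPool1D()(conv))

x = concatenate(pools)
x = Dense(128, activation='relu')(x)
x = Dropout(dropout_prob)(x)
predictions = Dense(5, activation='softmax')(x)  # 5 intent classes (assumption)

model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])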
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        #if self.maxlen == -1:
        #    self.maxlen = max([len(text.split()) for text in self.text_list])

        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**params)

    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]
        self.num_class = self.num_output = \
            len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()

            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features=features, name='x_query')
                out = self.encoder(embed, 'x_query', features=features,
                                   middle_flag=True)
            else:
                out = self.encoder(features=features)
            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])
            transition_params = tf.get_variable(
                'crf', [self.num_class, self.num_class], dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params,
                                                    seq_len)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)

            ############### loss ####################
            log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                logits, labels, seq_len, transition_params)
            loss = -tf.reduce_mean(log_likelihood)

            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step, params)
            if mode == tf.estimator.ModeKeys.EVAL:
                # mask padding positions so accuracy only counts real tokens
                weights = tf.sequence_mask(seq_len, self.maxlen)
                metrics = {'acc': tf.metrics.accuracy(labels, pred_ids, weights)}
                return tf.estimator.EstimatorSpec(mode, loss=loss,
                                                  eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith('train')]
            if len(filenames) == 0:
                logging.warning("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path, item)
                             for item in os.listdir(self.tfrecords_path)
                             if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!" % mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }

        def get_features():
            features = {
                'x_query': tf.placeholder(dtype=tf.int64,
                                          shape=[None, self.maxlen],
                                          name='x_query'),
                'x_query_length': tf.placeholder(dtype=tf.int64,
                                                 shape=[None],
                                                 name='x_query_length'),
            }
            features.update(self.encoder.get_features())
            return features

        self.save_model(self.create_model_fn(), params, get_features)

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.7
        }
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        if mode in ('dev', 'test'):
            estimator.evaluate(input_fn=self.create_input_fn(mode))
        else:
            raise ValueError("unknown mode:[%s]" % mode)
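A hypothetical driver for this estimator-based NER task. The conf keys shown are only the attributes the class visibly reads in this excerpt; the paths and registry keys are illustrative, not from the source.

# Hypothetical usage -- key names and values below are assumptions.
conf = {
    'ori_path': 'data/ner_train.txt',      # assumed training file
    'encoder_type': 'bilstm',              # assumed key in the encoder registry
    'embedding_type': 'word2vec',          # assumed key in the embedding registry
    'maxlen': 128,
    'batch_size': 32,
    'max_steps': 10000,
    'tfrecords_path': 'tfrecords/ner',
    'tfrecords_mode': 'normal',            # assumed
    'checkpoint_path': 'checkpoints/ner',
    'use_language_model': False,
}
task = NER(conf)
task.train()      # trains, then exports via save()
task.test('dev')  # masked token accuracy on the dev split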
class AudioDataset(Dataset):
    def __init__(self, config, root_dir='/data/music/chord_recognition',
                 dataset_names=('ce200', ), featuretype=FeatureTypes.cqt,
                 num_workers=20, train=False, preprocessing=False,
                 resize=None, kfold=4):
        super(AudioDataset, self).__init__()
        self.config = config
        self.root_dir = root_dir
        self.dataset_names = dataset_names
        self.preprocessor = Preprocess(config, featuretype, dataset_names,
                                       self.root_dir)
        self.resize = resize
        self.train = train
        self.ratio = config.experiment['data_ratio']

        # preprocessing hyperparameters:
        # song_hz, n_bins, bins_per_octave, hop_length
        mp3_config = config.mp3
        feature_config = config.feature
        self.mp3_string = "%d_%.1f_%.1f" % (
            mp3_config['song_hz'], mp3_config['inst_len'],
            mp3_config['skip_interval'])
        self.feature_string = "%s_%d_%d_%d" % (
            featuretype.value, feature_config['n_bins'],
            feature_config['bins_per_octave'], feature_config['hop_length'])

        # the large-vocabulary and small-vocabulary pipelines differ only in
        # the result directory suffix and the label-generation function
        if feature_config['large_voca']:
            result_dir = os.path.join(root_dir, 'result',
                                      dataset_names[0] + '_voca',
                                      self.mp3_string, self.feature_string)
            generate_fn = self.preprocessor.generate_labels_features_voca
            get_paths_fn = self.get_paths_voca
        else:
            result_dir = os.path.join(root_dir, 'result', dataset_names[0],
                                      self.mp3_string, self.feature_string)
            generate_fn = self.preprocessor.generate_labels_features_new
            get_paths_fn = self.get_paths

        # features are cached on disk; regenerate only if missing or requested
        is_preprocessed = os.path.exists(result_dir)
        if (not is_preprocessed) or preprocessing:
            midi_paths = self.preprocessor.get_all_files()
            print(' --------- preprocessing needed -----------')
            if num_workers > 1:
                # split the file list into one chunk per worker process
                num_path_per_process = math.ceil(len(midi_paths) / num_workers)
                args = [midi_paths[i * num_path_per_process:
                                   (i + 1) * num_path_per_process]
                        for i in range(num_workers)]
                p = Pool(processes=num_workers)
                p.map(generate_fn, args)
                p.close()
            else:
                generate_fn(midi_paths)

        # kfold selects the held-out fold index (0, 1, 2, 3, 4)
        self.song_names, self.paths = get_paths_fn(kfold=kfold)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        instance_path = self.paths[idx]
        res = dict()
        data = torch.load(instance_path)
        # log-compress the magnitude spectrogram (epsilon avoids log(0))
        res['feature'] = np.log(np.abs(data['feature']) + 1e-6)
        res['chord'] = data['chord']
        return res

    def get_paths(self, kfold=4, suffix=''):
        temp = {}
        used_song_names = list()
        for name in self.dataset_names:
            dataset_path = os.path.join(self.root_dir, "result", name + suffix,
                                        self.mp3_string, self.feature_string)
            song_names = os.listdir(dataset_path)
            for song_name in song_names:
                paths = []
                instance_names = os.listdir(os.path.join(dataset_path, song_name))
                if len(instance_names) > 0:
                    used_song_names.append(song_name)
                for instance_name in instance_names:
                    paths.append(os.path.join(dataset_path, song_name,
                                              instance_name))
                temp[song_name] = paths

        # throw away unused song names
        song_names = SortedList(used_song_names)
        print('Total used song length : %d' % len(song_names))
        tmp = []
        for i in range(len(song_names)):
            tmp += temp[song_names[i]]
        print('Total instances (train and valid) : %d' % len(tmp))

        # divide train/valid dataset using k-fold
        result = []
        total_fold = 5
        quotient = len(song_names) // total_fold
        remainder = len(song_names) % total_fold
        fold_num = [0]
        for i in range(total_fold):
            fold_num.append(quotient)
        for i in range(remainder):
            fold_num[i + 1] += 1
        # turn per-fold counts into cumulative boundaries
        for i in range(total_fold):
            fold_num[i + 1] += fold_num[i]

        if self.train:
            tmp = []
            # training set: every fold except the held-out one
            for k in range(total_fold):
                if k != kfold:
                    for i in range(fold_num[k], fold_num[k + 1]):
                        result += temp[song_names[i]]
                    tmp += song_names[fold_num[k]:fold_num[k + 1]]
            song_names = tmp
        else:
            # validation fold: keep only the non-augmented instances
            for i in range(fold_num[kfold], fold_num[kfold + 1]):
                instances = temp[song_names[i]]
                instances = [inst for inst in instances if "1.00_0" in inst]
                result += instances
            song_names = song_names[fold_num[kfold]:fold_num[kfold + 1]]
        return song_names, result

    def get_paths_voca(self, kfold=4):
        # identical to get_paths, but reads the '<dataset>_voca' result dirs
        return self.get_paths(kfold=kfold, suffix='_voca')
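A hypothetical way to wire this dataset into training, assuming the standard PyTorch DataLoader; the config object, batch size, and worker count are placeholders, not values from the source.

from torch.utils.data import DataLoader

# Hypothetical wiring -- `config` must provide the .mp3 / .feature / .experiment
# fields the dataset reads; fold 4 is held out for validation here.
train_set = AudioDataset(config, train=True, kfold=4)
valid_set = AudioDataset(config, train=False, kfold=4)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4)
for batch in train_loader:
    features, chords = batch['feature'], batch['chord']  # log-CQT and chord labels
    break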
def __init__(self, conf):
    super(XGB, self).__init__(conf)
    self.preprocess = Preprocess()
    self.vectorizer = TfidfVectorizer()
    self.thre = 0.5
    self.read_data()
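The XGB task pairs a TfidfVectorizer with a boosted-tree classifier. Below is a minimal self-contained sketch of that pattern using the standard scikit-learn and xgboost APIs; the sample data and hyperparameters are illustrative, and the class's own read_data/train methods are not shown in this excerpt.

from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

# Sketch of the TF-IDF + XGBoost pattern this class implies; data is made up.
texts = ["good service", "terrible experience", "great product", "awful support"]
labels = [1, 0, 1, 0]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)            # sparse TF-IDF features
clf = XGBClassifier(n_estimators=100, max_depth=4)
clf.fit(X, labels)

# A threshold like self.thre = 0.5 turns probabilities into hard labels.
probs = clf.predict_proba(vectorizer.transform(["really good"]))[:, 1]
preds = (probs >= 0.5).astype(int)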
class Classify(object):
    def __init__(self, conf):
        self.task_type = 'classify'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path, header=0, sep=",",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error(f"find blank lines in {idx}")
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def init_embedding(self):
        self.vocab_dict = embedding[self.embedding_type].build_dict(
            dict_path=self.dict_path,
            text_list=self.text_list,
            mode=self.mode)
        self.embedding = embedding[self.embedding_type](
            text_list=self.text_list,
            vocab_dict=self.vocab_dict,
            dict_path=self.dict_path,
            random=self.rand_embedding,
            maxlen=self.maxlen,
            batch_size=self.batch_size,
            embedding_size=self.embedding_size,
            conf=self.conf)

    def prepare(self):
        self.init_embedding()
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list,
                        self.embedding.text2id, self.encoder.encoder_fun,
                        self.vocab_dict, self.tfrecords_path,
                        self.label_path, self.test_size)
        logging.info("tfrecords generated!")

    def cal_loss(self, pred, labels, batch_size, conf):
        loss = get_loss(type=self.loss_type, logits=pred, labels=labels,
                        labels_sparse=True, **conf)
        return loss

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            ########### embedding #################
            if not self.use_language_model:
                self.init_embedding()
                self.embed_query = self.embedding(features=features,
                                                  name='x_query')
            else:
                self.embedding = None

            ############# encoder #################
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if not self.use_language_model:
                out = self.encoder(self.embed_query, name='x_query',
                                   features=features)
            else:
                out = self.encoder(features=features)
            pred = tf.nn.softmax(out)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)

            ############### loss ##################
            loss = self.cal_loss(pred, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(
                        global_step=global_step,
                        learning_rate=self.learning_rate,
                        mode=self.clr_mode)
                optimizer = get_train_op(global_step, self.optimizer_type,
                                         loss, self.learning_rate, clip_grad=5)
                return tf.estimator.EstimatorSpec(mode, loss=loss,
                                                  train_op=optimizer)

            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            # each class has its own tfrecords file; every batch draws
            # num_classes_per_batch classes and num_sentences_per_class
            # examples from each of them
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                f"num_classes_per_batch is {num_classes_per_batch} > {self.num_class}"
            num_sentences_per_class = self.batch_size // num_classes_per_batch
            filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path, i)
                         for i in range(size)]
            logging.info("tfrecords train class num: {}".format(len(filenames)))
            datasets = [tf.data.TFRecordDataset(filename)
                        for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]

            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets,
                                                           choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path, mode, i)
                         for i in range(self.num_class)]
            assert self.num_class == len(filenames), \
                "the num of tfrecords file error!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.5}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)

        def serving_input_receiver_fn():
            features = {
                'x_query': tf.placeholder(dtype=tf.int64,
                                          shape=[None, self.maxlen],
                                          name='x_query'),
                'x_query_length': tf.placeholder(dtype=tf.int64,
                                                 shape=[None],
                                                 name='x_query_length'),
                'label': tf.placeholder(dtype=tf.int64,
                                        shape=[None],
                                        name='label')
            }
            features.update(self.encoder.features)
            return tf.estimator.export.ServingInputReceiver(features, features)

        estimator.export_savedmodel(
            self.export_dir_path,       # export directory
            serving_input_receiver_fn,  # fn returning a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)
        scores = [item['logit'] for item in predictions]
        labels = [item['label'] for item in predictions]
        max_ids = np.argmax(scores, axis=-1)
        res = np.equal(labels, max_ids)
        right = int(np.sum(res))
        total = len(res)
        print("Acc:{}".format(float(right) / total))
class NER(object):
    def __init__(self, conf):
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.task_type = 'ner'
        self.clip_grad = 5.0
        self.label2tag = {self.tag2label[item]: item for item in self.tag2label}
        self.shuffle = True
        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)

        self.pre = Preprocess()
        self.text_list, self.label_list = load_ner_data(self.train_path)
        if self.maxlen == -1:
            self.maxlen = max([len(text.split()) for text in self.text_list])
        self.trans_label_list(self.label_list, self.tag2label)
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]

        if not self.use_language_model:
            #build vocabulary map using training data
            self.vocab_dict = embedding[self.embedding_type].build_dict(
                dict_path=self.dict_path, text_list=self.text_list)
            #define embedding object by embedding_type
            self.embedding = embedding[self.embedding_type](
                text_list=self.text_list,
                vocab_dict=self.vocab_dict,
                dict_path=self.dict_path,
                random=self.rand_embedding,
                batch_size=self.batch_size,
                maxlen=self.maxlen,
                embedding_size=self.embedding_size,
                conf=self.conf)
            self.embed = self.embedding(name='x')
        else:
            self.embedding = None

        self.labels = tf.placeholder(tf.int32, shape=[None, None],
                                     name="labels")
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None],
                                               name="sequence_lengths")

        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "keep_prob": self.keep_prob,
            "is_training": self.is_training,
            "batch_size": self.batch_size,
            "num_output": self.num_class
        })
        self.encoder = encoder[self.encoder_type](**params)
        if not self.use_language_model:
            self.out = self.encoder(self.embed, 'query', middle_flag=True)
        else:
            self.out = self.encoder()
        self.output_nodes = self.out.name.split(':')[0]
        self.loss(self.out)
        # argument order follows get_train_op(global_step, type, loss, lr,
        # clip_grad) as used by the classify task; the original passed
        # clip_grad and learning_rate in swapped positions
        self.optimizer = get_train_op(self.global_step, self.optimizer_type,
                                      self.loss, self.learning_rate,
                                      clip_grad=self.clip_grad)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        if self.use_language_model:
            tvars = tf.trainable_variables()
            init_checkpoint = conf['init_checkpoint_path']
            (assignment_map, initialized_variable_names) = \
                get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    def loss(self, out):
        out_shape = tf.shape(out)
        self.logits = tf.reshape(out, [-1, out_shape[1], self.num_class])
        if not self.use_crf:
            self.labels_softmax_ = tf.argmax(self.logits, axis=-1)
            self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32)
        if self.use_crf:
            log_likelihood, self.transition_params = crf_log_likelihood(
                inputs=self.logits,
                tag_indices=self.labels,
                sequence_lengths=self.sequence_lengths)
            # note: this rebinds self.loss from the method to the loss tensor
            self.loss = -tf.reduce_mean(log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)
        tf.summary.scalar("loss", self.loss)

    def trans_label_list(self, label_list, tag2label):
        for idx, labels in enumerate(label_list):
            for idy, label in enumerate(labels):
                label_list[idx][idy] = tag2label[label_list[idx][idy]]

    def demo_one(self, sess, sent):
        label_list = []
        batches = batch_iter(sent, self.batch_size, self.epoch_num,
                             shuffle=False)
        for batch in batches:
            seqs, labels = zip(*batch)
            label_list_, _ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
        label2tag = {}
        for tag, label in self.tag2label.items():
            label2tag[label] = tag if label != 0 else label
        tag = [label2tag[label] for label in label_list[0]]
        return tag

    def train(self):
        train_data = zip(self.text_list, self.label_list)
        batches = batch_iter(train_data, self.batch_size, self.epoch_num,
                             shuffle=True)
        max_acc = -1
        for step, batch in enumerate(batches):
            x_batch, labels = zip(*batch)
            sys.stdout.write(' processing: {}.'.format(step + 1) + '\r')
            step_num = step + 1
            if not self.use_language_model:
                _, x_batch, len_batch = self.embedding.text2id(
                    x_batch, self.vocab_dict, self.maxlen,
                    need_preprocess=False)
                feed_dict = {self.sequence_lengths: len_batch}
                feed_dict[self.labels], _ = self.embedding.pad_sequences(labels)
                feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
                feed_dict.update(self.encoder.feed_dict(query=len_batch))
            else:
                feed_dict = {}
                feed_dict.update(self.encoder.feed_dict(x_batch))
            _, loss_train, step_num_ = self.sess.run(
                [self.optimizer, self.loss, self.global_step],
                feed_dict=feed_dict)
            if step_num % max(1, self.valid_step // 10) == 0:
                logging.info('step {}, loss: {:.4}'.format(step_num, loss_train))
            if step_num % self.valid_step == 0:
                logging.info('===========validation / test===========')
                result = self.test()
                logging.info("result: %s", result)
                if result['acc'] > max_acc:
                    max_acc = result['acc']
                    self.saver.save(self.sess,
                                    "{0}/{1}.ckpt".format(self.checkpoint_path,
                                                          self.task_type),
                                    global_step=step)
                    write_pb(self.checkpoint_path, self.model_path,
                             ["is_training", self.output_nodes])
                else:
                    self.save_pb()
                    logging.info(f'train finished! accuracy: {max_acc}')
                    sys.exit(0)

    def test(self):
        self.raw_dev_text_list, self.dev_label_list = load_ner_data(
            self.test_path)
        self.dev_text_list = [self.pre.get_dl_input_by_text(text)
                              for text in self.raw_dev_text_list]
        self.trans_label_list(self.dev_label_list, self.tag2label)
        dev_data = zip(self.dev_text_list, self.dev_label_list)
        out_label_list, seq_len_list = self.dev_one_epoch(self.sess, dev_data)
        result = self.evaluate(self.dev_label_list, out_label_list,
                               self.raw_dev_text_list, seq_len_list)
        return result

    def dev_one_epoch(self, sess, dev):
        """Predict labels for every batch in `dev`."""
        label_list, seq_len_list = [], []
        batches = batch_iter(dev, self.batch_size, self.epoch_num,
                             shuffle=False)
        for batch in batches:
            seqs, labels = zip(*batch)
            label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
            seq_len_list.extend(seq_len_list_)
        return label_list, seq_len_list

    def predict_one_batch(self, sess, seqs):
        """Predict label ids and sequence lengths for one batch.

        The original inverted the use_language_model condition and referenced
        feed_dict before assignment in the else branch; fixed here to mirror
        the feeding logic in train().
        """
        if not self.use_language_model:
            _, x_batch, len_batch = self.embedding.text2id(
                seqs, self.vocab_dict, self.maxlen, need_preprocess=False)
            feed_dict = {self.sequence_lengths: len_batch}
            feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
            feed_dict.update(self.encoder.feed_dict(query=len_batch))
        else:
            feed_dict = {}
            feed_dict.update(self.encoder.feed_dict(seqs))
            # assumption: approximate lengths by whitespace tokens, since the
            # language-model path does not go through embedding.text2id
            len_batch = [len(seq.split()) for seq in seqs]
        if self.use_crf:
            logits, transition_params = sess.run(
                [self.logits, self.transition_params], feed_dict=feed_dict)
            label_list = []
            for logit, seq_len in zip(logits, len_batch):
                viterbi_seq, _ = viterbi_decode(logit[:seq_len],
                                                transition_params)
                label_list.append(viterbi_seq)
            return label_list, len_batch
        else:
            label_list = sess.run(self.labels_softmax_, feed_dict=feed_dict)
            return label_list, len_batch

    def evaluate(self, dev_label_list, out_label_list, raw_dev_text_list,
                 seq_len_list):
        """Compute token-level accuracy and entity-level (chunk) F1."""
        model_predict = []
        for label, label_pred, sent, seq_len in zip(dev_label_list,
                                                    out_label_list,
                                                    raw_dev_text_list,
                                                    seq_len_list):
            sent = sent.split()
            sent_res = []
            for idx in range(seq_len):
                sent_res.append([sent[idx], label[idx], label_pred[idx]])
            model_predict.append(sent_res)
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for item in model_predict:
            lab = [i[1] for i in item]
            lab_pred = [i[2] for i in item]
            accs += [a == b for (a, b) in zip(lab, lab_pred)]
            lab_chunks = set(get_chunks(lab, self.tag2label))
            lab_pred_chunks = set(get_chunks(lab_pred, self.tag2label))
            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)
        # precision/recall/F1 over entity chunks
        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        return {"acc": 100 * acc, "f1": 100 * f1}
from utils.preprocess import Preprocess
from models.ner.nermodel import NerModel

p = Preprocess(word2index_dic='../train_tools/dict/chatbot_dict.bin',
               userdic='../utils/user_dic.tsv')
# 'proprocess' mirrors the parameter name in NerModel's constructor
ner = NerModel(model_name='../models/ner/ner_model.h5', proprocess=p)

query = '오늘 오전 13시 2분에 탕수육 주문하고 싶어요'  # "I'd like to order sweet-and-sour pork at 13:02 this morning"
predicts = ner.predict(query)
print(predicts)