import torch
from torch.utils.data import Dataset

# Vocabulary is a project-specific helper (token/label indexing); its module
# path is not shown in this snippet.


class TextDataset(Dataset):
    def __init__(self, data_index_path, data_path, meta_data_path, vocab_path,
                 max_sequence_length, dataset_name, is_test):
        self.data_index_path = data_index_path
        self.data_path = data_path
        self.vocab_path = vocab_path
        self.meta_data_path = meta_data_path
        self.max_sequence_length = max_sequence_length
        self.dataset_name = dataset_name
        self.is_test = is_test
        self.token_vocab = Vocabulary(self.vocab_path, is_padded=True)
        self._load_data_()

    def _load_data_(self):
        self.text_len = []
        self.label_vocab = Vocabulary()

        # Load labels: one label token per document, taken from the last
        # tab-separated field of the metadata file.
        self.labels = []
        with open(self.meta_data_path, mode='r') as f:
            for line in f:
                label_token = line.strip().split('\t')[-1]
                self.label_vocab.add_token(label_token)
                self.labels.append(self.label_vocab.token_to_id[label_token])

        # Load the index of documents belonging to the requested split.
        self.data_index = []
        with open(self.data_index_path, mode='r') as f:
            for line in f:
                if self.dataset_name in ('R8', 'R52', 'ohsumed', 'MR'):
                    # Positions are assigned within the selected split.
                    if self.is_test and line.find('test') != -1:
                        self.data_index.append(len(self.data_index))
                    elif not self.is_test and line.find('train') != -1:
                        self.data_index.append(len(self.data_index))
                elif self.dataset_name == '20ng':
                    self.data_index.append(int(line.strip()))

        # Load documents, truncating or zero-padding to max_sequence_length
        # and recording the original (unpadded) length for every document.
        self.data = []
        with open(self.data_path, mode='r') as f:
            for line in f:
                ids = self.token_vocab.index_sentence(line.strip())
                if len(ids) >= self.max_sequence_length:
                    ids = ids[:self.max_sequence_length]
                    self.text_len.append(self.max_sequence_length)
                else:
                    self.text_len.append(len(ids))
                    ids.extend([0] * (self.max_sequence_length - len(ids)))
                self.data.append(ids)

    def __getitem__(self, item):
        # All per-document lists (data, text_len, labels) are parallel, so
        # they are all addressed through data_index.
        doc_id = self.data_index[item]
        input_ids = torch.LongTensor(self.data[doc_id])
        text_len = torch.LongTensor([self.text_len[doc_id]])
        label = torch.LongTensor([self.labels[doc_id]])
        return input_ids, text_len, label

    def __len__(self):
        return len(self.data_index)
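A minimal usage sketch for the class above; the file paths, dataset name, and batch size are placeholders chosen for illustration, and batching relies on the standard torch DataLoader.

from torch.utils.data import DataLoader

# Hypothetical paths, shown only to illustrate the constructor arguments.
train_set = TextDataset(data_index_path='data/R8.index',
                        data_path='data/R8.clean.txt',
                        meta_data_path='data/R8.meta',
                        vocab_path='data/R8.vocab',
                        max_sequence_length=256,
                        dataset_name='R8',
                        is_test=False)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)

for input_ids, text_len, label in train_loader:
    # input_ids: [batch, max_sequence_length]; text_len, label: [batch, 1]
    pass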
def main(_):
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')
    if not FLAGS.output_dir:
        raise ValueError('You must supply the output directory with --output_dir')

    print('Dataset directory:', FLAGS.dataset_dir)
    print('Output directory:', FLAGS.output_dir)

    vocab = Vocabulary()
    writer = DataWriter(vocab, FLAGS.dataset_dir, FLAGS.output_dir,
                        FLAGS.str_size, FLAGS.name, FLAGS.split)
    writer.build_data()
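The entry point above expects dataset_dir, output_dir, str_size, name and split flags to be defined elsewhere in the script. A sketch of what those definitions might look like with TF1's tf.app.flags; the types and defaults here are assumptions, not taken from the original project.

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('dataset_dir', '', 'Directory containing the raw dataset.')
tf.app.flags.DEFINE_string('output_dir', '', 'Directory to write the converted data to.')
tf.app.flags.DEFINE_integer('str_size', 10, 'Assumed: maximum label string length per example.')
tf.app.flags.DEFINE_string('name', 'data', 'Assumed: base name for the output files.')
tf.app.flags.DEFINE_string('split', 'train', 'Assumed: dataset split to convert (train/test).')

if __name__ == '__main__':
    tf.app.run()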
)

remove_stopwords = True
min_freq = 5
lowercase = True

if args.module == "train":
    train_iter = ReutersDatasetIterator(args.data_root, "training")

    vocab_path = "common_persist/vocab.pkl"
    if os.path.exists(vocab_path):
        log.info("Loading existing vocab")
        vocabulary = file_utils.load_obj(vocab_path)
    else:
        log.info("Vocab doesn't exist. Creating")
        if not os.path.exists("common_persist"):
            os.makedirs("common_persist")
        vocabulary = Vocabulary(remove_stopwords, min_freq, lowercase,
                                "./data/reuters/stopwords")
        vocabulary.build(train_iter)
        file_utils.save_obj(vocabulary, vocab_path)

    train_set = ReutersDataset(args.data_root, "training", vocabulary)
    test_set = ReutersDataset(args.data_root, "test", vocabulary)

    train_loader = DataLoader(train_set, shuffle=True, batch_size=1)
    test_loader = DataLoader(test_set, shuffle=False, batch_size=1)

    if args.model == "doc2vec":
        doc2vec_model_path = "common_persist/doc2vec_model.pkl"
        train_tagged_path = "common_persist/train_tagged.pkl"
        test_tagged_path = "common_persist/test_tagged.pkl"
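file_utils.save_obj and file_utils.load_obj are project helpers not shown in this snippet; a plausible pickle-based sketch of what they might do. The names mirror the calls above, but the implementation is an assumption.

import pickle

def save_obj(obj, path):
    # Serialize an arbitrary Python object (e.g. the Vocabulary) to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    # Load an object previously written by save_obj.
    with open(path, 'rb') as f:
        return pickle.load(f)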
def main(_):
    assert FLAGS.file_pattern, "--file_pattern is required"
    assert FLAGS.train_checkpoints, "--train_checkpoints is required"
    assert FLAGS.summaries_dir, "--summaries_dir is required"

    vocab = Vocabulary()
    model_config = configuration.ModelConfig()
    training_config = configuration.TrainingConfig()
    print(FLAGS.learning_rate)
    training_config.initial_learning_rate = FLAGS.learning_rate

    sequence_length = model_config.sequence_length
    batch_size = FLAGS.batch_size

    summaries_dir = FLAGS.summaries_dir
    if not tf.gfile.IsDirectory(summaries_dir):
        tf.logging.info("Creating summaries directory: %s", summaries_dir)
        tf.gfile.MakeDirs(summaries_dir)

    train_checkpoints = FLAGS.train_checkpoints
    if not tf.gfile.IsDirectory(train_checkpoints):
        tf.logging.info("Creating training directory: %s", train_checkpoints)
        tf.gfile.MakeDirs(train_checkpoints)

    # Initialize the input data queue.
    input_queue = DataReader(FLAGS.dataset_dir, FLAGS.file_pattern,
                             model_config, batch_size=batch_size)

    g = tf.Graph()
    with g.as_default():
        # Input queue.
        with tf.name_scope(None, 'input_queue'):
            input_images, input_labels = input_queue.read()

        # Build the model.
        model = crnn.CRNN(256, model_config.num_classes, 'train')
        logits = model.build(input_images)

        with tf.name_scope(None, 'loss'):
            loss = tf.reduce_mean(
                tf.nn.ctc_loss(labels=input_labels,
                               inputs=logits,
                               sequence_length=sequence_length *
                               tf.ones(batch_size, dtype=tf.int32)),
                name='compute_loss',
            )
            tf.losses.add_loss(loss)
            total_loss = tf.losses.get_total_loss(False)

        with tf.name_scope(None, 'decoder'):
            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits,
                sequence_length * tf.ones(batch_size, dtype=tf.int32),
                merge_repeated=False,
            )

        with tf.name_scope(None, 'accuracy'):
            sequence_dist = tf.reduce_mean(
                tf.edit_distance(tf.cast(decoded[0], tf.int32), input_labels),
                name='seq_dist',
            )
            preds = tf.sparse_tensor_to_dense(decoded[0], name='prediction')
            gt_labels = tf.sparse_tensor_to_dense(input_labels, name='Ground_Truth')

        global_step = tf.Variable(initial_value=0,
                                  name="global_step",
                                  trainable=False,
                                  collections=[
                                      tf.GraphKeys.GLOBAL_STEP,
                                      tf.GraphKeys.GLOBAL_VARIABLES
                                  ])

        start_learning_rate = training_config.initial_learning_rate
        learning_rate = tf.train.exponential_decay(
            start_learning_rate,
            global_step,
            decay_steps=training_config.learning_decay_steps,
            decay_rate=training_config.learning_rate_decay_factor,
            staircase=True,
        )

        # Summaries: add histograms for model variables.
        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)
        tf.summary.scalar(name='Seq_Dist', tensor=sequence_dist)
        tf.summary.scalar(name='global_step', tensor=global_step)
        tf.summary.scalar(name='learning_rate', tensor=learning_rate)
        tf.summary.scalar(name='total_loss', tensor=total_loss)

        # Hook that reports global steps per second.
        globalhook = tf.train.StepCounterHook(
            every_n_steps=FLAGS.log_every_n_steps,
        )

        # Checkpoint and summary saving are handled by MonitoredTrainingSession
        # below, so explicit saver hooks are left commented out.
        # saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)
        # saverhook = tf.train.CheckpointSaverHook(
        #     checkpoint_dir=FLAGS.train_checkpoints,
        #     save_steps=2000,
        #     saver=saver,
        # )
        # merge_summary_op = tf.summary.merge_all()
        # summaryhook = tf.train.SummarySaverHook(
        #     save_steps=200,
        #     output_dir=FLAGS.summaries_dir,
        #     summary_op=merge_summary_op,
        # )

        # Tensors to log during training.
        tensors_print = {
            'global_step': global_step,
            'loss': loss,
            'Seq_Dist': sequence_dist,
        }
        loghook = tf.train.LoggingTensorHook(
            tensors=tensors_print,
            every_n_iter=FLAGS.log_every_n_steps,
        )

        # Stop hook.
        stophook = tf.train.StopAtStepHook(last_step=FLAGS.number_of_steps)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        session_config = tf.ConfigProto(log_device_placement=False,
                                        gpu_options=gpu_options)

        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
        train_op = tf.contrib.training.create_train_op(total_loss=total_loss,
                                                       optimizer=optimizer,
                                                       global_step=global_step)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_checkpoints,
                hooks=[globalhook, loghook, stophook],
                save_checkpoint_secs=180,
                save_summaries_steps=100,
                config=session_config) as sess:
            while not sess.should_stop():
                oloss, opreds, ogt_labels = sess.run(
                    [train_op, preds, gt_labels])
                accuracy = compute_acuracy(opreds, ogt_labels)
                print("accuracy: %9f" % accuracy)
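compute_acuracy is called above but not defined in this snippet. A minimal sketch of what such a sequence-accuracy helper might look like, assuming predictions and ground-truth labels arrive as dense [batch, max_len] integer arrays padded with zeros (the default fill value of tf.sparse_tensor_to_dense); if 0 is a valid character index in the vocabulary, the padding would have to be stripped differently.

def compute_acuracy(preds, gt_labels):
    # Fraction of sequences whose prediction exactly matches the ground truth.
    correct = 0
    for pred_row, gt_row in zip(preds, gt_labels):
        pred_seq = [int(c) for c in pred_row if c != 0]  # drop zero padding
        gt_seq = [int(c) for c in gt_row if c != 0]
        if pred_seq == gt_seq:
            correct += 1
    return correct / max(len(gt_labels), 1)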
def main(_):
    assert FLAGS.file_pattern, "--file_pattern is required"
    assert FLAGS.train_checkpoints, "--train_checkpoints is required"
    assert FLAGS.summaries_dir, "--summaries_dir is required"

    vocab = Vocabulary()
    model_config = configuration.ModelConfig()
    training_config = configuration.TrainingConfig()
    print(FLAGS.learning_rate)
    training_config.initial_learning_rate = FLAGS.learning_rate

    sequence_length = model_config.sequence_length
    batch_size = FLAGS.batch_size

    summaries_dir = FLAGS.summaries_dir
    if not tf.gfile.IsDirectory(summaries_dir):
        tf.logging.info("Creating summaries directory: %s", summaries_dir)
        tf.gfile.MakeDirs(summaries_dir)

    train_checkpoints = FLAGS.train_checkpoints
    if not tf.gfile.IsDirectory(train_checkpoints):
        tf.logging.info("Creating training directory: %s", train_checkpoints)
        tf.gfile.MakeDirs(train_checkpoints)

    # Initialize the input data queue.
    input_queue = DataReader(FLAGS.dataset_dir, FLAGS.file_pattern,
                             model_config, batch_size=batch_size)

    g = tf.Graph()
    with g.as_default():
        # Input queue.
        with tf.name_scope(None, 'input_queue'):
            input_images, input_labels = input_queue.read()

        # Build the model.
        model = crnn.CRNN(256, model_config.num_classes, 'train')
        logits = model.build(input_images)

        with tf.name_scope(None, 'loss'):
            loss = tf.reduce_mean(
                tf.nn.ctc_loss(labels=input_labels,
                               inputs=logits,
                               sequence_length=sequence_length *
                               tf.ones(batch_size, dtype=tf.int32)),
                name='compute_loss',
            )
            tf.losses.add_loss(loss)
            total_loss = tf.losses.get_total_loss(False)

        with tf.name_scope(None, 'decoder'):
            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits,
                sequence_length * tf.ones(batch_size, dtype=tf.int32),
                merge_repeated=False,
            )

        with tf.name_scope(None, 'accuracy'):
            sequence_dist = tf.reduce_mean(
                tf.edit_distance(tf.cast(decoded[0], tf.int32), input_labels),
                name='seq_dist',
            )
            preds = tf.sparse_tensor_to_dense(decoded[0], name='prediction')
            gt_labels = tf.sparse_tensor_to_dense(input_labels, name='Ground_Truth')

        global_step = tf.Variable(initial_value=0,
                                  name="global_step",
                                  trainable=False,
                                  collections=[
                                      tf.GraphKeys.GLOBAL_STEP,
                                      tf.GraphKeys.GLOBAL_VARIABLES
                                  ])

        # Tensors to log during training.
        tensors_print = {
            'global_step': global_step,
            # 'loss': loss,
        }
        loghook = tf.train.LoggingTensorHook(
            tensors=tensors_print,
            every_n_iter=FLAGS.log_every_n_steps,
        )

        # Stop hook.
        stophook = tf.train.StopAtStepHook(last_step=FLAGS.number_of_steps)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        session_config = tf.ConfigProto(log_device_placement=False,
                                        gpu_options=gpu_options)

        # Dummy train op that only increments the global step; this variant
        # just inspects the input pipeline for batches of unexpected size.
        train_op = tf.assign_add(global_step, tf.constant(1))

        session = tf.train.ChiefSessionCreator(
            config=session_config,
            checkpoint_dir=FLAGS.train_checkpoints,
        )
        labels_shape = input_labels.dense_shape

        with tf.train.MonitoredSession(session, hooks=[loghook, stophook]) as sess:
            while not sess.should_stop():
                test_logits, test_images, test_shape, _ = sess.run(
                    [logits, input_images, labels_shape, input_labels])
                if (test_logits.shape[1] != FLAGS.batch_size
                        or test_images.shape[0] != FLAGS.batch_size
                        or test_shape[0] != FLAGS.batch_size):
                    print("get it!!!!!")
                    test_loss = sess.run([loss])
                sess.run(train_op)
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf

from data_utils.vocabulary import Vocabulary
from model import configuration, crnn

tf.logging.set_verbosity(tf.logging.INFO)

gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
sess = tf.InteractiveSession(config=config)

# Build the model.
vocab = Vocabulary()
with tf.name_scope(None, 'input_image'):
    img_input = tf.placeholder(tf.uint8, shape=(32, 300, 3))
    image = tf.to_float(img_input)
    image = tf.expand_dims(image, 0)

model = crnn.CRNN(256, 37, 'inference')
logit = model.build(image)

decodes, _ = tf.nn.ctc_beam_search_decoder(inputs=logit,
                                           sequence_length=tf.shape(