def build_eval_graph(self, words, pos, chunks, capitals):
    """Build the graph nodes that turn raw feature strings into id tensors.

    Every argument is split on single spaces with ``tf.string_split`` and
    densified. Words, POS tags and chunk tags are then mapped to ids through
    ``self.table``, ``self.table_pos`` and ``self.table_chunk``; the
    capitalisation column is parsed as int64 numbers.

    Args:
        words: string tensor accepted by ``tf.string_split``
            (presumably one sentence per element -- confirm with callers).
        pos: string tensor of space-separated POS tags.
        chunks: string tensor of space-separated chunk tags.
        capitals: string tensor of space-separated numeric flags.

    Returns:
        Tuple ``(table_words, table_pos, table_chunks, table_capitals)``.
    """
    # convert to tensor of strings
    split_sentence = tf.string_split(words, " ")
    split_pos = tf.string_split(pos, " ")
    split_chunks = tf.string_split(chunks, " ")
    split_capitals = tf.string_split(capitals, " ")

    # convert sparse to dense
    dense_words = tf.sparse_tensor_to_dense(split_sentence, default_value="")
    dense_pos = tf.sparse_tensor_to_dense(split_pos, default_value="")
    dense_chunks = tf.sparse_tensor_to_dense(split_chunks, default_value="")
    # The capitals column is parsed numerically below, and
    # tf.string_to_number cannot parse an empty string. Filling the padded
    # positions with "0" keeps batches whose rows have different token counts
    # from failing at run time; rows without padding are unaffected.
    dense_capitals = tf.sparse_tensor_to_dense(split_capitals, default_value="0")

    # do table lookup
    table_words = self.table.lookup(dense_words)
    table_pos = self.table_pos.lookup(dense_pos)
    table_chunks = self.table_chunk.lookup(dense_chunks)
    table_capitals = tf.string_to_number(dense_capitals, out_type=tf.int64)

    return table_words, table_pos, table_chunks, table_capitals
def build_eval_graph(self, x):
    """Map space-separated token strings to vocabulary ids for evaluation.

    ``x`` is split on single spaces, densified with ``""`` as the fill value
    and looked up in ``self.table``.

    Args:
        x: string tensor accepted by ``tf.string_split``.

    Returns:
        The tensor produced by ``self.table.lookup`` on the dense tokens.
    """
    sparse_tokens = tf.string_split(x, " ")
    dense_tokens = tf.sparse_tensor_to_dense(sparse_tokens, default_value="")
    return self.table.lookup(dense_tokens)
def __load_data(self, file_names, record_defaults, data_column, bucket_boundaries,
                field_delim=__DEFAULT_DELIM, skip_header_lines=0, num_epochs=None,
                shuffle=True):
    """Build the queue-based input pipeline and return padded batch tensors.

    The files are preprocessed with ``__generate_preprocessed_files``, read
    through ``_read_file`` and tokenised on single spaces. Words, POS tags,
    chunk tags and entities are mapped to ids through lookup tables that are
    created here only when the corresponding attribute is still ``None``.
    Examples then pass through ``self.shuffle_queue`` and are batched with
    ``tf.contrib.training.bucket_by_sequence_length``.

    Side effects: sets ``self.__vocabulary_file``, the lookup tables (and the
    reverse tables when ``self._used_for_test_data`` is true),
    ``self.enqueue_op`` and ``self.qr``, and registers ``self.qr`` as a queue
    runner.

    Args:
        file_names: list of input files; only the first one is used to locate
            the vocabulary files.
        record_defaults: forwarded to ``_read_file``.
        data_column: forwarded to ``__generate_preprocessed_files``.
        bucket_boundaries: forwarded to ``bucket_by_sequence_length``.
        field_delim: field delimiter forwarded to preprocessing and reading.
        skip_header_lines: forwarded to ``_read_file``.
        num_epochs: forwarded to ``tf.train.string_input_producer``.
        shuffle: forwarded to ``tf.train.string_input_producer``.

    Returns:
        Tuple ``(padded_sent, padded_pos, padded_chunk, padded_capitals,
        padded_entities)``; each is reshaped to ``[batch_size, -1]`` except
        the capitals, which are reshaped to ``[batch_size, -1, 1]``.
    """
    original_file_names = file_names[:]
    file_names = self.__generate_preprocessed_files(
        file_names, data_column, field_delim=field_delim)

    filename_queue = tf.train.string_input_producer(
        file_names, num_epochs=num_epochs, shuffle=shuffle)

    sentence, pos, chunks, capitals, entities = self._read_file(
        filename_queue, record_defaults, field_delim, skip_header_lines)

    # TODO: will be break with multiple filenames
    voca_path, voca_suffix = BaseDataLoader._split_file_to_path_and_name(
        original_file_names[0])
    voca_name = ConllPreprocessor.VOCABULARY_PREFIX + voca_suffix
    self.__vocabulary_file = voca_path + voca_name

    # load look up tables that maps words to ids
    if self.table is None:
        print('vocabulary table is None => creating it')
        main_voca_file = voca_path + voca_name
        if self._use_pretrained_emb:
            self.pretrained_emb_matrix, vocabulary = self.preload_embeddings(
                embed_dim=self._embed_dim,
                file_name=self._pretrained_emb_file,
                train_vocabulary=main_voca_file,
                other_vocabularies=self._other_voca_files)
            tensor_vocabulary = tf.constant(vocabulary)
            self.table = tf.contrib.lookup.index_table_from_tensor(
                tensor_vocabulary,
                default_value=ConllPreprocessor.UNK_TOKEN_ID,
                num_oov_buckets=0)
        else:
            self.table = tf.contrib.lookup.index_table_from_file(
                vocabulary_file=main_voca_file,
                default_value=ConllPreprocessor.UNK_TOKEN_ID,
                num_oov_buckets=0)

    if self.table_pos is None:
        print('vocabulary table_pos is None => creating it')
        self.table_pos = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=voca_path + self._TABLE_POS + voca_suffix,
            num_oov_buckets=0)

    if self.table_chunk is None:
        print('vocabulary table_chunk is None => creating it')
        self.table_chunk = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=voca_path + self._TABLE_CHUNK + voca_suffix,
            num_oov_buckets=0)

    if self.table_entity is None:
        print('vocabulary table_entity is None => creating it')
        self.table_entity = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=voca_path + self._TABLE_ENTITY + voca_suffix,
            num_oov_buckets=0)

    if self._used_for_test_data:
        print('Reverse vocabulary is needed => creating it')
        self.reverse_table = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=voca_path + voca_name)
        print('Reverse entity vocabulary is needed => creating it')
        self.reverse_table_entity = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=voca_path + self._TABLE_ENTITY + voca_suffix)

    # convert to tensor of strings
    split_sentence = tf.string_split([sentence], " ")
    split_pos = tf.string_split([pos], " ")
    split_chunks = tf.string_split([chunks], " ")
    split_capitals = tf.string_split([capitals], " ")
    split_entities = tf.string_split([entities], " ")

    # convert sparse to dense and map tokens to ids
    dense_sent = tf.sparse_tensor_to_dense(split_sentence, default_value="")
    dense_sent = self.table.lookup(dense_sent)
    dense_pos = tf.sparse_tensor_to_dense(split_pos, default_value="")
    dense_pos = self.table_pos.lookup(dense_pos)
    dense_chunks = tf.sparse_tensor_to_dense(split_chunks, default_value="")
    dense_chunks = self.table_chunk.lookup(dense_chunks)
    dense_capitals = tf.sparse_tensor_to_dense(split_capitals, default_value="")
    dense_capitals = tf.string_to_number(dense_capitals, out_type=tf.int64)
    dense_entities = tf.sparse_tensor_to_dense(split_entities, default_value="")
    dense_entities = self.table_entity.lookup(dense_entities)

    # get the enqueue op to pass to a coordinator to be run
    self.enqueue_op = self.shuffle_queue.enqueue(
        [dense_sent, dense_pos, dense_chunks, dense_capitals, dense_entities])
    dense_sent, dense_pos, dense_chunks, dense_capitals, dense_entities = \
        self.shuffle_queue.dequeue()

    # add queue to queue runner
    self.qr = tf.train.QueueRunner(self.shuffle_queue,
                                   [self.enqueue_op] * self.num_threads)
    tf.train.queue_runner.add_queue_runner(self.qr)

    # reshape from <unknown> shape into proper form after dequeue from random shuffle queue
    # this is needed so next queue can automatically infer the shape properly
    dense_sent = dense_sent.sg_reshape(shape=[1, -1])
    dense_pos = dense_pos.sg_reshape(shape=[1, -1])
    dense_chunks = dense_chunks.sg_reshape(shape=[1, -1])
    dense_capitals = dense_capitals.sg_reshape(shape=[1, -1])
    dense_entities = dense_entities.sg_reshape(shape=[1, -1])

    # Take the sequence length from the example that was actually dequeued.
    # A length computed from the reader output (before the shuffle queue)
    # would make every bucket enqueue read one more, unrelated record just to
    # obtain its length: the bucket would not match the sentence and that
    # record would be dropped from the data stream.
    lengths = tf.shape(dense_sent)[1:]

    _, (padded_sent, padded_pos, padded_chunk, padded_capitals, padded_entities) = \
        tf.contrib.training.bucket_by_sequence_length(
            lengths,
            [dense_sent, dense_pos, dense_chunks, dense_capitals, dense_entities],
            batch_size=self._batch_size,
            bucket_boundaries=bucket_boundaries,
            dynamic_pad=True,
            capacity=self._capacity,
            num_threads=self.num_threads,
            name='bucket_queue')

    # reshape shape into proper form after dequeue from bucket queue
    padded_sent = padded_sent.sg_reshape(shape=[self._batch_size, -1])
    padded_pos = padded_pos.sg_reshape(shape=[self._batch_size, -1])
    padded_chunk = padded_chunk.sg_reshape(shape=[self._batch_size, -1])
    padded_capitals = padded_capitals.sg_reshape(shape=[self._batch_size, -1, 1])
    padded_entities = padded_entities.sg_reshape(shape=[self._batch_size, -1])

    return padded_sent, padded_pos, padded_chunk, padded_capitals, padded_entities
initial=0, desc='test', ncols=70, unit='b', leave=False) # batch loop loss_avg = 0. for _ in iterator: # run session batch_loss = None #batch_loss = sess.run(loss) adv, diff, orig_x, preds, target, predsx, filenames = sess.run( [adv_x, diff_x, x, preds_adv, y, preds_x, filenames_t]) preds = tf.sparse_tensor_to_dense(preds, default_value=-1).eval() predsx = tf.sparse_tensor_to_dense(predsx, default_value=-1).eval() for p, px, t, filename in zip(preds, predsx, target, filenames): p = [(int(ch) + 1) for ch in p if ch != -1] str_p = index2str(p) t = [ch for ch in t if ch != 0] str_t = index2str(t) px = [(int(ch) + 1) for ch in px if ch != -1] str_px = index2str(px) if px != p: correct = "DIFF" else: correct = "SAME"
def __load_data(self, file_names, record_defaults, data_column, bucket_boundaries,
                field_delim=__DEFAULT_DELIM, skip_header_lines=0, num_epochs=None,
                shuffle=True):
    """Build the queue-based input pipeline and return padded batch tensors.

    The files are preprocessed with ``__generate_preprocessed_files``, read
    through ``_read_file`` and tokenised on single spaces. Words, POS tags,
    chunk tags and entities are mapped to ids through lookup tables that are
    created here only when the corresponding attribute is still ``None``.
    Examples then pass through ``self.shuffle_queue`` and are batched with
    ``tf.contrib.training.bucket_by_sequence_length``.

    Side effects: sets ``self.__vocabulary_file``, the lookup tables (and the
    reverse tables when ``self._used_for_test_data`` is true),
    ``self.enqueue_op`` and ``self.qr``, and registers ``self.qr`` as a queue
    runner.

    Args:
        file_names: list of input files; only the first one is used to locate
            the vocabulary files.
        record_defaults: forwarded to ``_read_file``.
        data_column: forwarded to ``__generate_preprocessed_files``.
        bucket_boundaries: forwarded to ``bucket_by_sequence_length``.
        field_delim: field delimiter forwarded to preprocessing and reading.
        skip_header_lines: forwarded to ``_read_file``.
        num_epochs: forwarded to ``tf.train.string_input_producer``.
        shuffle: forwarded to ``tf.train.string_input_producer``.

    Returns:
        Tuple ``(padded_sent, padded_pos, padded_chunk, padded_capitals,
        padded_entities)``; each is reshaped to ``[batch_size, -1]`` except
        the capitals, which are reshaped to ``[batch_size, -1, 1]``.
    """
    original_file_names = file_names[:]
    file_names = self.__generate_preprocessed_files(
        file_names, data_column, field_delim=field_delim)

    filename_queue = tf.train.string_input_producer(
        file_names, num_epochs=num_epochs, shuffle=shuffle)

    sentence, pos, chunks, capitals, entities = self._read_file(
        filename_queue, record_defaults, field_delim, skip_header_lines)

    # TODO: will be break with multiple filenames
    voca_path, voca_suffix = BaseDataLoader._split_file_to_path_and_name(
        original_file_names[0])
    voca_name = ConllPreprocessor.VOCABULARY_PREFIX + voca_suffix
    self.__vocabulary_file = voca_path + voca_name

    # load look up tables that maps words to ids
    if self.table is None:
        print('vocabulary table is None => creating it')
        main_voca_file = voca_path + voca_name
        if self._use_pretrained_emb:
            self.pretrained_emb_matrix, vocabulary = self.preload_embeddings(
                embed_dim=self._embed_dim,
                file_name=self._pretrained_emb_file,
                train_vocabulary=main_voca_file,
                other_vocabularies=self._other_voca_files)
            tensor_vocabulary = tf.constant(vocabulary)
            self.table = tf.contrib.lookup.index_table_from_tensor(
                tensor_vocabulary,
                default_value=ConllPreprocessor.UNK_TOKEN_ID,
                num_oov_buckets=0)
        else:
            self.table = tf.contrib.lookup.index_table_from_file(
                vocabulary_file=main_voca_file,
                default_value=ConllPreprocessor.UNK_TOKEN_ID,
                num_oov_buckets=0)

    if self.table_pos is None:
        print('vocabulary table_pos is None => creating it')
        self.table_pos = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=voca_path + self._TABLE_POS + voca_suffix,
            num_oov_buckets=0)

    if self.table_chunk is None:
        print('vocabulary table_chunk is None => creating it')
        self.table_chunk = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=voca_path + self._TABLE_CHUNK + voca_suffix,
            num_oov_buckets=0)

    if self.table_entity is None:
        print('vocabulary table_entity is None => creating it')
        self.table_entity = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=voca_path + self._TABLE_ENTITY + voca_suffix,
            num_oov_buckets=0)

    if self._used_for_test_data:
        print('Reverse vocabulary is needed => creating it')
        self.reverse_table = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=voca_path + voca_name)
        print('Reverse entity vocabulary is needed => creating it')
        self.reverse_table_entity = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=voca_path + self._TABLE_ENTITY + voca_suffix)

    # convert to tensor of strings
    split_sentence = tf.string_split([sentence], " ")
    split_pos = tf.string_split([pos], " ")
    split_chunks = tf.string_split([chunks], " ")
    split_capitals = tf.string_split([capitals], " ")
    split_entities = tf.string_split([entities], " ")

    # convert sparse to dense and map tokens to ids
    dense_sent = tf.sparse_tensor_to_dense(split_sentence, default_value="")
    dense_sent = self.table.lookup(dense_sent)
    dense_pos = tf.sparse_tensor_to_dense(split_pos, default_value="")
    dense_pos = self.table_pos.lookup(dense_pos)
    dense_chunks = tf.sparse_tensor_to_dense(split_chunks, default_value="")
    dense_chunks = self.table_chunk.lookup(dense_chunks)
    dense_capitals = tf.sparse_tensor_to_dense(split_capitals, default_value="")
    dense_capitals = tf.string_to_number(dense_capitals, out_type=tf.int64)
    dense_entities = tf.sparse_tensor_to_dense(split_entities, default_value="")
    dense_entities = self.table_entity.lookup(dense_entities)

    # get the enqueue op to pass to a coordinator to be run
    self.enqueue_op = self.shuffle_queue.enqueue(
        [dense_sent, dense_pos, dense_chunks, dense_capitals, dense_entities])
    dense_sent, dense_pos, dense_chunks, dense_capitals, dense_entities = \
        self.shuffle_queue.dequeue()

    # add queue to queue runner
    self.qr = tf.train.QueueRunner(self.shuffle_queue,
                                   [self.enqueue_op] * self.num_threads)
    tf.train.queue_runner.add_queue_runner(self.qr)

    # reshape from <unknown> shape into proper form after dequeue from random shuffle queue
    # this is needed so next queue can automatically infer the shape properly
    dense_sent = dense_sent.sg_reshape(shape=[1, -1])
    dense_pos = dense_pos.sg_reshape(shape=[1, -1])
    dense_chunks = dense_chunks.sg_reshape(shape=[1, -1])
    dense_capitals = dense_capitals.sg_reshape(shape=[1, -1])
    dense_entities = dense_entities.sg_reshape(shape=[1, -1])

    # Take the sequence length from the example that was actually dequeued.
    # A length computed from the reader output (before the shuffle queue)
    # would make every bucket enqueue read one more, unrelated record just to
    # obtain its length: the bucket would not match the sentence and that
    # record would be dropped from the data stream.
    lengths = tf.shape(dense_sent)[1:]

    _, (padded_sent, padded_pos, padded_chunk, padded_capitals, padded_entities) = \
        tf.contrib.training.bucket_by_sequence_length(
            lengths,
            [dense_sent, dense_pos, dense_chunks, dense_capitals, dense_entities],
            batch_size=self._batch_size,
            bucket_boundaries=bucket_boundaries,
            dynamic_pad=True,
            capacity=self._capacity,
            num_threads=self.num_threads,
            name='bucket_queue')

    # reshape shape into proper form after dequeue from bucket queue
    padded_sent = padded_sent.sg_reshape(shape=[self._batch_size, -1])
    padded_pos = padded_pos.sg_reshape(shape=[self._batch_size, -1])
    padded_chunk = padded_chunk.sg_reshape(shape=[self._batch_size, -1])
    padded_capitals = padded_capitals.sg_reshape(shape=[self._batch_size, -1, 1])
    padded_entities = padded_entities.sg_reshape(shape=[self._batch_size, -1])

    return padded_sent, padded_pos, padded_chunk, padded_capitals, padded_entities
def __load_batch(self, file_names, record_defaults, data_column, bucket_boundaries,
                 field_delim=_CSV_DELIM, skip_header_lines=0, num_epochs=None,
                 shuffle=True):
    """Build the queue-based input pipeline and return a padded batch.

    The files are preprocessed with ``__generate_preprocessed_files``, read
    through ``_read_file`` as ``(example, label)`` pairs, tokenised on single
    spaces and mapped to word ids through a vocabulary table loaded from
    file. Examples then pass through ``self.shuffle_queue`` and are batched
    with ``tf.contrib.training.bucket_by_sequence_length``.

    Side effects: sets ``self.__vocabulary_file``, ``self.table``,
    ``self.enqueue_op`` and ``self.qr``, and registers ``self.qr`` as a queue
    runner.

    Args:
        file_names: list of input files; only the first one is used to locate
            the vocabulary file.
        record_defaults: forwarded to ``_read_file``.
        data_column: forwarded to ``__generate_preprocessed_files``.
        bucket_boundaries: forwarded to preprocessing and to
            ``bucket_by_sequence_length``.
        field_delim: field delimiter forwarded to preprocessing and reading.
        skip_header_lines: forwarded to ``_read_file``.
        num_epochs: forwarded to ``tf.train.string_input_producer``.
        shuffle: forwarded to ``tf.train.string_input_producer``.

    Returns:
        Tuple ``(padded_examples, label_examples)`` reshaped to
        ``[batch_size, -1]`` and ``[batch_size]`` respectively.
    """
    original_file_names = file_names[:]
    file_names = self.__generate_preprocessed_files(
        file_names, data_column, bucket_boundaries, field_delim=field_delim)

    filename_queue = tf.train.string_input_producer(
        file_names, num_epochs=num_epochs, shuffle=shuffle)

    example, label = self._read_file(
        filename_queue, record_defaults, field_delim, skip_header_lines)

    # TODO: will be break with multiple filenames
    voca_path, voca_name = BaseDataLoader._split_file_to_path_and_name(
        original_file_names[0])
    voca_name = KagglePreprocessor.VOCABULARY_PREFIX + voca_name
    self.__vocabulary_file = voca_path + voca_name

    # load look up table that maps words to ids
    self.table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=voca_path + voca_name,
        default_value=KagglePreprocessor.UNK_TOKEN_ID,
        num_oov_buckets=0)

    # convert to tensor of strings
    split_example = tf.string_split([example], " ")

    # convert sparse to dense and map tokens to ids
    dense_example = tf.sparse_tensor_to_dense(split_example, default_value="")
    dense_example = self.table.lookup(dense_example)

    # get the enqueue op to pass to a coordinator to be run
    self.enqueue_op = self.shuffle_queue.enqueue([dense_example, label])
    dense_example, label = self.shuffle_queue.dequeue()

    # add queue to queue runner
    self.qr = tf.train.QueueRunner(self.shuffle_queue,
                                   [self.enqueue_op] * self.num_threads)
    tf.train.queue_runner.add_queue_runner(self.qr)

    # reshape from <unknown> shape into proper form after dequeue from random shuffle queue
    # this is needed so next queue can automatically infer the shape properly
    dense_example = dense_example.sg_reshape(shape=[1, -1])
    label = label.sg_reshape(shape=[1])

    # Take the sequence length from the example that was actually dequeued.
    # A length computed from the reader output (before the shuffle queue)
    # would make every bucket enqueue read one more, unrelated record just to
    # obtain its length: the bucket would not match the example and that
    # record would be dropped from the data stream.
    lengths = tf.shape(dense_example)[1:]

    # NOTE(review): the queue runner above uses self.num_threads while this
    # call uses self._num_threads -- confirm both attributes exist.
    _, (padded_examples, label_examples) = \
        tf.contrib.training.bucket_by_sequence_length(
            lengths,
            [dense_example, label],
            batch_size=self._batch_size,
            bucket_boundaries=bucket_boundaries,
            dynamic_pad=True,
            capacity=self._capacity,
            num_threads=self._num_threads)

    # reshape shape into proper form after dequeue from bucket queue
    padded_examples = padded_examples.sg_reshape(shape=[self._batch_size, -1])
    label_examples = label_examples.sg_reshape(shape=[self._batch_size])

    return padded_examples, label_examples
def train_loop():
    """Run the full training loop with per-epoch evaluation and checkpointing.

    Everything the loop needs (``graph``, ``config``, ``logs_path``,
    ``num_epochs``, ``datasetsize``, ``batchsize``, ``dr``, ``t_dr``, the
    placeholders ``inputs`` / ``targets`` / ``seq_len``, the ops ``cost``,
    ``train_optimizer``, ``ler``, ``merged``, ``decoded`` and the helper
    functions) is read from the enclosing module scope.

    For every epoch the function:
      * draws ``num_batches_per_epoch`` mini-batches without replacement,
        runs one optimisation step on each and writes a training summary
        every 16th batch;
      * evaluates one random test batch, prints the reference and decoded
        sequences and writes a test summary;
      * saves the model to ``savepath + '/model'``.

    Both summary writers are closed when the loop ends, also on error.
    """
    with tf.device("/cpu:0"):
        # Launch the graph
        with tf.Session(graph=graph, config=config) as sess:
            print("Starting Tensorboard...")
            initstart = time.time()
            train_writer = tf.summary.FileWriter(logs_path + '/TRAIN', graph=sess.graph)
            test_writer = tf.summary.FileWriter(logs_path + '/TEST', graph=sess.graph)
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
                                        output_partition_graphs=True)
            tf.global_variables_initializer().run()
            saver = tf.train.Saver()

            # Build the densify op once. Creating it inside the epoch loop
            # would add a new node to the graph on every epoch.
            dense_decoded_op = tf.sparse_tensor_to_dense(decoded[0], default_value=-1)

            try:
                for curr_epoch in range(num_epochs):
                    print('>>>', time.strftime('[%H:%M:%S]'), 'Epoch',
                          curr_epoch + 1, '/', num_epochs)
                    train_cost = train_ler = 0
                    start = t_time = time.time()
                    index_list = range(0, datasetsize)

                    for batch in range(num_batches_per_epoch):
                        # Draw this batch's examples and remove them from the
                        # pool; the set keeps the filter linear in pool size.
                        indexes = random.sample(index_list, batchsize)
                        drawn = set(indexes)
                        index_list = [x for x in index_list if x not in drawn]

                        train_inputs = next_miniBatch(indexes, dr[0])
                        train_targets = next_target_miniBatch(indexes, dr[1])
                        newindex = [i % num_examples for i in range(batchsize)]
                        random.shuffle(newindex)
                        batch_train_inputs = train_inputs[newindex]

                        # Padding input to max_time_step of this batch
                        batch_train_inputs, batch_train_seq_len = pad_sequences(
                            batch_train_inputs)

                        # Converting to sparse representation so as to feed
                        # SparseTensor input
                        batch_train_targets = sparse_tuple_from(train_targets[newindex])

                        feed = {
                            inputs: batch_train_inputs,
                            targets: batch_train_targets,
                            seq_len: batch_train_seq_len
                        }
                        batch_cost, _, l = sess.run(
                            [cost, train_optimizer, ler], feed, options=run_options)
                        train_cost += batch_cost * batchsize
                        train_ler += l * batchsize

                        print('[' + str(curr_epoch) + ']', ' >>>',
                              time.strftime('[%H:%M:%S]'), 'Batch', batch + 1, '/',
                              num_batches_per_epoch, '@Cost', batch_cost,
                              'Time Elapsed', time.time() - t_time, 's')
                        t_time = time.time()

                        if batch % 16 == 0:
                            summary = sess.run(merged, feed_dict=feed, options=run_options)
                            train_writer.add_summary(
                                summary, int(batch + (curr_epoch * num_batches_per_epoch)))
                            train_writer.flush()

                    # Metrics mean
                    train_cost /= num_examples
                    train_ler /= num_examples

                    # Testing
                    print('>>>', time.strftime('[%H:%M:%S]'), 'Evaluating Test Accuracy...')
                    t_index = random.sample(range(0, testsetsize), testbatchsize)
                    test_inputs = next_miniBatch(t_index, t_dr[0], test=True)
                    test_targets = next_target_miniBatch(t_index, t_dr[1])
                    newindex = [i % testbatchsize for i in range(testbatchsize)]
                    batch_test_inputs = test_inputs[newindex]
                    batch_test_inputs, batch_test_seq_len = pad_sequences(
                        batch_test_inputs, test=True)
                    batch_test_targets = sparse_tuple_from(test_targets[newindex])
                    t_feed = {
                        inputs: batch_test_inputs,
                        targets: batch_test_targets,
                        seq_len: batch_test_seq_len
                    }
                    test_ler, dense_decoded = sess.run(
                        (ler, dense_decoded_op), feed_dict=t_feed, options=run_options)

                    for i, seq in enumerate(dense_decoded):
                        # -1 marks padding introduced by the densify op
                        seq = [s for s in seq if s != -1]
                        tmp_o = decode_to_chars(test_targets[i])
                        tmp_d = decode_to_chars(seq)
                        print('Sequence %d' % i)
                        print('\t Original:\n%s' % tmp_o)
                        print('\t Decoded:\n%s' % tmp_d)
                    print('Done!')

                    log = "Epoch {}/{} | Batch Cost : {:.3f} | Train Accuracy : {:.3f}% | Test Accuracy : {:.3f}% | Time Elapsed : {:.3f}s"
                    print(
                        log.format(curr_epoch + 1, num_epochs, train_cost,
                                   100 - (train_ler * 100), 100 - (test_ler * 100),
                                   time.time() - start))

                    t_summary = sess.run(merged, feed_dict=t_feed, options=run_options)
                    # The step reuses the last batch index of this epoch.
                    test_writer.add_summary(
                        t_summary, int(batch + (curr_epoch * num_batches_per_epoch)))
                    test_writer.flush()

                    saver.save(sess, savepath + '/model')
                    print(">>> Model saved succesfully")
            finally:
                # Release the event files even when training is interrupted.
                train_writer.close()
                test_writer.close()

            print('Total Training Time: ' + str(time.time() - initstart) + 's')