def main(_): initialize_globals() if not FLAGS.test_files: log_error('You need to specify what files to use for evaluation via ' 'the --test_files flag.') exit(1) global alphabet alphabet = Alphabet(FLAGS.alphabet_config_path) # sort examples by length, improves packing of batches and timesteps test_data = preprocess( FLAGS.test_files.split(','), FLAGS.test_batch_size, alphabet=alphabet, numcep=Config.n_input, numcontext=Config.n_context, hdf5_cache_path=FLAGS.hdf5_test_set).sort_values( by="features_len", ascending=False) from DeepSpeech import create_inference_graph graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1) samples = evaluate(test_data, graph, alphabet) if FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
def test(): # Reading test set test_data = preprocess(FLAGS.test_files.split(','), FLAGS.test_batch_size, Config.n_input, Config.n_context, Config.alphabet) graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1) evaluate.evaluate(test_data, graph)
def test(): # Reading test set test_data = preprocess(FLAGS.test_files.split(','), FLAGS.test_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.test_cached_features_path) graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1) evaluate.evaluate(test_data, graph)
def test(ckpt_file, test_data): # Reading test set test_data = preprocess(test_data.split(','), FLAGS.test_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.test_cached_features_path) graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1) evaluate.evaluate(test_data, graph, Config.alphabet, ckpt_file)
def predict_comments(comments): # 加载停用词 with open('data/stopwords.txt', encoding='utf-8') as f: stopwords = [line.strip('\n') for line in f.readlines()] test_comment = random.choice(comments) pre = preprocess() # 选择其中一条分类,并去除非中文字符 content = pre.clean_str(test_comment['content']) rating = test_comment['rating']['value'] # 对评论分词 seg_list = jieba.cut(content, cut_all=False, HMM=True) # 去掉停用词和无意义的 cut_content = ' '.join( [x.strip('\n') for x in seg_list if x not in stopwords and len(x) > 1]) n_dim = 20000 vectorizer = pickle.load(open('data/vectorizer.pickle', 'rb')) # 转化为特征向量 one_test_data = vectorizer.transform([cut_content]) # 转化为 pytorch 输入的 Tensor 数据,squeeze(0) 增加一个 batch 维度 one_test_data = torch.from_numpy(one_test_data.toarray()).unsqueeze(0) model = TxtModel(n_dim, 2).double() model.to(device) model.load_state_dict(torch.load("checkpoints/moviePointEpoch3i144.pth")) # 使用准确度最好的模型预测,softmax 处理输出概率,取得最大概率的下标再加 1 则为预测的标签 pred = torch.argmax(F.softmax(model(one_test_data.to(device)), dim=1)) + 1 if rating < 3: rat = '差评1' else: rat = '好评2' print('评论内容: ', content) #print('关键字: ',cut_content) print('观众评价: ', rat) print('预测评价: ', pred)
def main(_): initialize_globals() if not FLAGS.test_files: log_error('You need to specify what files to use for evaluation via ' 'the --test_files flag.') exit(1) #if FLAGS.embeddings_output_dir: # prefix = FLAGS.embeddings_output_dir # print('Prefix :', prefix) # #print('LAYER4 :', LAYER4) # EMBEDDINGS = prefix + 'embeddings/' # LAYER4 = EMBEDDINGS + 'layer4/' # LAYER5 = EMBEDDINGS + 'layer5/' # LAYER6 = EMBEDDINGS + 'layer6/' # c.TEXT = EMBEDDINGS + 'text/' # print('LAYER4 :', LAYER4) # sort examples by length, improves packing of batches and timesteps test_data = preprocess(FLAGS.test_files.split(','), FLAGS.test_batch_size, alphabet=Config.alphabet, numcep=Config.n_input, numcontext=Config.n_context, hdf5_cache_path=FLAGS.hdf5_test_set).sort_values( by="features_len", ascending=False) #print('test_data', test_data) #print(test_data.fname[1]) #return 1 #print(test_data[0].fname) print('Batch Size: ', FLAGS.test_batch_size) from DeepSpeech import create_inference_graph graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1) samples = evaluate(test_data, graph) if FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
def main(_): initialize_globals() if not FLAGS.test_files: log_error('You need to specify what files to use for evaluation via ' 'the --test_files flag.') exit(1) global alphabet alphabet = Alphabet(FLAGS.alphabet_config_path) scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight, FLAGS.lm_binary_path, FLAGS.lm_trie_path, alphabet) # sort examples by length, improves packing of batches and timesteps test_data = preprocess( FLAGS.test_files.split(','), FLAGS.test_batch_size, alphabet=alphabet, numcep=N_FEATURES, numcontext=N_CONTEXT, hdf5_cache_path=FLAGS.hdf5_test_set).sort_values( by="features_len", ascending=False) def create_windows(features): num_strides = len(features) - (N_CONTEXT * 2) # Create a view into the array with overlapping strides of size # numcontext (past) + 1 (present) + numcontext (future) window_size = 2*N_CONTEXT+1 features = np.lib.stride_tricks.as_strided( features, (num_strides, window_size, N_FEATURES), (features.strides[0], features.strides[0], features.strides[1]), writeable=False) return features # Create overlapping windows over the features test_data['features'] = test_data['features'].apply(create_windows) with tf.Session() as session: inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1) # Transpose to batch major for decoder transposed = tf.transpose(outputs['outputs'], [1, 0, 2]) labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels") label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths") sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32) loss = tf.nn.ctc_loss(labels=sparse_labels, inputs=layers['raw_logits'], sequence_length=inputs['input_lengths']) # Create a saver using variables from the above newly created graph mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')} saver = tf.train.Saver(mapping) # Restore variables from training checkpoint checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if not checkpoint: log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir)) exit(1) checkpoint_path = checkpoint.model_checkpoint_path saver.restore(session, checkpoint_path) logitses = [] losses = [] print('Computing acoustic model predictions...') batch_count = len(test_data) // FLAGS.test_batch_size bar = progressbar.ProgressBar(max_value=batch_count, widget=progressbar.AdaptiveETA) # First pass, compute losses and transposed logits for decoding for batch in bar(split_data(test_data, FLAGS.test_batch_size)): session.run(outputs['initialize_state']) features = pad_to_dense(batch['features'].values) features_len = batch['features_len'].values labels = pad_to_dense(batch['transcript'].values) label_lengths = batch['transcript_len'].values logits, loss = session.run([transposed, loss], feed_dict={ inputs['input']: features, inputs['input_lengths']: features_len, labels_ph: labels, label_lengths_ph: label_lengths }) logitses.append(logits) losses.extend(loss) ground_truths = [] predictions = [] distances = [] print('Decoding predictions...') bar = progressbar.ProgressBar(max_value=batch_count, widget=progressbar.AdaptiveETA) # Get number of accessible CPU cores for this process num_processes = len(os.sched_getaffinity(0)) # Second pass, decode logits and compute WER and edit distance metrics for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))): seq_lengths = batch['features_len'].values.astype(np.int32) decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width, num_processes=num_processes, scorer=scorer) ground_truths.extend(alphabet.decode(l) for l in batch['transcript']) predictions.extend(d[0][1] for d in decoded) distances.extend(levenshtein(a, b) for a, b in zip(labels, predictions)) wer, samples = calculate_report(ground_truths, predictions, distances, losses) mean_edit_distance = np.mean(distances) mean_loss = np.mean(losses) # Take only the first report_count items report_samples = itertools.islice(samples, FLAGS.report_count) print('Test - WER: %f, loss: %f, mean edit distance: %f' % (wer, mean_loss, mean_edit_distance)) print('-' * 80) for sample in report_samples: print('WER: %f, loss: %f, edit distance: %f' % (sample.wer, sample.loss, sample.distance)) print(' - src: "%s"' % sample.src) print(' - res: "%s"' % sample.res) print('-' * 80) if FLAGS.test_output_file: json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
def train(server=None): r''' Trains the network on a given server of a cluster. If no server provided, it performs single process training. ''' # Initializing and starting the training coordinator coord = TrainingCoordinator(Config.is_chief) coord.start() # Create a variable to hold the global_step. # It will automagically get incremented by the optimizer. global_step = tf.Variable(0, trainable=False, name='global_step') dropout_rates = [ tf.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6) ] # Reading training set train_data = preprocess(FLAGS.train_files.split(','), FLAGS.train_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.train_cached_features_path) train_set = DataSet(train_data, FLAGS.train_batch_size, limit=FLAGS.limit_train, next_index=lambda i: coord.get_next_index('train')) # Reading validation set dev_data = preprocess(FLAGS.dev_files.split(','), FLAGS.dev_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.dev_cached_features_path) dev_set = DataSet(dev_data, FLAGS.dev_batch_size, limit=FLAGS.limit_dev, next_index=lambda i: coord.get_next_index('dev')) # Combining all sets to a multi set model feeder model_feeder = ModelFeeder(train_set, dev_set, Config.n_input, Config.n_context, Config.alphabet, tower_feeder_count=len( Config.available_devices)) # Create the optimizer optimizer = create_optimizer() # Synchronous distributed training is facilitated by a special proxy-optimizer if not server is None: optimizer = tf.train.SyncReplicasOptimizer( optimizer, replicas_to_aggregate=FLAGS.replicas_to_agg, total_num_replicas=FLAGS.replicas) # Get the data_set specific graph end-points gradients, loss = get_tower_results(model_feeder, optimizer, dropout_rates) # Average tower gradients across GPUs avg_tower_gradients = average_gradients(gradients) # Add summaries of all variables and gradients to log log_grads_and_vars(avg_tower_gradients) # Op to merge all summaries for the summary hook merge_all_summaries_op = tf.summary.merge_all() # These are saved on every step step_summaries_op = tf.summary.merge_all('step_summaries') step_summary_writers = { 'train': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), 'dev': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120) } # Apply gradients to modify the model apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) if FLAGS.early_stop is True and not FLAGS.validation_step > 0: log_warn( 'Parameter --validation_step needs to be >0 for early stopping to work' ) class CoordHook(tf.train.SessionRunHook): r''' Embedded coordination hook-class that will use variables of the surrounding Python context. ''' def after_create_session(self, session, coord): log_debug('Starting queue runners...') model_feeder.start_queue_threads(session, coord) log_debug('Queue runners started.') def end(self, session): # Closing the data_set queues log_debug('Closing queues...') model_feeder.close_queues(session) log_debug('Queues closed.') # Telling the ps that we are done send_token_to_ps(session) # Collecting the hooks hooks = [CoordHook()] # Hook to handle initialization and queues for sync replicas. if not server is None: hooks.append(optimizer.make_session_run_hook(Config.is_chief)) # Hook to save TensorBoard summaries if FLAGS.summary_secs > 0: hooks.append( tf.train.SummarySaverHook(save_secs=FLAGS.summary_secs, output_dir=FLAGS.summary_dir, summary_op=merge_all_summaries_op)) # Hook wih number of checkpoint files to save in checkpoint_dir if FLAGS.train and FLAGS.max_to_keep > 0: saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep) hooks.append( tf.train.CheckpointSaverHook(checkpoint_dir=FLAGS.checkpoint_dir, save_secs=FLAGS.checkpoint_secs, saver=saver)) no_dropout_feed_dict = { dropout_rates[0]: 0., dropout_rates[1]: 0., dropout_rates[2]: 0., dropout_rates[3]: 0., dropout_rates[4]: 0., dropout_rates[5]: 0., } # Progress Bar def update_progressbar(set_name): if not hasattr(update_progressbar, 'current_set_name'): update_progressbar.current_set_name = None if (update_progressbar.current_set_name != set_name or update_progressbar.current_job_index == update_progressbar.total_jobs): # finish prev pbar if it exists if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar: update_progressbar.pbar.finish() update_progressbar.total_jobs = None update_progressbar.current_job_index = 0 current_epoch = coord._epoch - 1 if set_name == "train": log_info('Training epoch %i...' % current_epoch) update_progressbar.total_jobs = coord._num_jobs_train else: log_info('Validating epoch %i...' % current_epoch) update_progressbar.total_jobs = coord._num_jobs_dev # recreate pbar update_progressbar.pbar = progressbar.ProgressBar( max_value=update_progressbar.total_jobs, redirect_stdout=True).start() update_progressbar.current_set_name = set_name if update_progressbar.pbar: update_progressbar.pbar.update( update_progressbar.current_job_index + 1, force=True) update_progressbar.current_job_index += 1 # Initialize update_progressbar()'s child fields to safe values update_progressbar.pbar = None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. try: with tf.train.MonitoredTrainingSession( master='' if server is None else server.target, is_chief=Config.is_chief, hooks=hooks, checkpoint_dir=FLAGS.checkpoint_dir, save_checkpoint_secs=None, # already taken care of by a hook log_step_count_steps= 0, # disable logging of steps/s to avoid TF warning in validation sets config=Config.session_config) as session: tf.get_default_graph().finalize() try: if Config.is_chief: # Retrieving global_step from the (potentially restored) model model_feeder.set_data_set(no_dropout_feed_dict, model_feeder.train) step = session.run(global_step, feed_dict=no_dropout_feed_dict) coord.start_coordination(model_feeder, step) # Get the first job job = coord.get_job() while job and not session.should_stop(): log_debug('Computing %s...' % job) is_train = job.set_name == 'train' # The feed_dict (mainly for switching between queues) if is_train: feed_dict = { dropout_rates[0]: FLAGS.dropout_rate, dropout_rates[1]: FLAGS.dropout_rate2, dropout_rates[2]: FLAGS.dropout_rate3, dropout_rates[3]: FLAGS.dropout_rate4, dropout_rates[4]: FLAGS.dropout_rate5, dropout_rates[5]: FLAGS.dropout_rate6, } else: feed_dict = no_dropout_feed_dict # Sets the current data_set for the respective placeholder in feed_dict model_feeder.set_data_set( feed_dict, getattr(model_feeder, job.set_name)) # Initialize loss aggregator total_loss = 0.0 # Setting the training operation in case of training requested train_op = apply_gradient_op if is_train else [] # So far the only extra parameter is the feed_dict extra_params = {'feed_dict': feed_dict} step_summary_writer = step_summary_writers.get( job.set_name) # Loop over the batches for job_step in range(job.steps): if session.should_stop(): break log_debug('Starting batch...') # Compute the batch _, current_step, batch_loss, step_summary = session.run( [train_op, global_step, loss, step_summaries_op], **extra_params) # Log step summaries step_summary_writer.add_summary( step_summary, current_step) # Uncomment the next line for debugging race conditions / distributed TF log_debug('Finished batch step %d.' % current_step) # Add batch to loss total_loss += batch_loss # Gathering job results job.loss = total_loss / job.steps # Display progressbar if FLAGS.show_progressbar: update_progressbar(job.set_name) # Send the current job to coordinator and receive the next one log_debug('Sending %s...' % job) job = coord.next_job(job) if update_progressbar.pbar: update_progressbar.pbar.finish() except Exception as e: log_error(str(e)) traceback.print_exc() # Calling all hook's end() methods to end blocking calls for hook in hooks: hook.end(session) # Only chief has a SyncReplicasOptimizer queue runner that needs to be stopped for unblocking process exit. # A rather graceful way to do this is by stopping the ps. # Only one party can send it w/o failing. if Config.is_chief: send_token_to_ps(session, kill=True) sys.exit(1) log_debug('Session closed.') except tf.errors.InvalidArgumentError as e: log_error(str(e)) log_error( 'The checkpoint in {0} does not match the shapes of the model.' ' Did you change alphabet.txt or the --n_hidden parameter' ' between train runs using the same checkpoint dir? Try moving' ' or removing the contents of {0}.'.format(FLAGS.checkpoint_dir)) sys.exit(1) # Stopping the coordinator coord.stop()
def train(server=None): r''' Trains the network on a given server of a cluster. If no server provided, it performs single process training. ''' # Initializing and starting the training coordinator coord = TrainingCoordinator(Config.is_chief) coord.start() # Create a variable to hold the global_step. # It will automagically get incremented by the optimizer. global_step = tf.Variable(0, trainable=False, name='global_step') dropout_rates = [tf.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6)] # Reading training set train_data = preprocess(FLAGS.train_files.split(','), FLAGS.train_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.train_cached_features_path) train_set = DataSet(train_data, FLAGS.train_batch_size, limit=FLAGS.limit_train, next_index=lambda i: coord.get_next_index('train')) # Reading validation set dev_data = preprocess(FLAGS.dev_files.split(','), FLAGS.dev_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.dev_cached_features_path) dev_set = DataSet(dev_data, FLAGS.dev_batch_size, limit=FLAGS.limit_dev, next_index=lambda i: coord.get_next_index('dev')) # Combining all sets to a multi set model feeder model_feeder = ModelFeeder(train_set, dev_set, Config.n_input, Config.n_context, Config.alphabet, tower_feeder_count=len(Config.available_devices)) # Create the optimizer optimizer = create_optimizer() # Synchronous distributed training is facilitated by a special proxy-optimizer if not server is None: optimizer = tf.train.SyncReplicasOptimizer(optimizer, replicas_to_aggregate=FLAGS.replicas_to_agg, total_num_replicas=FLAGS.replicas) # Get the data_set specific graph end-points gradients, loss = get_tower_results(model_feeder, optimizer, dropout_rates) # Average tower gradients across GPUs avg_tower_gradients = average_gradients(gradients) # Add summaries of all variables and gradients to log log_grads_and_vars(avg_tower_gradients) # Op to merge all summaries for the summary hook merge_all_summaries_op = tf.summary.merge_all() # These are saved on every step step_summaries_op = tf.summary.merge_all('step_summaries') step_summary_writers = { 'train': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), 'dev': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120) } # Apply gradients to modify the model apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) if FLAGS.early_stop is True and not FLAGS.validation_step > 0: log_warn('Parameter --validation_step needs to be >0 for early stopping to work') class CoordHook(tf.train.SessionRunHook): r''' Embedded coordination hook-class that will use variables of the surrounding Python context. ''' def after_create_session(self, session, coord): log_debug('Starting queue runners...') model_feeder.start_queue_threads(session, coord) log_debug('Queue runners started.') def end(self, session): # Closing the data_set queues log_debug('Closing queues...') model_feeder.close_queues(session) log_debug('Queues closed.') # Telling the ps that we are done send_token_to_ps(session) # Collecting the hooks hooks = [CoordHook()] # Hook to handle initialization and queues for sync replicas. if not server is None: hooks.append(optimizer.make_session_run_hook(Config.is_chief)) # Hook to save TensorBoard summaries if FLAGS.summary_secs > 0: hooks.append(tf.train.SummarySaverHook(save_secs=FLAGS.summary_secs, output_dir=FLAGS.summary_dir, summary_op=merge_all_summaries_op)) # Hook wih number of checkpoint files to save in checkpoint_dir if FLAGS.train and FLAGS.max_to_keep > 0: saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep) hooks.append(tf.train.CheckpointSaverHook(checkpoint_dir=FLAGS.checkpoint_dir, save_secs=FLAGS.checkpoint_secs, saver=saver)) no_dropout_feed_dict = { dropout_rates[0]: 0., dropout_rates[1]: 0., dropout_rates[2]: 0., dropout_rates[3]: 0., dropout_rates[4]: 0., dropout_rates[5]: 0., } # Progress Bar def update_progressbar(set_name): if not hasattr(update_progressbar, 'current_set_name'): update_progressbar.current_set_name = None if (update_progressbar.current_set_name != set_name or update_progressbar.current_job_index == update_progressbar.total_jobs): # finish prev pbar if it exists if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar: update_progressbar.pbar.finish() update_progressbar.total_jobs = None update_progressbar.current_job_index = 0 current_epoch = coord._epoch-1 if set_name == "train": log_info('Training epoch %i...' % current_epoch) update_progressbar.total_jobs = coord._num_jobs_train else: log_info('Validating epoch %i...' % current_epoch) update_progressbar.total_jobs = coord._num_jobs_dev # recreate pbar update_progressbar.pbar = progressbar.ProgressBar(max_value=update_progressbar.total_jobs, redirect_stdout=True).start() update_progressbar.current_set_name = set_name if update_progressbar.pbar: update_progressbar.pbar.update(update_progressbar.current_job_index+1, force=True) update_progressbar.current_job_index += 1 # Initialize update_progressbar()'s child fields to safe values update_progressbar.pbar = None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. try: with tf.train.MonitoredTrainingSession(master='' if server is None else server.target, is_chief=Config.is_chief, hooks=hooks, checkpoint_dir=FLAGS.checkpoint_dir, save_checkpoint_secs=None, # already taken care of by a hook log_step_count_steps=0, # disable logging of steps/s to avoid TF warning in validation sets config=Config.session_config) as session: tf.get_default_graph().finalize() try: if Config.is_chief: # Retrieving global_step from the (potentially restored) model model_feeder.set_data_set(no_dropout_feed_dict, model_feeder.train) step = session.run(global_step, feed_dict=no_dropout_feed_dict) coord.start_coordination(model_feeder, step) # Get the first job job = coord.get_job() while job and not session.should_stop(): log_debug('Computing %s...' % job) is_train = job.set_name == 'train' # The feed_dict (mainly for switching between queues) if is_train: feed_dict = { dropout_rates[0]: FLAGS.dropout_rate, dropout_rates[1]: FLAGS.dropout_rate2, dropout_rates[2]: FLAGS.dropout_rate3, dropout_rates[3]: FLAGS.dropout_rate4, dropout_rates[4]: FLAGS.dropout_rate5, dropout_rates[5]: FLAGS.dropout_rate6, } else: feed_dict = no_dropout_feed_dict # Sets the current data_set for the respective placeholder in feed_dict model_feeder.set_data_set(feed_dict, getattr(model_feeder, job.set_name)) # Initialize loss aggregator total_loss = 0.0 # Setting the training operation in case of training requested train_op = apply_gradient_op if is_train else [] # So far the only extra parameter is the feed_dict extra_params = { 'feed_dict': feed_dict } step_summary_writer = step_summary_writers.get(job.set_name) # Loop over the batches for job_step in range(job.steps): if session.should_stop(): break log_debug('Starting batch...') # Compute the batch _, current_step, batch_loss, step_summary = session.run([train_op, global_step, loss, step_summaries_op], **extra_params) # Log step summaries step_summary_writer.add_summary(step_summary, current_step) # Uncomment the next line for debugging race conditions / distributed TF log_debug('Finished batch step %d.' % current_step) # Add batch to loss total_loss += batch_loss # Gathering job results job.loss = total_loss / job.steps # Display progressbar if FLAGS.show_progressbar: update_progressbar(job.set_name) # Send the current job to coordinator and receive the next one log_debug('Sending %s...' % job) job = coord.next_job(job) if update_progressbar.pbar: update_progressbar.pbar.finish() except Exception as e: log_error(str(e)) traceback.print_exc() # Calling all hook's end() methods to end blocking calls for hook in hooks: hook.end(session) # Only chief has a SyncReplicasOptimizer queue runner that needs to be stopped for unblocking process exit. # A rather graceful way to do this is by stopping the ps. # Only one party can send it w/o failing. if Config.is_chief: send_token_to_ps(session, kill=True) sys.exit(1) log_debug('Session closed.') except tf.errors.InvalidArgumentError as e: log_error(str(e)) log_error('The checkpoint in {0} does not match the shapes of the model.' ' Did you change alphabet.txt or the --n_hidden parameter' ' between train runs using the same checkpoint dir? Try moving' ' or removing the contents of {0}.'.format(FLAGS.checkpoint_dir)) sys.exit(1) # Stopping the coordinator coord.stop()
def preprocess_data(filename): with open(filename, 'r') as f: return preprocess.preprocess(f)
def main(_): initialize_globals() if not FLAGS.test_files: log_error('You need to specify what files to use for evaluation via ' 'the --test_files flag.') exit(1) global alphabet alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path)) # sort examples by length, improves packing of batches and timesteps test_data = preprocess(FLAGS.test_files.split(','), FLAGS.test_batch_size, alphabet=alphabet, numcep=N_FEATURES, numcontext=N_CONTEXT, hdf5_cache_path=FLAGS.hdf5_test_set).sort_values( by="features_len", ascending=False) def create_windows(features): num_strides = len(features) - (N_CONTEXT * 2) # Create a view into the array with overlapping strides of size # numcontext (past) + 1 (present) + numcontext (future) window_size = 2 * N_CONTEXT + 1 features = np.lib.stride_tricks.as_strided( features, (num_strides, window_size, N_FEATURES), (features.strides[0], features.strides[0], features.strides[1]), writeable=False) return features test_data['features'] = test_data['features'].apply(create_windows) with tf.Session() as session: inputs, outputs = create_inference_graph( batch_size=FLAGS.test_batch_size, n_steps=N_STEPS) seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size]) decode_logits_ph = tf.placeholder( tf.float32, [None, FLAGS.test_batch_size, alphabet.size() + 1]) labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None]) label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size]) decoded, _ = decode_with_lm(decode_logits_ph, seq_lengths_ph, merge_repeated=False, beam_width=FLAGS.beam_width) sparse_labels = tf.cast( ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32) loss = tf.nn.ctc_loss(labels=sparse_labels, inputs=decode_logits_ph, sequence_length=seq_lengths_ph) distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), sparse_labels) # Create a saver using variables from the above newly created graph mapping = { v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_') } saver = tf.train.Saver(mapping) # Restore variables from training checkpoint checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if not checkpoint: log_error( 'Checkpoint directory ({}) does not contain a valid checkpoint state.' .format(FLAGS.checkpoint_dir)) exit(1) checkpoint_path = checkpoint.model_checkpoint_path saver.restore(session, checkpoint_path) logitses = [] batch_count = len(test_data) // FLAGS.test_batch_size bar = progressbar.ProgressBar(max_value=batch_count - 1, widget=progressbar.AdaptiveETA) for batch in bar(split_data(test_data, FLAGS.test_batch_size)): session.run(outputs['initialize_state']) batch_features = pad_to_dense(batch['features'].values) batch_features_len = batch['features_len'].values full_step_len = np.full_like(batch_features_len, N_STEPS) logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1]) for i in range(0, batch_features.shape[1], N_STEPS): chunk_features = batch_features[:, i:i + N_STEPS, :, :] chunk_features_len = np.minimum(batch_features_len, full_step_len) # pad with zeros if the chunk does not have enough steps steps_in_chunk = chunk_features.shape[1] if steps_in_chunk < FLAGS.n_steps: chunk_features = np.pad( chunk_features, ((0, 0), (0, FLAGS.n_steps - steps_in_chunk), (0, 0), (0, 0)), mode='constant', constant_values=0) output = session.run(outputs['outputs'], feed_dict={ inputs['input']: chunk_features, inputs['input_lengths']: chunk_features_len, }) logits = np.concatenate((logits, output)) # we have processed N_STEPS so subtract from remaining steps batch_features_len -= N_STEPS # clip to zero batch_features_len = np.maximum( batch_features_len, np.zeros_like(batch_features_len)) logitses.append(logits) ground_truths = [] predictions = [] distances = [] losses = [] bar = progressbar.ProgressBar(max_value=batch_count - 1, widget=progressbar.AdaptiveETA) for logits, batch in bar( zip(logitses, split_data(test_data, FLAGS.test_batch_size))): seq_lengths = batch['features_len'].values labels = pad_to_dense(batch['transcript'].values) label_lengths = batch['transcript_len'].values decoded_, loss_, distance_, sparse_labels_ = session.run( [decoded, loss, distance, sparse_labels], feed_dict={ decode_logits_ph: logits, seq_lengths_ph: seq_lengths, labels_ph: labels, label_lengths_ph: label_lengths }) ground_truths.extend( sparse_tensor_value_to_texts(sparse_labels_, alphabet)) predictions.extend( sparse_tensor_value_to_texts(decoded_[0], alphabet)) distances.extend(distance_) losses.extend(loss_) wer, samples = calculate_report(ground_truths, predictions, distances, losses) mean_edit_distance = np.mean(distances) mean_loss = np.mean(losses) # Filter out all items with WER=0 and take only the first report_count items report_samples = itertools.islice((s for s in samples if s.wer > 0), FLAGS.report_count) print('Test - WER: %f, loss: %f, mean edit distance: %f' % (wer, mean_loss, mean_edit_distance)) print('-' * 80) for sample in report_samples: print('WER: %f, loss: %f, mean edit distance: %f' % (sample.wer, sample.loss, sample.distance)) print(' - src: "%s"' % sample.src) print(' - res: "%s"' % sample.res) print('-' * 80) if FLAGS.test_output_file: json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
# Import from model.sequential import Sequential from model.util.layer import Layer from util.preprocess import preprocess from util.metrics import binary_accuracy_score import matplotlib.pyplot as plt # Read *.csv file and preprocess X, y = preprocess('./test/data_weather.csv') # Define nb of layers and nb of neurons model = Sequential([ Layer(5), # param 1: nb of neurons Layer(5), Layer(5), Layer(5), Layer(1) ]) # define batch_size and epochs as you wish! batch_size = 2 epochs = 1000 # build model model.compile() # let's train! model.fit( X,
def train(): r''' Trains the network on a given server of a cluster. If no server provided, it performs single process training. ''' # Reading training set train_index = SampleIndex() train_data = preprocess(FLAGS.train_files.split(','), FLAGS.train_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.train_cached_features_path) train_set = DataSet(train_data, FLAGS.train_batch_size, limit=FLAGS.limit_train, next_index=train_index.inc) # Reading validation set dev_index = SampleIndex() dev_data = preprocess(FLAGS.dev_files.split(','), FLAGS.dev_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.dev_cached_features_path) dev_set = DataSet(dev_data, FLAGS.dev_batch_size, limit=FLAGS.limit_dev, next_index=dev_index.inc) # Combining all sets to a multi set model feeder model_feeder = ModelFeeder(train_set, dev_set, Config.n_input, Config.n_context, Config.alphabet, tower_feeder_count=len( Config.available_devices)) # Dropout dropout_rates = [ tf.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6) ] dropout_feed_dict = { dropout_rates[0]: FLAGS.dropout_rate, dropout_rates[1]: FLAGS.dropout_rate2, dropout_rates[2]: FLAGS.dropout_rate3, dropout_rates[3]: FLAGS.dropout_rate4, dropout_rates[4]: FLAGS.dropout_rate5, dropout_rates[5]: FLAGS.dropout_rate6, } no_dropout_feed_dict = { dropout_rates[0]: 0., dropout_rates[1]: 0., dropout_rates[2]: 0., dropout_rates[3]: 0., dropout_rates[4]: 0., dropout_rates[5]: 0., } # Building the graph optimizer = create_optimizer() gradients, loss = get_tower_results(model_feeder, optimizer, dropout_rates) # Average tower gradients across GPUs avg_tower_gradients = average_gradients(gradients) log_grads_and_vars(avg_tower_gradients) # global_step is automagically incremented by the optimizer global_step = tf.Variable(0, trainable=False, name='global_step') apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) # Summaries step_summaries_op = tf.summary.merge_all('step_summaries') step_summary_writers = { 'train': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), 'dev': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120) } # Checkpointing checkpoint_saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep) checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train') checkpoint_filename = 'checkpoint' best_dev_saver = tf.train.Saver(max_to_keep=1) best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev') best_dev_filename = 'best_dev_checkpoint' initializer = tf.global_variables_initializer() with tf.Session(config=Config.session_config) as session: log_debug('Session opened.') tf.get_default_graph().finalize() # Loading or initializing loaded = False if FLAGS.load in ['auto', 'last']: loaded = try_loading(session, checkpoint_saver, checkpoint_filename, 'most recent epoch') if not loaded and FLAGS.load in ['auto', 'best']: loaded = try_loading(session, best_dev_saver, best_dev_filename, 'best validation') if not loaded: if FLAGS.load in ['auto', 'init']: log_info('Initializing...') session.run(initializer) else: log_error( 'Unable to load %s model from specified checkpoint dir' ' - consider using load option "auto" or "init".' % FLAGS.load) sys.exit(1) # Retrieving global_step from restored model and setting training parameters accordingly model_feeder.set_data_set(no_dropout_feed_dict, train_set) step = session.run(global_step, feed_dict=no_dropout_feed_dict) num_gpus = len(Config.available_devices) steps_per_epoch = max(1, train_set.total_batches // num_gpus) steps_trained = step % steps_per_epoch current_epoch = step // steps_per_epoch target_epoch = current_epoch + abs( FLAGS.epoch) if FLAGS.epoch < 0 else FLAGS.epoch train_index.index = steps_trained * num_gpus log_debug('step: %d' % step) log_debug('epoch: %d' % current_epoch) log_debug('target epoch: %d' % target_epoch) log_debug('steps per epoch: %d' % steps_per_epoch) log_debug('batches per step (GPUs): %d' % num_gpus) log_debug('number of batches in train set: %d' % train_set.total_batches) log_debug('number of batches already trained in epoch: %d' % train_index.index) def run_set(set_name): data_set = getattr(model_feeder, set_name) is_train = set_name == 'train' train_op = apply_gradient_op if is_train else [] feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict model_feeder.set_data_set(feed_dict, data_set) total_loss = 0.0 step_summary_writer = step_summary_writers.get(set_name) num_steps = max(1, data_set.total_batches // num_gpus) checkpoint_time = time.time() if FLAGS.show_progressbar: pbar = progressbar.ProgressBar(max_value=num_steps, redirect_stdout=True).start() # Batch loop for step_index in range(steps_trained, num_steps): if coord.should_stop(): break _, current_step, batch_loss, step_summary = \ session.run([train_op, global_step, loss, step_summaries_op], feed_dict=feed_dict) total_loss += batch_loss step_summary_writer.add_summary(step_summary, current_step) if FLAGS.show_progressbar: pbar.update(step_index + 1, force=True) if is_train and FLAGS.checkpoint_secs > 0 and time.time( ) - checkpoint_time > FLAGS.checkpoint_secs: checkpoint_saver.save(session, checkpoint_path, global_step=current_step) checkpoint_time = time.time() if FLAGS.show_progressbar: pbar.finish() return total_loss / num_steps if target_epoch > current_epoch: log_info('STARTING Optimization') best_dev_loss = float('inf') dev_losses = [] coord = tf.train.Coordinator() with coord.stop_on_exception(): log_debug('Starting queue runners...') model_feeder.start_queue_threads(session, coord=coord) log_debug('Queue runners started.') # Epoch loop for current_epoch in range(current_epoch, target_epoch): # Training if coord.should_stop(): break log_info('Training epoch %d ...' % current_epoch) train_loss = run_set('train') log_info('Finished training epoch %d - loss: %f' % (current_epoch, train_loss)) checkpoint_saver.save(session, checkpoint_path, global_step=global_step) steps_trained = 0 # Validation log_info('Validating epoch %d ...' % current_epoch) dev_loss = run_set('dev') dev_losses.append(dev_loss) log_info('Finished validating epoch %d - loss: %f' % (current_epoch, dev_loss)) if dev_loss < best_dev_loss: best_dev_loss = dev_loss save_path = best_dev_saver.save( session, best_dev_path, latest_filename=best_dev_filename) log_info( "Saved new best validating model with loss %f to: %s" % (best_dev_loss, save_path)) # Early stopping if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps: mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1]) std_loss = np.std(dev_losses[-FLAGS.es_steps:-1]) dev_losses = dev_losses[-FLAGS.es_steps:] log_debug( 'Checking for early stopping (last %d steps) validation loss: ' '%f, with standard deviation: %f and mean: %f' % (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss)) if dev_losses[-1] > np.max(dev_losses[:-1]) or \ (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th): log_info( 'Early stop triggered as (for last %d steps) validation loss:' ' %f with standard deviation: %f and mean: %f' % (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss)) break log_debug('Closing queues...') coord.request_stop() model_feeder.close_queues(session) log_debug('Queues closed.') else: log_info('Target epoch already reached - skipped training.') log_debug('Session closed.')
def train(server=None): r''' Trains the network on a given server of a cluster. If no server provided, it performs single process training. ''' # The transfer learning approach here need us to supply the layers which we # want to exclude from the source model. # Say we want to exclude all layers except for the first one, we can use this: # # drop_source_layers=['2', '3', 'lstm', '5', '6'] # # If we want to use all layers from the source model except the last one, we use this: # # drop_source_layers=['6'] # drop_source_layers = ['2', '3', 'lstm', '5', '6'][-int(FLAGS.drop_source_layers):] # Initializing and starting the training coordinator coord = TrainingCoordinator(Config.is_chief) coord.start() # Create a variable to hold the global_step. # It will automagically get incremented by the optimizer. global_step = tf.Variable(0, trainable=False, name='global_step') dropout_rates = [ tf.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6) ] # Reading training set train_data = preprocess(FLAGS.train_files.split(','), FLAGS.train_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.train_cached_features_path) train_set = DataSet(train_data, FLAGS.train_batch_size, limit=FLAGS.limit_train, next_index=lambda i: coord.get_next_index('train')) # Reading validation set dev_data = preprocess(FLAGS.dev_files.split(','), FLAGS.dev_batch_size, Config.n_input, Config.n_context, Config.alphabet, hdf5_cache_path=FLAGS.dev_cached_features_path) dev_set = DataSet(dev_data, FLAGS.dev_batch_size, limit=FLAGS.limit_dev, next_index=lambda i: coord.get_next_index('dev')) # Combining all sets to a multi set model feeder model_feeder = ModelFeeder(train_set, dev_set, Config.n_input, Config.n_context, Config.alphabet, tower_feeder_count=len( Config.available_devices)) # Create the optimizer optimizer = create_optimizer() # Synchronous distributed training is facilitated by a special proxy-optimizer if not server is None: optimizer = tf.train.SyncReplicasOptimizer( optimizer, replicas_to_aggregate=FLAGS.replicas_to_agg, total_num_replicas=FLAGS.replicas) # Get the data_set specific graph end-points gradients, loss = get_tower_results(model_feeder, optimizer, dropout_rates, drop_source_layers) # Average tower gradients across GPUs avg_tower_gradients = average_gradients(gradients) # Add summaries of all variables and gradients to log log_grads_and_vars(avg_tower_gradients) # Op to merge all summaries for the summary hook merge_all_summaries_op = tf.summary.merge_all() # These are saved on every step step_summaries_op = tf.summary.merge_all('step_summaries') step_summary_writers = { 'train': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), 'dev': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120) } # Apply gradients to modify the model apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) if FLAGS.early_stop is True and not FLAGS.validation_step > 0: log_warn( 'Parameter --validation_step needs to be >0 for early stopping to work' ) class CoordHook(tf.train.SessionRunHook): r''' Embedded coordination hook-class that will use variables of the surrounding Python context. ''' def after_create_session(self, session, coord): log_debug('Starting queue runners...') model_feeder.start_queue_threads(session, coord) log_debug('Queue runners started.') def end(self, session): # Closing the data_set queues log_debug('Closing queues...') model_feeder.close_queues(session) log_debug('Queues closed.') # Telling the ps that we are done send_token_to_ps(session) # Collecting the hooks hooks = [CoordHook()] # Hook to handle initialization and queues for sync replicas. if not server is None: hooks.append(optimizer.make_session_run_hook(Config.is_chief)) # Hook to save TensorBoard summaries if FLAGS.summary_secs > 0: hooks.append( tf.train.SummarySaverHook(save_secs=FLAGS.summary_secs, output_dir=FLAGS.summary_dir, summary_op=merge_all_summaries_op)) # Hook wih number of checkpoint files to save in checkpoint_dir if FLAGS.train and FLAGS.max_to_keep > 0: saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep) hooks.append( tf.train.CheckpointSaverHook(checkpoint_dir=FLAGS.checkpoint_dir, save_secs=FLAGS.checkpoint_secs, saver=saver)) no_dropout_feed_dict = { dropout_rates[0]: 0., dropout_rates[1]: 0., dropout_rates[2]: 0., dropout_rates[3]: 0., dropout_rates[4]: 0., dropout_rates[5]: 0., } # Progress Bar def update_progressbar(set_name): if not hasattr(update_progressbar, 'current_set_name'): update_progressbar.current_set_name = None if (update_progressbar.current_set_name != set_name or update_progressbar.current_job_index == update_progressbar.total_jobs): # finish prev pbar if it exists if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar: update_progressbar.pbar.finish() update_progressbar.total_jobs = None update_progressbar.current_job_index = 0 current_epoch = coord._epoch - 1 sufix = "graph_noisySVA_CV_2layers_" checkpoint_stash = "/docker_files/ckpt_stash/" checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) checkpoint_path = checkpoint.model_checkpoint_path ckpt_dest_name = sufix + str(current_epoch - 118) + "_eph" str_to_replace = "s/" + checkpoint_path.split( '/')[-1] + "/" + ckpt_dest_name + "/" subprocess.Popen( ["cp", checkpoint_path + ".meta", checkpoint_stash]) #pdb.set_trace() subprocess.Popen([ "rename", str_to_replace, checkpoint_stash + checkpoint_path.split('/')[-1] + ".meta" ]) subprocess.Popen([ "cp", checkpoint_path + ".data-00000-of-00001", checkpoint_stash ]) subprocess.Popen([ "rename", str_to_replace, checkpoint_stash + checkpoint_path.split('/')[-1] + ".data-00000-of-00001" ]) subprocess.Popen( ["cp", checkpoint_path + ".index", checkpoint_stash]) subprocess.Popen([ "rename", str_to_replace, checkpoint_stash + checkpoint_path.split('/')[-1] + ".index" ]) #HERE if set_name == "train": log_info('Training epoch %i...' % current_epoch) update_progressbar.total_jobs = coord._num_jobs_train else: log_info('Validating epoch %i...' % current_epoch) update_progressbar.total_jobs = coord._num_jobs_dev # recreate pbar update_progressbar.pbar = progressbar.ProgressBar( max_value=update_progressbar.total_jobs, redirect_stdout=True).start() update_progressbar.current_set_name = set_name if update_progressbar.pbar: update_progressbar.pbar.update( update_progressbar.current_job_index + 1, force=True) update_progressbar.current_job_index += 1 # Initialize update_progressbar()'s child fields to safe values update_progressbar.pbar = None ### TRANSFER LEARNING ### def init_fn(scaffold, session): if FLAGS.source_model_checkpoint_dir: drop_source_layers.append('layer_6') print('Initializing from', FLAGS.source_model_checkpoint_dir) ckpt = tf.train.load_checkpoint(FLAGS.source_model_checkpoint_dir) variables = list(ckpt.get_variable_to_shape_map().keys()) for v in tf.global_variables(): if not any(layer in v.op.name for layer in drop_source_layers): #if not v.name.count('b6') or not v.name.count('h6') or not v.name.count('raw_logits'): with open("/data/german_DS/deepspeech-german/nodes.txt", "w") as nodetxtfile: print('Loading', v.op.name) nodetxtfile.write(v.op.name) v.load(ckpt.get_tensor(v.op.name), session=session) scaffold = tf.train.Scaffold( init_op=tf.variables_initializer([ v for v in tf.global_variables() if any(layer in v.op.name for layer in drop_source_layers) ] #or v.name.count('b6')] ), init_fn=init_fn) ### TRANSFER LEARNING ### pdb.set_trace() # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. try: with tf.train.MonitoredTrainingSession( master='' if server is None else server.target, is_chief=Config.is_chief, hooks=hooks, scaffold=scaffold, # transfer-learning checkpoint_dir=FLAGS.checkpoint_dir, save_checkpoint_secs=None, # already taken care of by a hook log_step_count_steps= 0, # disable logging of steps/s to avoid TF warning in validation sets config=Config.session_config) as session: #tf.get_default_graph().finalize() #do_export = False try: if Config.is_chief: # Retrieving global_step from the (potentially restored) model model_feeder.set_data_set(no_dropout_feed_dict, model_feeder.train) step = session.run(global_step, feed_dict=no_dropout_feed_dict) coord.start_coordination(model_feeder, step) #if do_export: #export(session) #print("########INDISE EXPORT###########") #do_export = True # Get the first job job = coord.get_job() while job and not session.should_stop(): log_debug('Computing %s...' % job) is_train = job.set_name == 'train' # The feed_dict (mainly for switching between queues) if is_train: feed_dict = { dropout_rates[0]: FLAGS.dropout_rate, dropout_rates[1]: FLAGS.dropout_rate2, dropout_rates[2]: FLAGS.dropout_rate3, dropout_rates[3]: FLAGS.dropout_rate4, dropout_rates[4]: FLAGS.dropout_rate5, dropout_rates[5]: FLAGS.dropout_rate6, } else: feed_dict = no_dropout_feed_dict # Sets the current data_set for the respective placeholder in feed_dict model_feeder.set_data_set( feed_dict, getattr(model_feeder, job.set_name)) # Initialize loss aggregator total_loss = 0.0 # Setting the training operation in case of training requested train_op = apply_gradient_op if is_train else [] # So far the only extra parameter is the feed_dict extra_params = {'feed_dict': feed_dict} step_summary_writer = step_summary_writers.get( job.set_name) # Loop over the batches for job_step in range(job.steps): if session.should_stop(): break log_debug('Starting batch...') # Compute the batch _, current_step, batch_loss, step_summary = session.run( [train_op, global_step, loss, step_summaries_op], **extra_params) # Log step summaries step_summary_writer.add_summary( step_summary, current_step) # Uncomment the next line for debugging race conditions / distributed TF log_debug('Finished batch step %d.' % current_step) # Add batch to loss total_loss += batch_loss # Gathering job results job.loss = total_loss / job.steps # Display progressbar if FLAGS.show_progressbar: update_progressbar(job.set_name) # Send the current job to coordinator and receive the next one log_debug('Sending %s...' % job) job = coord.next_job(job) if update_progressbar.pbar: update_progressbar.pbar.finish() #export() #mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')} #saver = tf.train.Saver(mapping) #def do_graph_freeze(output_file=None, output_node_names=None, variables_blacklist=None): # freeze_graph.freeze_graph_with_def_protos( # input_graph_def=session.graph_def, # input_saver_def=saver.as_saver_def(), # input_checkpoint=checkpoint_path, # output_node_names=output_node_names, # restore_op_name=None, # filename_tensor_name=None, # output_graph=output_file, # clear_devices=False, # variable_names_blacklist=variables_blacklist, # initializer_nodes='') #output_graph_path = "output_graph.pb" #do_graph_freeze(output_file=output_graph_path, output_node_names='logits,initialize_state', variables_blacklist='previous_state_c,previous_state_h') except Exception as e: log_error(str(e)) traceback.print_exc() # Calling all hook's end() methods to end blocking calls for hook in hooks: hook.end(session) # Only chief has a SyncReplicasOptimizer queue runner that needs to be stopped for unblocking process exit. # A rather graceful way to do this is by stopping the ps. # Only one party can send it w/o failing. if Config.is_chief: send_token_to_ps(session, kill=True) sys.exit(1) log_debug('Session closed.') except tf.errors.InvalidArgumentError as e: log_error(str(e)) log_error( 'The checkpoint in {0} does not match the shapes of the model.' ' Did you change alphabet.txt or the --n_hidden parameter' ' between train runs using the same checkpoint dir? Try moving' ' or removing the contents of {0}.'.format(FLAGS.checkpoint_dir)) sys.exit(1) # Stopping the coordinator coord.stop()
# 每次迭代同时加载的个数 batch_size = 100 best_accuracy = 0.0 # 损失函数 criterion = nn.CrossEntropyLoss() model = TxtModel(n_dim, 2).double() model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #exp_lr_scheduler = lr_scheduler.StepLR(optimizer,step_size = 25, gamma= 0.1) pre = preprocess() train_vec_data, test_vec_data, train_label, test_label = pre.get_data() train_dataset = TxtDataset(train_vec_data, train_label) test_dataset = TxtDataset(test_vec_data, test_label) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) for epoch in range(epochs): model.train()