def get_conversation_turn_stats(folders, bucket_size=30, max_turns=3000):
    """Print turn-count statistics for the conversations in the given folders.

    Every file of every folder below the dialogs root is handed to
    ``get_num_turns_in_file``. The returned turn counts are collected in a
    histogram and summarised on stdout: the mode, the average, the full
    histogram and a few threshold counters.

    Args:
        folders: Iterable of folder names located below the dialogs root.
        bucket_size: Forwarded unchanged to ``get_num_turns_in_file``.
        max_turns: Size of the histogram. Conversations with ``max_turns``
            turns or more are only counted as "exceeded".
    """
    start_time = time.time()
    number_of_files_read = 0

    # turns[i] holds the number of conversations with exactly i turns
    turns = [0] * max_turns
    exceeded_max_turns = 0
    total_turns = 0.0

    # Mutually exclusive buckets: a conversation is only counted for the
    # highest threshold it exceeds.
    more_than_500 = 0
    more_than_1000 = 0
    more_than_2000 = 0

    for folder in folders:
        folder_path = "../../ubuntu-ranking-dataset-creator/src/dialogs/" + folder
        for filename in os.listdir(folder_path):
            number_of_files_read += 1
            file_path = folder_path + "/" + filename
            num_turns = get_num_turns_in_file(file_path, bucket_size, max_turns)

            if num_turns > 2000:
                more_than_2000 += 1
            elif num_turns > 1000:
                more_than_1000 += 1
            elif num_turns > 500:
                more_than_500 += 1

            if num_turns < max_turns:
                turns[num_turns] += 1
                total_turns += num_turns
            else:
                exceeded_max_turns += 1
        print("Done with folder: " + str(folder) + ", read " + str(number_of_files_read) + " conversations")

    print("Number of files read: " + str(number_of_files_read))

    # Find the mode; indices 0 and 1 are deliberately skipped by the search.
    max_occ = 0
    max_index = 0
    for i in range(2, max_turns):
        if turns[i] > max_occ:
            max_occ = turns[i]
            max_index = i
    print("Mode is " + str(max_index) + " with " + str(max_occ) + " occurrences")

    # Guard against an empty data set, which used to raise ZeroDivisionError.
    # NOTE: total_turns only includes conversations below max_turns, while the
    # divisor counts every file that was read.
    if number_of_files_read:
        average_turns = total_turns / number_of_files_read
    else:
        average_turns = 0.0
    print("Average turns per conversation is " + str(average_turns))

    for occurrence in turns:
        print(occurrence)

    print("Exceeded max turns " + str(exceeded_max_turns))
    print("More than 2000 " + str(more_than_2000))
    print("More than 1000 " + str(more_than_1000))
    print("More than 500 " + str(more_than_500))
    print(get_time(start_time))
def read_every_data_file_and_create_initial_files(folders, initial_x_file_path, initial_y_file_path):
    """Run ``preprocess_training_file`` on every file of the given dialog folders.

    Args:
        folders: Iterable of folder names located below the dialogs root.
        initial_x_file_path: Forwarded to ``preprocess_training_file``.
        initial_y_file_path: Forwarded to ``preprocess_training_file``.

    Progress is printed after each folder, followed by the total number of
    files and the value returned by ``get_time`` for the whole run.
    """
    started_at = time.time()
    files_seen = 0

    for folder in folders:
        directory = "../../ubuntu-ranking-dataset-creator/src/dialogs/" + folder
        for entry in os.listdir(directory):
            files_seen += 1
            preprocess_training_file("/".join((directory, entry)), initial_x_file_path, initial_y_file_path)
        # The counter is cumulative over all folders processed so far.
        print("Done with folder: %s, read %s files" % (str(folder), str(files_seen)))

    print("Number of files read: %s" % str(files_seen))
    print(get_time(started_at))
def create_fast_text_model(folder, merged_spellcheck_path):
    """Create a fastText skip-gram model and report how long it took.

    Args:
        folder: Folder name; the output path handed to ``fasttext.skipgram``
            is ``./<folder>/model``.
        merged_spellcheck_path: Path passed to ``fasttext.skipgram`` as its
            first argument.

    Returns:
        Whatever ``fasttext.skipgram`` returns.
    """
    start_time_fasttext = time.time()
    path = './' + str(folder) + '/model'
    model = fasttext.skipgram(merged_spellcheck_path, path)
    # Build a single string: with the Python 2 print statement used in this
    # file, print("a", b) would print the repr of a tuple instead of a message.
    print("Time used to create Fasttext model: " + str(get_time(start_time_fasttext)))
    return model
def get_most_similar_words_for_unk(unknown_words, vocab_words, unknown_dict_pickle_path, unk_to_vocab_txt_path,
                                   save_freq):
    """Map every unknown word to the vocabulary word with the smallest distance.

    Results of an earlier run are loaded from ``unknown_dict_pickle_path``
    when that file exists, and only words without a stored result are
    computed. The result dictionary is checkpointed to the pickle file while
    running and written to both output files at the end.

    Args:
        unknown_words: Dict mapping each unknown word to the value passed as
            first argument to ``distance``.
        vocab_words: Dict mapping each vocabulary word to a sequence whose
            first two items are passed as second and third argument to
            ``distance``.
        unknown_dict_pickle_path: Pickle file used for loading and saving the
            result dictionary.
        unk_to_vocab_txt_path: Path passed to ``save_dict_to_file`` together
            with the final result.
        save_freq: Number of processed words between two checkpoints. A falsy
            value (``0`` / ``None``) disables the intermediate checkpoints.

    Returns:
        Dict of ``unknown word -> most similar vocabulary word``. The value is
        the empty string when no vocabulary word has a distance below 1.
    """
    # The resulting dictionary consisting of 'unk_word' : 'most similar vocab word'
    unknown_words_results = {}

    # If pickle file exists, continue from the previously saved results
    if os.path.exists(unknown_dict_pickle_path):
        unknown_words_results = load_pickle_file(unknown_dict_pickle_path)

    # Create lists for faster computation; words that already have a result
    # are skipped.
    known_words_list = [(key, value[0], value[1]) for key, value in vocab_words.items()]
    unknown_words_list = [(key, value) for key, value in unknown_words.items()
                          if key not in unknown_words_results]

    counter = 1
    start_time_unk = time.time()

    # Loop all unknown_words
    for unk_key, unk_values in unknown_words_list:
        if counter % 5000 == 0:
            print(" Calculated " + str(counter) + " unknown words")

        # Only distances strictly below 1 are accepted as a match.
        min_dist = 1
        word = ""

        # Loop all vocab words for calculating the distance
        for key, value, dis in known_words_list:
            cur_dist = distance(unk_values, value, dis)
            # Save the word that is most similar
            if cur_dist < min_dist:
                min_dist = cur_dist
                word = key

        # Save most similar vocab_word to the unk_word
        unknown_words_results[unk_key] = word
        counter += 1

        # Once in a while, save checkpoints. The truthiness check avoids a
        # ZeroDivisionError when save_freq is 0.
        if save_freq and counter % save_freq == 0:
            save_to_pickle(unknown_dict_pickle_path, unknown_words_results)
            print(" Saved temporarily unknown_words_dictionary")

    # Build a single string: with the Python 2 print statement used in this
    # file, print("a", b) would print the repr of a tuple instead of a message.
    print("Time to get similar words for all UNK: " + str(get_time(start_time_unk)))

    save_to_pickle(unknown_dict_pickle_path, unknown_words_results)
    save_dict_to_file(unk_to_vocab_txt_path, unknown_words_results)

    return unknown_words_results
def read_every_data_file_and_create_initial_files(initial_x_file_path, initial_y_file_path, misspelled_vocabulary):
    """Run ``preprocess_training_file`` on every ``*raw.txt`` file of the data folder.

    Args:
        initial_x_file_path: Forwarded to ``preprocess_training_file``.
        initial_y_file_path: Forwarded to ``preprocess_training_file``.
        misspelled_vocabulary: Forwarded to ``preprocess_training_file``.

    Files whose name does not end in ``raw.txt`` are reported and skipped.
    The value returned by ``preprocess_training_file`` is added to the
    running sentence count.
    """
    started_at = time.time()
    files_done = 0
    sentences_done = 0

    # NOTE(review): the directory that is listed is hard-coded, while the
    # files are opened relative to paths['source_folder_root'] -- confirm that
    # both refer to the same location.
    for filename in os.listdir("../../opensubtitles-parser/data"):
        if not filename.endswith('raw.txt'):
            print(filename + " is not preprocessed")
            continue

        source_file = paths['source_folder_root'] + filename
        files_done += 1
        sentences_done += preprocess_training_file(source_file, initial_x_file_path, initial_y_file_path,
                                                   misspelled_vocabulary)
        print("Done with filename: %s, read %s files, processed %s sentences"
              % (str(filename), str(files_done), str(sentences_done)))

    print("Number of files read: %s" % str(files_done))
    print(get_time(started_at))
def train():
    """Train the bucketed model on the preprocessed training file.

    The function shuffles the training file, streams training and development
    lines through TensorFlow line readers and runs training steps until
    ``FLAGS.max_train_steps`` is reached or the input queue is exhausted.
    Every ``FLAGS.steps_per_checkpoint`` steps it saves a checkpoint,
    evaluates every non-empty development bucket, writes the perplexities to
    TensorBoard and to the perplexity log, and keeps a separate checkpoint
    when the training perplexity is the lowest seen so far.

    NOTE(review): several statements of this function were truncated in the
    source. They are restored below and individually marked; please confirm
    each of them against the model class.
    """
    print("Checking for needed files")
    check_for_needed_files_and_create()

    train_path = paths['train_path']
    shuffle_file(train_path, train_path)

    print("Creating file queue")
    filename_queue = input_pipeline(root=paths['preprocess_root_files'], start_name=paths['train_file'])
    filename_queue_dev = input_pipeline(root=paths['preprocess_root_files'], start_name=paths['dev_file'])

    perplexity_log_path = os.path.join(FLAGS.train_dir, paths['perplexity_log'])
    if not os.path.exists(perplexity_log_path):
        with open(perplexity_log_path, 'w') as fileObject:
            # %s instead of %d: %d would truncate a fractional learning rate.
            fileObject.write("Learning_rate: %s \t Optimizer: %s \n" % (FLAGS.learning_rate, optimizer))
            fileObject.write("Step \tPerplexity \tBucket perplexity \n")

    # Avoid allocating all of the GPU memory
    config = get_session_configs()

    with tf.device(use_gpu):
        with tf.Session(config=config) as sess:
            # Create model.
            print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
            model = create_model(sess, False)

            # Stream data
            print("Setting up coordinator")
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            # This is for the training loop.
            train_set = [[] for _ in _buckets]
            dev_set = [[] for _ in _buckets]
            step_time, loss = 0.0, 0.0
            current_step = 0
            previous_losses = []
            read_line = 0
            reading_file_path = ""

            # Create log writer object
            print("Create log writer object")
            summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, graph=tf.get_default_graph())

            # NOTE(review): the two `.read(...)` calls were truncated in the
            # original source and are restored here -- confirm the queues.
            reader_train_data = tf.TextLineReader()  # skip_header_lines=int, number of lines to skip
            key, txt_row_train_data = reader_train_data.read(filename_queue)
            reader_dev_data = tf.TextLineReader()
            _, txt_row_dev_data = reader_dev_data.read(filename_queue_dev)

            lowest_perplexity = 20.0
            train_time = time.time()

            print("Starting training loop")
            try:
                while current_step < FLAGS.max_train_steps:  # not coord.should_stop():
                    if current_step % FLAGS.print_frequency == 0:
                        print("Step number: " + str(current_step))

                    read_line, reading_file_path = check_and_shuffle_file(key, sess, read_line, paths['train_path'])

                    # Get a batch
                    train_set, bucket_id = get_batch(txt_row_train_data, train_set, FLAGS.batch_size)

                    start_time = time.time()
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)

                    # Clean out trained bucket
                    train_set[bucket_id] = []

                    # Make a step
                    _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id,
                                                 False)

                    # Calculating variables
                    step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1

                    # Once in a while, we save checkpoint, print statistics, and run evals.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        check_time = time.time()
                        print(get_time(train_time, "to train"))

                        # Print statistics for the previous epoch.
                        dev_set, bucket_id = get_batch(txt_row_dev_data, dev_set, FLAGS.batch_size, ac_function=min)

                        perplexity = exp(float(loss)) if loss < 300 else float("inf")
                        print("global step %d learning rate %.4f step-time %.2f perplexity "
                              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))

                        # Decrease learning rate if no improvement was seen over last 3 times.
                        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                            # NOTE(review): the body of this branch was truncated in
                            # the original source; restored as a decay run -- confirm
                            # that the model exposes `learning_rate_decay_op`.
                            sess.run(model.learning_rate_decay_op)
                        previous_losses.append(loss)

                        # Save checkpoint and zero timer and loss.
                        print("Save checkpoint")
                        checkpoint_path = os.path.join(FLAGS.train_dir, "Ola.ckpt")
                        # NOTE(review): the start of this call was truncated in the
                        # original source -- confirm that the model exposes `saver`.
                        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                        step_time, loss = 0.0, 0.0

                        # Adding perplexity to tensorboard
                        perplexity_summary = tf.Summary()
                        overall_value = perplexity_summary.value.add()
                        overall_value.tag = "perplexity_overall"
                        overall_value.simple_value = perplexity

                        # Run evals on development set and print their perplexity.
                        print("Run evaluation on development set")
                        bucket_perplexity = ""
                        for bucket_id in range(len(_buckets)):
                            if len(dev_set[bucket_id]) == 0:
                                print(" eval: empty bucket %d" % bucket_id)
                                continue

                            encoder_inputs, decoder_inputs, target_weights = model.get_batch(dev_set, bucket_id)

                            # Clean out used bucket
                            del dev_set[bucket_id][:FLAGS.batch_size]

                            _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights,
                                                         bucket_id, True)
                            eval_ppx = exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                            print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                            bucket_perplexity += "\t" + str(eval_ppx)

                            # Adding bucket perplexity to tensorboard
                            bucket_value = perplexity_summary.value.add()
                            bucket_value.tag = "perplexity_bucket %d" % bucket_id
                            bucket_value.simple_value = eval_ppx

                        global_step_value = model.global_step.eval()
                        summary_writer.add_summary(perplexity_summary, global_step_value)

                        with open(perplexity_log_path, 'a') as fileObject:
                            # Log the evaluated step number; str(model.global_step)
                            # would write the repr of the variable object instead.
                            fileObject.write(str(global_step_value) + " \t" + str(perplexity)
                                             + bucket_perplexity + "\n")

                        # Save model if checkpoint was the best one
                        if perplexity < lowest_perplexity:  # and current_step > 400000:
                            lowest_perplexity = perplexity
                            checkpoint_path = os.path.join(FLAGS.train_dir, "Ola_best_.ckpt")
                            # NOTE(review): restored truncated call, see the note above.
                            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

                        sys.stdout.flush()
                        # Same call form as the "to train" timing above; the former
                        # print(a, b) printed a tuple under Python 2.
                        print(get_time(check_time, "to do checkpoint"))
                        train_time = time.time()

            except tf.errors.OutOfRangeError:
                print('Done training, epoch reached')
            finally:
                coord.request_stop()
                coord.join(threads)
def train():
    """Train the stateful model on the merged conversation files.

    Training and development lines are streamed through TensorFlow line
    readers. Each step refills the empty conversation slots, builds a batch
    with ``get_stateful_batch`` and runs one training step, carrying the
    encoder state over to the next step. Every ``FLAGS.steps_per_checkpoint``
    steps it saves a checkpoint, evaluates three consecutive development
    batches, writes the perplexities to TensorBoard and to the perplexity
    log, and keeps a separate checkpoint when the training perplexity is the
    lowest seen so far.

    NOTE(review): several statements of this function were truncated in the
    source. They are restored below and individually marked; please confirm
    each of them against the model class.
    """
    print("Checking for needed files")
    check_for_needed_files_and_create()

    print("Creating file queues")
    filename_queue = input_pipeline(root=paths['stateful_datafiles'], start_name="merged_train", shuffle=False)
    filename_queue_dev = input_pipeline(root=paths['stateful_datafiles'], start_name="merged_dev", shuffle=False)

    perplexity_log_path = os.path.join(FLAGS.train_dir, paths['perplexity_log'])
    if not os.path.exists(perplexity_log_path):
        with open(perplexity_log_path, 'w') as fileObject:
            # %s instead of %d: %d would truncate a fractional learning rate.
            fileObject.write(
                "Learning_rate: %s \t Optimizer: %s \t Lstm %s \n" % (FLAGS.learning_rate, optimizer, FLAGS.use_lstm))
            fileObject.write("Step \tPerplexity \tBucket perplexity \n")

    # Avoid allocating all of the GPU memory
    config = get_session_configs()

    with tf.device(use_gpu):
        with tf.Session(config=config) as sess:
            # Create model.
            print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
            model = create_model(sess, False)

            # Stream data
            print("Setting up coordinator")
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            # This is for the training loop.
            step_time, loss = 0.0, 0.0
            current_step = 0
            train_set = [[] for _ in range(batch_size)]
            dev_set = [[] for _ in range(batch_size)]
            previous_losses = []
            read_line = 0
            read_line_dev = 0
            reading_file_path = paths['merged_train_stateful_path_file1']
            reading_dev_file_path = paths['merged_dev_stateful_path']

            # Create log writer object
            print("Create log writer object")
            summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, graph=tf.get_default_graph())

            key, txt_row_train_data = tf.TextLineReader().read(filename_queue)
            key_dev, txt_row_dev_data = tf.TextLineReader().read(filename_queue_dev)

            lowest_perplexity = 20.0
            train_time = time.time()

            # Need an initial state for the encoder rnn
            if FLAGS.use_lstm:
                initial_state = np.zeros((num_layers, 2, batch_size, size))
            else:
                initial_state = np.zeros((num_layers, batch_size, size))
            state = initial_state
            dev_state = initial_state

            print("Starts training loop")
            try:
                while FLAGS.max_train_steps >= current_step:  # not coord.should_stop():
                    if current_step % FLAGS.print_frequency == 0:
                        print("Step number" + str(current_step))

                    # Get a batch
                    # Find empty holders in training set
                    empty_conversations = [index for index, conversation in enumerate(train_set)
                                           if conversation == []]
                    if empty_conversations != []:
                        # NOTE(review): `sess.run(` was truncated in the original
                        # source and is restored here.
                        init_key, init_line = sess.run([key, txt_row_train_data])
                        read_line, reading_file_path = check_and_shuffle_file(init_key, sess, read_line,
                                                                              reading_file_path, stateful=True)
                    train_set, batch_train_set, state = get_stateful_batch(txt_row_train_data, train_set,
                                                                           empty_conversations, init_line, state,
                                                                           size, FLAGS.use_lstm)

                    start_time = time.time()
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(batch_train_set)

                    # Make a step
                    _, step_loss, _, state = model.step(sess, encoder_inputs, decoder_inputs, target_weights, state,
                                                        False)

                    # Calculating variables
                    step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1

                    # Once in a while, we save checkpoint, print statistics, and run evals.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        check_time = time.time()
                        # Same call form as the checkpoint timing below; the former
                        # print(a, b) printed a tuple under Python 2.
                        print(get_time(train_time, "to train"))

                        # Fetch the development batch for the first evaluation step.
                        empty_dev_conversations = [index for index, conversation in enumerate(dev_set)
                                                   if conversation == []]
                        if empty_dev_conversations != []:
                            # NOTE(review): restored truncated `sess.run(` call.
                            init_key_dev, init_line_dev = sess.run([key_dev, txt_row_dev_data])
                            read_line_dev, reading_dev_file_path = check_and_shuffle_file(
                                init_key_dev, sess, read_line_dev, reading_dev_file_path, stateful=True, dev=True)
                        dev_set, batch_dev_set, dev_state = get_stateful_batch(txt_row_dev_data, dev_set,
                                                                               empty_dev_conversations,
                                                                               init_line_dev, dev_state, size,
                                                                               FLAGS.use_lstm)

                        # Print statistics for the previous epoch.
                        perplexity = exp(float(loss)) if loss < 300 else float("inf")
                        print("global step %d learning rate %.4f step-time %.2f perplexity "
                              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))

                        # Decrease learning rate if no improvement was seen over last 3 times.
                        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                            # NOTE(review): the body of this branch was truncated in
                            # the original source; restored as a decay run -- confirm
                            # that the model exposes `learning_rate_decay_op`.
                            sess.run(model.learning_rate_decay_op)
                        previous_losses.append(loss)

                        # Save checkpoint and zero timer and loss.
                        print("Save checkpoint")
                        checkpoint_path = os.path.join(FLAGS.train_dir, "Vinyals.ckpt")
                        # NOTE(review): the start of this call was truncated in the
                        # original source -- confirm that the model exposes `saver`.
                        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                        step_time, loss = 0.0, 0.0

                        # Adding perplexity to tensorboard
                        perplexity_summary = tf.Summary()
                        overall_value = perplexity_summary.value.add()
                        overall_value.tag = "perplexity_overall"
                        overall_value.simple_value = perplexity

                        # Run evals on development set and print their perplexity.
                        print("Run evaluation on development set")
                        step_perplexity = ""

                        # Run eval on three steps. The first step uses the batch
                        # fetched above; the later steps fetch a fresh one.
                        for eval_step in range(1, 4):
                            if eval_step > 1:
                                empty_dev_conversations = [index for index, conversation in enumerate(dev_set)
                                                           if conversation == []]
                                if empty_dev_conversations != []:
                                    # NOTE(review): restored truncated `sess.run(` call.
                                    init_key_dev, init_line_dev = sess.run([key_dev, txt_row_dev_data])
                                    read_line_dev, reading_dev_file_path = check_and_shuffle_file(
                                        init_key_dev, sess, read_line_dev, reading_dev_file_path, stateful=True,
                                        dev=True)
                                dev_set, batch_dev_set, dev_state = get_stateful_batch(txt_row_dev_data, dev_set,
                                                                                       empty_dev_conversations,
                                                                                       init_line_dev, dev_state,
                                                                                       size, FLAGS.use_lstm)

                            encoder_inputs, decoder_inputs, target_weights = model.get_batch(batch_dev_set)
                            _, eval_loss, _, dev_state = model.step(sess, encoder_inputs, decoder_inputs,
                                                                    target_weights, dev_state, True)
                            eval_ppx = exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                            print(" eval: step %d perplexity %.2f" % (eval_step, eval_ppx))
                            step_perplexity += "\t" + str(eval_ppx)

                            # Adding step perplexity to tensorboard
                            step_value = perplexity_summary.value.add()
                            step_value.tag = "perplexity_step %d" % eval_step
                            step_value.simple_value = eval_ppx

                        global_step_value = model.global_step.eval()
                        summary_writer.add_summary(perplexity_summary, global_step_value)

                        with open(perplexity_log_path, 'a') as fileObject:
                            # Log the evaluated step number; str(model.global_step)
                            # would write the repr of the variable object instead.
                            fileObject.write(str(global_step_value) + " \t" + str(perplexity)
                                             + step_perplexity + "\n")

                        # Save model if checkpoint was the best one
                        if perplexity < lowest_perplexity:
                            lowest_perplexity = perplexity
                            checkpoint_path = os.path.join(FLAGS.train_dir, "Vinyals_stateful_best_.ckpt")
                            # NOTE(review): restored truncated call, see the note above.
                            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

                        sys.stdout.flush()
                        # The result of get_time used to be discarded here.
                        print(get_time(check_time, "to do checkpoint"))
                        train_time = time.time()

            except tf.errors.OutOfRangeError:
                print('Done training, epoch reached')
            finally:
                coord.request_stop()
                coord.join(threads)
def get_conversation_stats_for_context(folders, fit_1, fit_2, fit_3, fit_4, fit_5, fit_6):
    """Print how many conversations fit each of six maximum lengths.

    Every file of every folder below the dialogs root is checked with
    ``non_turns_exceed_max_turns_in_conv``, which reports the number of turns
    and one flag per limit. For each limit the number of fitting
    conversations and their summed turns are printed, followed by the paths
    of the first 40 files that fit ``fit_1``.

    Args:
        folders: Iterable of folder names located below the dialogs root.
        fit_1 .. fit_6: Limits forwarded to
            ``non_turns_exceed_max_turns_in_conv`` and echoed in the report.
    """
    started_at = time.time()
    files_checked = 0

    limits = (fit_1, fit_2, fit_3, fit_4, fit_5, fit_6)
    # One slot per limit, in the same order as `limits`.
    conversation_totals = [0] * len(limits)
    turn_totals = [0] * len(limits)
    sample_files = []

    for folder in folders:
        directory = "../../ubuntu-ranking-dataset-creator/src/dialogs/" + folder
        for entry in os.listdir(directory):
            files_checked += 1
            file_path = directory + "/" + entry

            num_turns, ok_1, ok_2, ok_3, ok_4, ok_5, ok_6 = non_turns_exceed_max_turns_in_conv(
                file_path, fit_1, fit_2, fit_3, fit_4, fit_5, fit_6)

            for slot, fits in enumerate((ok_1, ok_2, ok_3, ok_4, ok_5, ok_6)):
                if fits:
                    conversation_totals[slot] += 1
                    turn_totals[slot] += num_turns

            # Remember the first 40 files that fit the first limit.
            if ok_1 and len(sample_files) < 40:
                sample_files.append(file_path)

        print("Done with folder: %s, read %s files" % (str(folder), str(files_checked)))

    print("Number of files read: %s" % str(files_checked))

    for limit, conversations, total_turns in zip(limits, conversation_totals, turn_totals):
        print(str(conversations) + " conversations fits with max len " + str(limit) + ". Has "
              + str(total_turns) + " turns")

    for sample_path in sample_files:
        print(sample_path)

    print(get_time(started_at))