Example #1
def get_conversation_turn_stats(folders, bucket_size=30, max_turns=3000):
    start_time = time.time()
    number_of_files_read = 0
    # turns[i] holds the number of conversations with exactly i turns
    turns = [0] * max_turns
    exceeded_max_turns = 0
    total_turns = 0.0
    longest_conversation = 0

    more_than_500 = 0
    more_than_1000 = 0
    more_than_2000 = 0

    for folder in folders:
        folder_path = "../../ubuntu-ranking-dataset-creator/src/dialogs/" + folder
        for filename in os.listdir(folder_path):
            number_of_files_read += 1
            file_path = folder_path + "/" + filename
            num_turns = get_num_turns_in_file(file_path, bucket_size,
                                              max_turns)
            if num_turns > 2000:
                more_than_2000 += 1
            elif num_turns > 1000:
                more_than_1000 += 1
            elif num_turns > 500:
                more_than_500 += 1

            if num_turns < max_turns:
                turns[num_turns] += 1
                total_turns += num_turns
            else:
                exceeded_max_turns += 1
        print("Done with folder: " + str(folder) + ", read " +
              str(number_of_files_read) + " conversations")
    print "Number of files read: " + str(number_of_files_read)

    # Find stats
    max_occ = 0
    max_index = 0
    for i in range(2, max_turns):
        if turns[i] > max_occ:
            max_occ = turns[i]
            max_index = i
    print("Mode is " + str(max_index) + " with " + str(max_occ) +
          " occurrences")

    print("Average turns per conversation is " +
          str(total_turns / number_of_files_read))
    #print("Average turns per conversation without two turns is " + str((total_turns-2*turns[2])/(number_of_files_read-turns[2])))

    for occurrence in turns:
        print(occurrence)
    print("Exceeded max turns " + str(exceeded_max_turns))
    print("More than 2000 " + str(more_than_2000))
    print("More than 1000 " + str(more_than_1000))
    print("More than 500 " + str(more_than_500))
    print(get_time(start_time))
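
A minimal usage sketch for the statistics helper above; the folder names are hypothetical placeholders for the numbered subdirectories under ../../ubuntu-ranking-dataset-creator/src/dialogs/:

if __name__ == '__main__':
    # Hypothetical invocation; "30", "31", "32" stand in for real dialog subfolders.
    get_conversation_turn_stats(["30", "31", "32"], bucket_size=30, max_turns=3000)
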
Example #2
def read_every_data_file_and_create_initial_files(folders, initial_x_file_path,
                                                  initial_y_file_path):
    start_time = time.time()
    number_of_files_read = 0
    for folder in folders:
        folder_path = "../../ubuntu-ranking-dataset-creator/src/dialogs/" + folder
        for filename in os.listdir(folder_path):
            number_of_files_read += 1
            file_path = folder_path + "/" + filename
            preprocess_training_file(file_path, initial_x_file_path,
                                     initial_y_file_path)
        print("Done with folder: " + str(folder) + ", read " +
              str(number_of_files_read) + " files")

    print "Number of files read: " + str(number_of_files_read)
    print get_time(start_time)
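
A hypothetical call for the preprocessing driver above; the folder names and output paths are placeholders, not values from the original project:

if __name__ == '__main__':
    read_every_data_file_and_create_initial_files(
        folders=["30", "31"],                    # placeholder dialog subfolders
        initial_x_file_path="initial_x.txt",     # placeholder output files
        initial_y_file_path="initial_y.txt")
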
Example #3
def create_fast_text_model(folder, merged_spellcheck_path):
    start_time_fasttext = time.time()
    path = './' + str(folder) + '/model'
    model = fasttext.skipgram(merged_spellcheck_path, path)
    print("Time used to create Fasttext model: ",
          get_time(start_time_fasttext))
    return model
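
A short usage sketch for the model returned above, assuming the legacy fasttext Python package whose skipgram() call is used in the example; the folder, corpus path, and query word are placeholders:

if __name__ == '__main__':
    # Placeholder folder and corpus path; attribute access follows the legacy fasttext API.
    model = create_fast_text_model("datafiles", "merged_spellcheck.txt")
    print(len(model.words))   # size of the learned vocabulary
    print(model["ubuntu"])    # embedding vector for one word
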
Example #4
def get_most_similar_words_for_unk(unknown_words, vocab_words,
                                   unknown_dict_pickle_path,
                                   unk_to_vocab_txt_path, save_freq):

    # The resulting dictionary consisting of 'unk_word' : 'most similar vocab word'
    unknown_words_results = {}
    # If a dictionary was previously saved, this one collects the words that have NOT yet had a similar word computed
    new_unk_words_dict = {}

    # If pickle file exists, load into unknown_words_results
    if os.path.exists(unknown_dict_pickle_path):
        unknown_words_results = load_pickle_file(unknown_dict_pickle_path)
        for key, value in unknown_words.iteritems():
            if key not in unknown_words_results:
                # If the word is not computed, add to new_unk_words_dict so it can be computed later
                new_unk_words_dict[key] = value

        # Set unknown_words to the words that have not been computed yet
        unknown_words = new_unk_words_dict

    # Create lists for faster computation
    known_words_list = [(key, value[0], value[1])
                        for key, value in vocab_words.iteritems()]
    unknown_words_list = [(key, value)
                          for key, value in unknown_words.iteritems()]

    counter = 1
    start_time_unk = time.time()
    # Loop over all unknown words
    for unk_key, unk_values in unknown_words_list:
        min_dist = 1
        word = ""
        if (counter % 5000) == 0:
            print("     Calculated " + str(counter) + " unknown words")
        # Loop over all vocab words to calculate the distance
        for key, value, dis in known_words_list:
            cur_dist = distance(unk_values, value, dis)
            # Save the word that is most similar
            if cur_dist < min_dist:
                min_dist = cur_dist
                word = key
        # Save most similar vocab_word to the unk_word
        unknown_words_results[unk_key] = word
        counter += 1

        # Once in a while, save checkpoints
        if counter % save_freq == 0:
            save_to_pickle(unknown_dict_pickle_path, unknown_words_results)
            print("   Saved temporarily unknown_words_dictionary")
    print("Time to get similar words for all UNK:", get_time(start_time_unk))
    save_to_pickle(unknown_dict_pickle_path, unknown_words_results)
    save_dict_to_file(unk_to_vocab_txt_path, unknown_words_results)

    return unknown_words_results
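
The distance() helper called in the inner loop above is not shown in this example. A plausible sketch, under the assumption that each vocab entry stores a (vector, precomputed norm) pair and the comparison is cosine distance, which would also explain why the caller starts with min_dist = 1:

import numpy as np

def distance(unk_vector, vocab_vector, vocab_norm):
    # Hypothetical reconstruction: cosine distance between an unknown word's vector
    # and a vocab word's vector, reusing the vocab word's precomputed norm.
    unk_vector = np.asarray(unk_vector, dtype=float)
    vocab_vector = np.asarray(vocab_vector, dtype=float)
    denominator = np.linalg.norm(unk_vector) * vocab_norm
    if denominator == 0:
        return 1.0  # treat degenerate vectors as "no match"
    return 1.0 - np.dot(unk_vector, vocab_vector) / denominator
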
Example #5
def read_every_data_file_and_create_initial_files(initial_x_file_path,
                                                  initial_y_file_path,
                                                  misspelled_vocabulary):
    start_time = time.time()
    number_of_files_read = 0
    number_of_lines = 0
    folder_path = "../../opensubtitles-parser/data"
    for filename in os.listdir(folder_path):
        if filename.endswith('raw.txt'):
            filename_path = paths['source_folder_root'] + filename
            number_of_files_read += 1
            num_sentences = preprocess_training_file(filename_path,
                                                     initial_x_file_path,
                                                     initial_y_file_path,
                                                     misspelled_vocabulary)
            number_of_lines += num_sentences
            print("Done with filename: " + str(filename) + ", read " +
                  str(number_of_files_read) + " files, processed " +
                  str(number_of_lines) + " sentences")
        else:
            print(filename + " is not preprocessed")

    print "Number of files read: " + str(number_of_files_read)
    print get_time(start_time)
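
A hypothetical call for the OpenSubtitles preprocessing driver above; the output paths and the misspelling map are illustrative only:

if __name__ == '__main__':
    read_every_data_file_and_create_initial_files(
        initial_x_file_path="initial_x.txt",       # placeholder output files
        initial_y_file_path="initial_y.txt",
        misspelled_vocabulary={"teh": "the"})      # placeholder misspelling -> correction map
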
Example #6
def train():
    """Train a en->fr translation model using WMT data."""

    print("Checking for needed files")
    check_for_needed_files_and_create()
    train_path = paths['train_path']
    shuffle_file(train_path, train_path)

    print("Creating file queue")
    filename_queue = input_pipeline(root=paths['preprocess_root_files'], start_name=paths['train_file'])
    filename_queue_dev = input_pipeline(root=paths['preprocess_root_files'], start_name=paths['dev_file'])

    perplexity_log_path = os.path.join(FLAGS.train_dir, paths['perplexity_log'])

    if not os.path.exists(perplexity_log_path):
        with open(perplexity_log_path, 'w') as fileObject:
            fileObject.write("Learning_rate: %d \t Optimizer: %s \n" % (FLAGS.learning_rate, optimizer))
            fileObject.write("Step \tPerplexity \tBucket perplexity \n")

    # Avoid allocating all of the GPU memory
    config = get_session_configs()
    with tf.device(use_gpu):
        with tf.Session(config=config) as sess:
            # Create model.
            print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
            model = create_model(sess, False)

            # Stream data
            print("Setting up coordinator")
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            # This is for the training loop.
            train_set = [[] for _ in _buckets]
            dev_set = [[] for _ in _buckets]
            step_time, loss = 0.0, 0.0
            current_step = 0
            previous_losses = []
            read_line = 0
            reading_file_path = ""

            # Create log writer object
            print("Create log writer object")
            summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, graph=tf.get_default_graph())

            reader_train_data = tf.TextLineReader()  # skip_header_lines=int, number of lines to skip
            key, txt_row_train_data = reader_train_data.read(filename_queue)

            reader_dev_data = tf.TextLineReader()
            _, txt_row_dev_data = reader_dev_data.read(filename_queue_dev)

            lowest_perplexity = 20.0

            train_time = time.time()

            print("Starting training loop")
            try:
                while current_step < FLAGS.max_train_steps:  # not coord.should_stop():
                    if current_step % FLAGS.print_frequency == 0:
                        print("Step number: " + str(current_step))

                    read_line, reading_file_path = check_and_shuffle_file(key, sess, read_line, paths['train_path'])

                    # Get a batch
                    train_set, bucket_id = get_batch(txt_row_train_data, train_set, FLAGS.batch_size)
                    start_time = time.time()
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)

                    # Clean out trained bucket
                    train_set[bucket_id] = []

                    # Make a step
                    _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)

                    # Calculating variables
                    step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1

                    # Once in a while, we save checkpoint, print statistics, and run evals.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        check_time = time.time()
                        print(get_time(train_time, "to train"))
                        # Print statistics for the previous epoch.
                        dev_set, bucket_id = get_batch(txt_row_dev_data, dev_set, FLAGS.batch_size, ac_function=min)

                        perplexity = exp(float(loss)) if loss < 300 else float("inf")
                        print("global step %d learning rate %.4f step-time %.2f perplexity "
                              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))

                        # Decrease learning rate if no improvement was seen over last 3 times.
                        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                            sess.run(model.learning_rate_decay_op)
                        previous_losses.append(loss)

                        # Save checkpoint and zero timer and loss.
                        print("Save checkpoint")
                        checkpoint_path = os.path.join(FLAGS.train_dir, "Ola.ckpt")
                        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                        step_time, loss = 0.0, 0.0

                        # Adding perplexity to tensorboard
                        perplexity_summary = tf.Summary()
                        overall_value = perplexity_summary.value.add()
                        overall_value.tag = "perplexity_overall"
                        overall_value.simple_value = perplexity

                        # Run evals on development set and print their perplexity.
                        print("Run evaluation on development set")
                        bucket_perplexity = ""
                        for bucket_id in xrange(len(_buckets)):
                            if len(dev_set[bucket_id]) == 0:
                                print("  eval: empty bucket %d" % bucket_id)
                                continue
                            encoder_inputs, decoder_inputs, target_weights = model.get_batch(dev_set, bucket_id)

                            # Clean out used bucket
                            del dev_set[bucket_id][:FLAGS.batch_size]

                            _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
                            eval_ppx = exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                            print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))

                            bucket_perplexity += "\t" + str(eval_ppx)

                            # Adding bucket perplexity to tensorboard
                            bucket_value = perplexity_summary.value.add()
                            bucket_value.tag = "perplexity_bucket %d" % bucket_id
                            bucket_value.simple_value = eval_ppx
                        summary_writer.add_summary(perplexity_summary, model.global_step.eval())

                        with open(os.path.join(FLAGS.train_dir, paths['perplexity_log']), 'a') as fileObject:
                            fileObject.write(str(model.global_step.eval()) + " \t" + str(perplexity) + bucket_perplexity + "\n")
                        # Save model if checkpoint was the best one
                        if perplexity < lowest_perplexity:  # and current_step > 400000:
                            lowest_perplexity = perplexity
                            checkpoint_path = os.path.join(FLAGS.train_dir, "Ola_best_.ckpt")
                            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

                        sys.stdout.flush()
                        print(get_time(check_time), "to do checkpoint")
                        train_time = time.time()
            except tf.errors.OutOfRangeError:
                print('Done training, epoch reached')
            finally:
                coord.request_stop()
            coord.join(threads)
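
get_session_configs() is defined outside this example. A minimal sketch of what it might return, given the comment above about not allocating all of the GPU memory; the exact options are an assumption, not the project's actual helper:

import tensorflow as tf

def get_session_configs():
    # Hypothetical helper: let the GPU memory pool grow on demand instead of
    # grabbing it all up front, and fall back to CPU when an op lacks a GPU kernel.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    return config
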
Example #7
def train():
    """Train a en->fr translation model using WMT data."""

    print("Checking for needed files")
    check_for_needed_files_and_create()

    print("Creating file queues")

    filename_queue = input_pipeline(root=paths['stateful_datafiles'], start_name="merged_train", shuffle=False)

    filename_queue_dev = input_pipeline(root=paths['stateful_datafiles'], start_name="merged_dev", shuffle=False)

    perplexity_log_path = os.path.join(FLAGS.train_dir, paths['perplexity_log'])

    if not os.path.exists(perplexity_log_path):
        with open(perplexity_log_path, 'w') as fileObject:
            fileObject.write(
                "Learning_rate: %d \t Optimizer: %s \t Lstm %s \n" % (FLAGS.learning_rate, optimizer, FLAGS.use_lstm))
            fileObject.write("Step \tPerplexity \tBucket perplexity \n")

    # Avoid allocating all of the GPU memory
    config = get_session_configs()
    with tf.device(use_gpu):
        with tf.Session(config=config) as sess:
            # Create model.
            print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
            model = create_model(sess, False)

            # Stream data
            print("Setting up coordinator")
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            # This is for the training loop.
            step_time, loss = 0.0, 0.0
            current_step = 0
            train_set = [[] for _ in range(batch_size)]
            dev_set = [[] for _ in range(batch_size)]
            previous_losses = []
            read_line = 0
            read_line_dev = 0
            reading_file_path = paths['merged_train_stateful_path_file1']
            reading_dev_file_path = paths['merged_dev_stateful_path']

            # Create log writer object
            print("Create log writer object")
            summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, graph=tf.get_default_graph())

            key, txt_row_train_data = tf.TextLineReader().read(filename_queue)

            key_dev, txt_row_dev_data = tf.TextLineReader().read(filename_queue_dev)

            lowest_perplexity = 20.0

            train_time = time.time()

            # Need an initial state for the encoder rnn
            if FLAGS.use_lstm:
                initial_state = np.zeros((num_layers, 2, batch_size, size))
            else:
                initial_state = np.zeros((num_layers, batch_size, size))
            state = initial_state
            dev_state = initial_state

            print("Starts training loop")

            try:
                while FLAGS.max_train_steps >= current_step:  # not coord.should_stop():
                    if current_step % FLAGS.print_frequency == 0:
                        print("Step number" + str(current_step))

                    # Get a batch
                    # Find empty holders in training set
                    empty_conversations = [index for index, conversation in enumerate(train_set) if conversation == []]
                    if empty_conversations != []:
                        init_key, init_line = sess.run([key, txt_row_train_data])
                        read_line, reading_file_path = check_and_shuffle_file(init_key, sess, read_line, reading_file_path, stateful=True)
                    train_set, batch_train_set, state = get_stateful_batch(txt_row_train_data, train_set, empty_conversations, init_line, state, size, FLAGS.use_lstm)
                    start_time = time.time()
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(batch_train_set)

                    # Make a step
                    _, step_loss, _, state = model.step(sess, encoder_inputs, decoder_inputs, target_weights, state, False)

                    # Calculating variables
                    step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1

                    # Once in a while, we save checkpoint, print statistics, and run evals.
                    if current_step % FLAGS.steps_per_checkpoint == 0:

                        check_time = time.time()
                        print(get_time(train_time), "to train")

                        # Print statistics for the previous epoch.
                        empty_dev_conversations = [index for index, conversation in enumerate(dev_set) if
                                                   conversation == []]
                        if empty_dev_conversations != []:
                            init_key_dev, init_line_dev = sess.run([key_dev, txt_row_dev_data])
                            read_line_dev, reading_dev_file_path = check_and_shuffle_file(init_key_dev, sess, read_line_dev, reading_dev_file_path, stateful=True, dev=True)
                        dev_set, batch_dev_set, dev_state = get_stateful_batch(txt_row_dev_data, dev_set, empty_dev_conversations, init_line_dev, dev_state, size, FLAGS.use_lstm)

                        perplexity = exp(float(loss)) if loss < 300 else float("inf")
                        print("global step %d learning rate %.4f step-time %.2f perplexity "
                              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))

                        # Decrease learning rate if no improvement was seen over last 3 times.
                        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                            sess.run(model.learning_rate_decay_op)
                        previous_losses.append(loss)

                        # Save checkpoint and zero timer and loss.
                        print("Save checkpoint")
                        checkpoint_path = os.path.join(FLAGS.train_dir, "Vinyals.ckpt")
                        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                        step_time, loss = 0.0, 0.0

                        # Adding perplexity to tensorboard
                        perplexity_summary = tf.Summary()
                        overall_value = perplexity_summary.value.add()
                        overall_value.tag = "perplexity_overall"
                        overall_value.simple_value = perplexity

                        # Run evals on development set and print their perplexity.
                        print("Run evaluation on development set")
                        step_perplexity = ""
                        # Run eval on three steps

                        # 1
                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(batch_dev_set)

                        _, eval_loss, _, dev_state = model.step(sess, encoder_inputs, decoder_inputs, target_weights, dev_state, True)
                        eval_ppx = exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                        print("  eval: step %d perplexity %.2f" % (1.0, eval_ppx))

                        step_perplexity += "\t" + str(eval_ppx)

                        # Adding step perplexity to tensorboard
                        step_value = perplexity_summary.value.add()
                        step_value.tag = "perplexity_step %d" % 1.0
                        step_value.simple_value = eval_ppx

                        # 2
                        empty_dev_conversations = [index for index, conversation in enumerate(dev_set) if
                                                   conversation == []]
                        if empty_dev_conversations != []:
                            init_key_dev, init_line_dev = sess.run([key_dev, txt_row_dev_data])
                            read_line_dev, reading_dev_file_path = check_and_shuffle_file(init_key_dev, sess, read_line_dev, reading_dev_file_path, stateful=True, dev=True)
                        dev_set, batch_dev_set, dev_state = get_stateful_batch(txt_row_dev_data, dev_set, empty_dev_conversations, init_line_dev, dev_state, size, FLAGS.use_lstm)
                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(batch_dev_set)

                        _, eval_loss, _, dev_state = model.step(sess, encoder_inputs, decoder_inputs, target_weights, dev_state,
                                                        True)
                        eval_ppx = exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                        print("  eval: step %d perplexity %.2f" % (2.0, eval_ppx))

                        step_perplexity += "\t" + str(eval_ppx)

                        # Adding step perplexity to tensorboard
                        step_value = perplexity_summary.value.add()
                        step_value.tag = "perplexity_step %d" % 2.0
                        step_value.simple_value = eval_ppx

                        # 3
                        empty_dev_conversations = [index for index, conversation in enumerate(dev_set) if
                                                   conversation == []]
                        if empty_dev_conversations != []:
                            init_key_dev, init_line_dev = sess.run([key_dev, txt_row_dev_data])
                            read_line_dev, reading_dev_file_path = check_and_shuffle_file(init_key_dev, sess,
                                                                                          read_line_dev,
                                                                                          reading_dev_file_path,
                                                                                          stateful=True, dev=True)
                        dev_set, batch_dev_set, dev_state = get_stateful_batch(txt_row_dev_data, dev_set,
                                                                               empty_dev_conversations, init_line_dev,
                                                                               dev_state, size, FLAGS.use_lstm)

                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(batch_dev_set)

                        _, eval_loss, _, dev_state = model.step(sess, encoder_inputs, decoder_inputs, target_weights, dev_state,
                                                        True)
                        eval_ppx = exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                        print("  eval: step %d perplexity %.2f" % (3.0, eval_ppx))

                        step_perplexity += "\t" + str(eval_ppx)

                        # Adding step perplexity to tensorboard
                        step_value = perplexity_summary.value.add()
                        step_value.tag = "perplexity_step %d" % 3.0
                        step_value.simple_value = eval_ppx

                        summary_writer.add_summary(perplexity_summary, model.global_step.eval())

                        with open(os.path.join(FLAGS.train_dir, paths['perplexity_log']), 'a') as fileObject:
                            fileObject.write(str(model.global_step.eval()) + " \t" + str(perplexity) + step_perplexity + "\n")

                        # Save model if checkpoint was the best one
                        if perplexity < lowest_perplexity:
                            lowest_perplexity = perplexity
                            checkpoint_path = os.path.join(FLAGS.train_dir, "Vinyals_stateful_best_.ckpt")
                            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

                        sys.stdout.flush()
                        print(get_time(check_time, "to do checkpoint"))
                        train_time = time.time()
            except tf.errors.OutOfRangeError:
                print('Done training, epoch reached')
            finally:
                coord.request_stop()
            coord.join(threads)
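
input_pipeline() is also external to these train() examples. A sketch of a queue-based filename pipeline consistent with how it is used above; the file-matching convention (prefix glob) is an assumption:

import glob
import os
import tensorflow as tf

def input_pipeline(root, start_name, shuffle=True):
    # Hypothetical helper: queue every file under `root` whose name starts with
    # `start_name`, so a tf.TextLineReader can stream lines from the queue.
    filenames = sorted(glob.glob(os.path.join(root, start_name + "*")))
    return tf.train.string_input_producer(filenames, shuffle=shuffle)
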
Example #8
def get_conversation_stats_for_context(folders, fit_1, fit_2, fit_3, fit_4,
                                       fit_5, fit_6):
    start_time = time.time()
    number_of_files_checked = 0
    fits_1_conv = 0
    fits_1_turns = 0
    fits_2_conv = 0
    fits_2_turns = 0
    fits_3_conv = 0
    fits_3_turns = 0
    fits_4_conv = 0
    fits_4_turns = 0
    fits_5_conv = 0
    fits_5_turns = 0
    fits_6_conv = 0
    fits_6_turns = 0

    counter = 0
    nice_files = []
    for folder in folders:
        folder_path = "../../ubuntu-ranking-dataset-creator/src/dialogs/" + folder
        for filename in os.listdir(folder_path):
            number_of_files_checked += 1
            file_path = folder_path + "/" + filename
            num_turns, fit_1_bool, fit_2_bool, fit_3_bool, fit_4_bool, fit_5_bool, fit_6_bool = non_turns_exceed_max_turns_in_conv(
                file_path, fit_1, fit_2, fit_3, fit_4, fit_5, fit_6)
            if fit_1_bool:
                fits_1_conv += 1
                fits_1_turns += num_turns
                if counter < 40:
                    counter += 1
                    nice_files.append(file_path)
            if fit_2_bool:
                fits_2_conv += 1
                fits_2_turns += num_turns
            if fit_3_bool:
                fits_3_conv += 1
                fits_3_turns += num_turns
            if fit_4_bool:
                fits_4_conv += 1
                fits_4_turns += num_turns
            if fit_5_bool:
                fits_5_conv += 1
                fits_5_turns += num_turns
            if fit_6_bool:
                fits_6_conv += 1
                fits_6_turns += num_turns
        print("Done with folder: " + str(folder) + ", read " +
              str(number_of_files_checked) + " files")

    print "Number of files read: " + str(number_of_files_checked)
    print(str(fits_1_conv) + " conversations fit with max len " + str(fit_1) +
          " and have " + str(fits_1_turns) + " turns")
    print(str(fits_2_conv) + " conversations fit with max len " + str(fit_2) +
          " and have " + str(fits_2_turns) + " turns")
    print(str(fits_3_conv) + " conversations fit with max len " + str(fit_3) +
          " and have " + str(fits_3_turns) + " turns")
    print(str(fits_4_conv) + " conversations fit with max len " + str(fit_4) +
          " and have " + str(fits_4_turns) + " turns")
    print(str(fits_5_conv) + " conversations fit with max len " + str(fit_5) +
          " and have " + str(fits_5_turns) + " turns")
    print(str(fits_6_conv) + " conversations fit with max len " + str(fit_6) +
          " and have " + str(fits_6_turns) + " turns")
    for files in nice_files:
        print(files)
    print(get_time(start_time))
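
A hypothetical invocation of the context-statistics helper above; the folder names and the six max-length thresholds are placeholders:

if __name__ == '__main__':
    get_conversation_stats_for_context(
        folders=["30", "31"],              # placeholder dialog subfolders
        fit_1=10, fit_2=20, fit_3=30,      # placeholder max-turn-length thresholds
        fit_4=40, fit_5=50, fit_6=60)
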