# Example #1 (label: 0)
def save_pickle_data(filename, pickle_filename):
    """Tokenize every dialogue in *filename* and append it to a pickle file.

    Each dialogue record is ``(context, utterances, target_id)``; the context
    and utterances are tokenized with ``DataProcess.tokenize`` and dumped as
    one pickle record ``[tokenized_context, tokenized_utterances]`` per
    dialogue.  The ``target_id`` is intentionally not saved.

    Args:
        filename: path handed to ``DataProcess``; the dialogue source file.
        pickle_filename: destination file for the pickled records
            (one ``pickle.dump`` call per dialogue, so read it back with
            repeated ``pickle.load`` calls).
    """
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(
        data_process.input_file_path)

    index = 0
    with open(pickle_filename, 'wb') as f_handle:
        # Iterate the generator directly instead of the manual
        # `while True` / `next(it, None)` sentinel loop; keep the
        # explicit None check in case the iterator yields None to
        # signal exhaustion (matches the original stopping condition).
        for data in dialog_iter:
            if data is None:
                break

            # data -> (context, utterances, target_id); target_id unused.
            context, utterances, _target_id = data

            tokenized_context, _ = data_process.tokenize(context)
            tokenized_utterances, _ = data_process.tokenize(utterances)

            pickle.dump([tokenized_context, tokenized_utterances], f_handle)

            index += 1
            if index % 100 == 0:
                print(index)  # lightweight progress indicator

    print("%s data save complete!" % index)
# Example #2 (label: 0)
def make_valid_data(filename, write_file_name):
    """Write a human-readable validation dump of every dialogue.

    For each dialogue ``(speakers, context, utterances, target_id)`` the
    context is split on ``" __eot__ "`` into turns; each non-empty turn is
    written as ``<token_count>|<speaker> : <sentence>``, followed by the
    target answer, the average sentence length, and the turn count.
    Corpus-wide averages are appended at the end of the file.

    Args:
        filename: path handed to ``DataProcess``; the dialogue source file.
        write_file_name: destination text file (UTF-8).
    """
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(
        data_process.input_file_path)

    input_sum_turn = 0
    input_sum_sentence_len = 0
    with open(write_file_name, "w", encoding='utf-8') as f_handle:
        index = 0

        while True:
            # data -> (speakers, context, utterances, target_id)
            data = next(dialog_iter, None)
            if data is None:
                break
            # BUG FIX: the counter used to be incremented *before* the
            # exhaustion check, so the trailing corpus averages divided
            # by N+1 instead of the true dialogue count N.
            index += 1

            speakers, context, utterances, target_id = data
            context_sentence = context[0].split(" __eot__ ")

            f_handle.write("[%d]" % index + "\n")
            sum_sentence_len = 0
            tot_turn = 0

            for i, sentence in enumerate(context_sentence):
                # Skip empty turns *before* paying the tokenizer cost
                # (the original tokenized first, then skipped).
                if len(sentence) == 0:
                    continue
                sentence_len = len(nltk.word_tokenize(sentence))
                sentence_string = speakers[i] + " : " + sentence
                sentence_string = str(sentence_len) + "|" + sentence_string
                f_handle.write(sentence_string + "\n")

                sum_sentence_len += sentence_len
                tot_turn += 1

            # Guard: a context made entirely of empty turns would
            # otherwise raise ZeroDivisionError.
            avg_sentence_len = (sum_sentence_len / tot_turn) if tot_turn else 0.0
            sentence_answer = "Answer : " + utterances[target_id[0]] + "\n"
            f_handle.write(sentence_answer)
            f_handle.write("average sentence length : %.3f" %
                           avg_sentence_len + "\n")
            f_handle.write("total turn number : %d" % tot_turn + '\n')

            f_handle.write("-" * 200 + "\n")
            if index % 500 == 0:
                print(index, ":", "avg_sentence_len - %.3f" % avg_sentence_len,
                      "tot_turn - %d" % tot_turn)
            input_sum_turn += tot_turn
            input_sum_sentence_len += avg_sentence_len

        # Guard against an empty input file (index == 0) now that the
        # counter reflects the true number of dialogues.
        denom = index if index else 1
        f_handle.write("average sentence length %.3f" %
                       (input_sum_sentence_len / denom))
        f_handle.write("average turn length %.3f" % (input_sum_turn / denom))