def pre_process_dialouge(raw_dialogue_file_path, pre_processed_dir, movie_dir_name,
                         dialogue_file1, dialogue_file2,
                         train_file1, train_file2, test_file1, test_file2,
                         dev_file1, dev_file2, vocab_file1, vocab_file2,
                         max_dialouge_count, max_dialouge_count_train_test,
                         train_percentile, compare_count, symbol_seq, type_d):
    """Separate a raw dialogue corpus and split it into train/test/dev files.

    Steps, in order:
      1. ``delete_all_file_dir(pre_processed_dir, type_d)`` is called first.
      2. The separator matching ``movie_dir_name`` (Cornell movie, open
         subtitles or movie subtitles) is run on ``raw_dialogue_file_path``
         and yields the two dialogue file paths.
      3. ``train_test_split`` is run on those two dialogue files.
      4. The elapsed time is reported through ``time_lib.elapsed_time``.

    If ``movie_dir_name`` matches none of the known corpus directories, a
    message is printed and the function returns right after step 1, without
    reporting the elapsed time.

    Args:
        raw_dialogue_file_path: input handed to the selected separator.
        pre_processed_dir, type_d: forwarded to ``delete_all_file_dir``.
        movie_dir_name: selects the separator; compared against
            ``cornell_movie_dir``, ``open_subtitles_dir`` and
            ``movie_subtitles_dir``.
        dialogue_file1, dialogue_file2: forwarded to the separator.
        train_file1, train_file2, test_file1, test_file2, dev_file1,
            dev_file2: forwarded to ``train_test_split``.
        vocab_file1, vocab_file2, compare_count: accepted but not used here;
            vocabulary generation and the chat/reply comparison are disabled.
        max_dialouge_count: forwarded to the separator.
        max_dialouge_count_train_test, train_percentile: forwarded to
            ``train_test_split``.
        symbol_seq: forwarded to the Cornell separator only.

    Returns:
        None.
    """
    print("Extracting file")
    started_at = time.time()
    delete_all_file_dir(pre_processed_dir, type_d)

    # Choose the separator for the requested corpus; each one hands back the
    # pair of dialogue files.
    if movie_dir_name == cornell_movie_dir:
        separated = dialouge_seperator_cornell_movie(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            symbol_seq, max_dialouge_count)
    elif movie_dir_name == open_subtitles_dir:
        separated = dialouge_seperator_open_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    elif movie_dir_name == movie_subtitles_dir:
        separated = dialouge_seperator_movie_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    else:
        print("directory match not found")
        return
    dialogue_file1, dialogue_file2 = separated

    (train_file1, train_file2,
     test_file1, test_file2,
     dev_file1, dev_file2) = train_test_split(
        dialogue_file1, dialogue_file2,
        max_dialouge_count_train_test, train_percentile,
        train_file1, train_file2, test_file1, test_file2,
        dev_file1, dev_file2)

    # Vocabulary extraction (get_vocab / write_dict) and compare_chat_reply
    # are intentionally not run in this variant.

    finished_at = time.time()
    time_lib.elapsed_time(started_at, finished_at)
def pre_process_dialouge(out_dir_path, input_file_path, movie_dir,
                         output_file1, output_file2, vocab_file1, vocab_file2,
                         dialouge_count, type_d, symbol_seq, compare_count):
    """Separate a raw dialogue corpus, write both vocabularies and compare.

    Steps, in order:
      1. ``delete_all_file_dir(out_dir_path, type_d)`` is called first.
      2. The separator matching ``movie_dir`` is run on ``input_file_path``
         and yields the two output file paths, which are printed.
      3. ``get_vocab`` is run on both output files, then each result is
         written with ``write_dict(..., "key")``.
      4. ``compare_chat_reply`` is run on the two output files.
      5. The elapsed time is reported through ``time_lib.elapsed_time``.

    If ``movie_dir`` matches none of the known corpus directories, a message
    is printed and the function returns right after step 1.

    Args:
        out_dir_path, type_d: forwarded to ``delete_all_file_dir``.
        input_file_path: input handed to the selected separator.
        movie_dir: selects the separator; compared against
            ``cornell_movie_dir``, ``open_subtitles_dir`` and
            ``movie_subtitles_dir``.
        output_file1, output_file2: forwarded to the separator.
        vocab_file1, vocab_file2: destinations passed to ``write_dict``.
        dialouge_count: forwarded to the separator.
        symbol_seq: forwarded to the Cornell separator only.
        compare_count: forwarded to ``compare_chat_reply``.

    Returns:
        None.
    """
    print("Extracting file")
    started_at = time.time()
    delete_all_file_dir(out_dir_path, type_d)

    # Choose the separator for the requested corpus.
    if movie_dir == cornell_movie_dir:
        separated = dialouge_seperator_cornell_movie(
            input_file_path, symbol_seq, dialouge_count,
            output_file1, output_file2)
    elif movie_dir == open_subtitles_dir:
        separated = dialouge_seperator_open_subtitle(
            input_file_path, output_file1, output_file2, dialouge_count)
    elif movie_dir == movie_subtitles_dir:
        separated = dialouge_seperator_movie_subtitle(
            input_file_path, output_file1, output_file2, dialouge_count)
    else:
        print("directory match not found")
        return
    output_file1, output_file2 = separated
    print(output_file1, output_file2)

    # Build both vocabularies before writing either one, as the original
    # call order did.
    vocabularies = [get_vocab(path) for path in (output_file1, output_file2)]
    for destination, vocabulary in zip((vocab_file1, vocab_file2), vocabularies):
        write_dict(destination, vocabulary, "key")

    compare_chat_reply(output_file1, output_file2, compare_count)

    finished_at = time.time()
    time_lib.elapsed_time(started_at, finished_at)
def pre_process_dialouge_seq_2_seq(raw_dialogue_file_path, pre_processed_dir,
                                   movie_dir_name, dialogue_file1, dialogue_file2,
                                   train_file1, train_file2, test_file1, test_file2,
                                   max_dialouge_count, max_dialouge_count_train_test,
                                   train_percentile, compare_count, symbol_seq,
                                   type_d, max_len, type_rand, max_test_dev_count):
    """Separate a raw dialogue corpus and split it for the seq2seq pipeline.

    Steps, in order:
      1. ``delete_all_file_dir(pre_processed_dir, type_d)`` is called first.
      2. The separator matching ``movie_dir_name`` is run on
         ``raw_dialogue_file_path`` and yields the two dialogue file paths.
      3. ``train_test_split_seq2_seq`` is run on those dialogue files.
      4. ``compare_chat_reply`` is run on the two dialogue files.
      5. The elapsed time is reported through ``time_lib.elapsed_time``.

    If ``movie_dir_name`` matches none of the known corpus directories, a
    message is printed and the function returns right after step 1.

    Args:
        raw_dialogue_file_path: input handed to the selected separator.
        pre_processed_dir, type_d: forwarded to ``delete_all_file_dir``.
        movie_dir_name: selects the separator; compared against
            ``cornell_movie_dir``, ``open_subtitles_dir`` and
            ``movie_subtitles_dir``.
        dialogue_file1, dialogue_file2: forwarded to the separator.
        train_file1, train_file2, test_file1, test_file2: forwarded to
            ``train_test_split_seq2_seq``.
        max_dialouge_count: forwarded to the separator.
        max_dialouge_count_train_test, train_percentile, max_test_dev_count:
            forwarded to ``train_test_split_seq2_seq``.
        compare_count, type_rand: forwarded to ``compare_chat_reply``.
        symbol_seq, max_len: forwarded to the Cornell separator only.

    Returns:
        None.
    """
    print("Extracting file")
    started_at = time.time()
    delete_all_file_dir(pre_processed_dir, type_d)

    # Choose the separator for the requested corpus; only the Cornell one
    # receives symbol_seq and max_len.
    if movie_dir_name == cornell_movie_dir:
        separated = dialouge_seperator_cornell_movie(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            symbol_seq, max_dialouge_count, max_len)
    elif movie_dir_name == open_subtitles_dir:
        separated = dialouge_seperator_open_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    elif movie_dir_name == movie_subtitles_dir:
        separated = dialouge_seperator_movie_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    else:
        print("directory match not found")
        return
    dialogue_file1, dialogue_file2 = separated

    # The GNMT model variant called train_test_split (with explicit dev file
    # arguments) at this point; the seq2seq splitter is used here instead.
    (train_file1, train_file2,
     test_file1, test_file2,
     dev_file1, dev_file2) = train_test_split_seq2_seq(
        dialogue_file1, dialogue_file2,
        max_dialouge_count_train_test, train_percentile,
        train_file1, train_file2, test_file1, test_file2,
        max_test_dev_count)

    compare_chat_reply(dialogue_file1, dialogue_file2, compare_count, type_rand)

    finished_at = time.time()
    time_lib.elapsed_time(started_at, finished_at)
def pre_process_dialouge(raw_dialogue_file_path, pre_processed_dir, movie_dir_name,
                         dialogue_file1, dialogue_file2,
                         train_file1, train_file2, test_file1, test_file2,
                         dev_file1, dev_file2, vocab_file1, vocab_file2,
                         max_dialouge_count, max_dialouge_count_train_test,
                         train_percentile, symbol_seq, type_d, max_len,
                         max_test_dev_count):
    """Separate the Cornell movie corpus and split it into train/test/dev.

    Steps, in order:
      1. ``delete_all_file_dir(pre_processed_dir, type_d)`` is called first.
      2. ``dialouge_seperator_cornell_movie`` is run on
         ``raw_dialogue_file_path`` and yields the two dialogue file paths.
      3. ``train_test_split`` is run on those dialogue files.
      4. The elapsed time is reported through ``time_lib.elapsed_time``.

    Args:
        raw_dialogue_file_path: input handed to the Cornell separator.
        pre_processed_dir, type_d: forwarded to ``delete_all_file_dir``.
        movie_dir_name, vocab_file1, vocab_file2: accepted but not used by
            this variant.
        dialogue_file1, dialogue_file2, symbol_seq, max_dialouge_count,
            max_len: forwarded to the Cornell separator.
        train_file1, train_file2, test_file1, test_file2, dev_file1,
            dev_file2, max_dialouge_count_train_test, train_percentile,
            max_test_dev_count: forwarded to ``train_test_split``.

    Returns:
        The six values produced by ``train_test_split``, as the tuple
        ``(train_file1, train_file2, test_file1, test_file2, dev_file1,
        dev_file2)``.
    """
    print("Extracting file")
    started_at = time.time()
    delete_all_file_dir(pre_processed_dir, type_d)

    separated = dialouge_seperator_cornell_movie(
        raw_dialogue_file_path, dialogue_file1, dialogue_file2,
        symbol_seq, max_dialouge_count, max_len)
    dialogue_file1, dialogue_file2 = separated

    (train_file1, train_file2,
     test_file1, test_file2,
     dev_file1, dev_file2) = train_test_split(
        dialogue_file1, dialogue_file2,
        max_dialouge_count_train_test, train_percentile,
        train_file1, train_file2, test_file1, test_file2,
        dev_file1, dev_file2, max_test_dev_count)

    finished_at = time.time()
    time_lib.elapsed_time(started_at, finished_at)
    return (train_file1, train_file2,
            test_file1, test_file2,
            dev_file1, dev_file2)
def all(out_dir_path, input_file_path, output_file1, output_file2,
        vocab_file1, vocab_file2, dialouge_count, type_d, compare_count=100):
    """Run the open-subtitles pre-processing pipeline end to end.

    Steps, in order:
      1. ``delete_all_file_dir(out_dir_path, type_d)`` is called first.
      2. ``dialouge_seperator_open_subtitle`` is run on ``input_file_path``
         and yields the two output file paths, which are printed.
      3. ``get_vocab`` is run on both output files, then each result is
         written with ``write_dict(..., "key")``.
      4. ``compare_chat_reply`` is run on the two output files.
      5. The elapsed time is reported through ``time_lib.elapsed_time``.

    NOTE: this function's name shadows the builtin ``all`` for the rest of
    the module; the name is kept so existing callers continue to work.

    Args:
        out_dir_path, type_d: forwarded to ``delete_all_file_dir``.
        input_file_path: raw corpus file handed to the separator.
        output_file1, output_file2: forwarded to the separator.
        vocab_file1, vocab_file2: destinations passed to ``write_dict``.
        dialouge_count: forwarded to the separator.
        compare_count: forwarded to ``compare_chat_reply``; defaults to 100,
            the value that was previously hard-coded.

    Returns:
        None.
    """
    start = time.time()
    delete_all_file_dir(out_dir_path, type_d)

    # Use the caller-supplied path. The previous code read an unrelated
    # module-level name ``filename`` and ignored ``input_file_path``.
    output_file1, output_file2 = dialouge_seperator_open_subtitle(
        input_file_path, output_file1, output_file2, dialouge_count)
    print(output_file1, output_file2)

    # Both vocabularies are built before either is written.
    vocab_dict1 = get_vocab(output_file1)
    vocab_dict2 = get_vocab(output_file2)
    write_dict(vocab_file1, vocab_dict1, "key")
    write_dict(vocab_file2, vocab_dict2, "key")

    compare_chat_reply(output_file1, output_file2, compare_count)

    end = time.time()
    time_lib.elapsed_time(start, end)
output_file2 = "processed_dialouge/cornell movie/train.vi" vocab_file1 = "processed_dialouge/cornell movie/vocab.en" vocab_file2 = "processed_dialouge/cornell movie/vocab.vi" #dialouge_count = 300 dialouge_count = -1 symbol_seq = '+++$+++ ' start = time.time() output_file1, output_file2 = dialouge_seperator_cornell_movie(filename, symbol_seq, 300, output_file1, output_file2) vocab_dict1 = get_vocab(output_file1) vocab_dict2 = get_vocab(output_file2) write_dict(vocab_file1, vocab_dict1, "key") write_dict(vocab_file2, vocab_dict2, "key") compare_chat_reply(output_file1, output_file2, 10) end = time.time() time_lib.elapsed_time(start, end) ''' filename = "corpus/extracted/movie_subtitles_en.txt" output_file1 = "processed_dialouge/movie_subtitles/train.en" output_file2 = "processed_dialouge/movie_subtitles/train.vi" vocab_file1 = "processed_dialouge/movie_subtitles/vocab.en" vocab_file2 = "processed_dialouge/movie_subtitles/vocab.vi" #dialouge_count = 300 dialouge_count = -1 start = time.time() output_file1, output_file2 = dialouge_seperator_movie_subtitle(filename, output_file1, output_file2, dialouge_count) print(output_file1, output_file2) vocab_dict1 = get_vocab(output_file1)