def pre_process_dialouge(raw_dialogue_file_path, pre_processed_dir,
                         movie_dir_name, dialogue_file1, dialogue_file2,
                         train_file1, train_file2, test_file1, test_file2,
                         dev_file1, dev_file2, vocab_file1, vocab_file2,
                         max_dialouge_count, max_dialouge_count_train_test,
                         train_percentile, compare_count, symbol_seq, type_d):
    """Separate a raw dialogue corpus and split it into train/test/dev files.

    Steps, in order:

    1. ``delete_all_file_dir(pre_processed_dir, type_d)`` is called
       (presumably clears earlier output; confirm against that helper).
    2. The separator matching ``movie_dir_name`` is run on
       ``raw_dialogue_file_path``; it yields the two dialogue file paths.
    3. ``train_test_split`` is run on those two files.
    4. The elapsed wall-clock time is reported via ``time_lib.elapsed_time``.

    ``vocab_file1``, ``vocab_file2`` and ``compare_count`` are accepted but
    not used by this variant.

    Returns ``None``.  When ``movie_dir_name`` matches none of the known
    corpus directories, a message is printed and the function returns before
    any splitting is done.
    """
    print("Extracting file")
    started = time.time()

    delete_all_file_dir(pre_processed_dir, type_d)

    # Pick the separator that belongs to the requested corpus directory.
    if movie_dir_name == cornell_movie_dir:
        separated = dialouge_seperator_cornell_movie(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            symbol_seq, max_dialouge_count)
    elif movie_dir_name == open_subtitles_dir:
        separated = dialouge_seperator_open_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    elif movie_dir_name == movie_subtitles_dir:
        separated = dialouge_seperator_movie_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    else:
        print("directory match not found")
        return
    dialogue_file1, dialogue_file2 = separated

    split_paths = train_test_split(
        dialogue_file1, dialogue_file2, max_dialouge_count_train_test,
        train_percentile, train_file1, train_file2, test_file1, test_file2,
        dev_file1, dev_file2)
    # The splitter is expected to hand back exactly six paths.
    (train_file1, train_file2, test_file1, test_file2,
     dev_file1, dev_file2) = split_paths

    # Vocabulary export and compare_chat_reply are disabled in this variant.

    finished = time.time()
    time_lib.elapsed_time(started, finished)
def pre_process_dialouge(out_dir_path, input_file_path, movie_dir,
                         output_file1, output_file2, vocab_file1, vocab_file2,
                         dialouge_count, type_d, symbol_seq, compare_count):
    """Separate a raw dialogue corpus, write its vocabularies and compare replies.

    Steps, in order:

    1. ``delete_all_file_dir(out_dir_path, type_d)`` is called (presumably
       clears earlier output; confirm against that helper).
    2. The separator matching ``movie_dir`` is run on ``input_file_path``
       and yields the two output file paths, which are printed.
    3. ``get_vocab`` is run on both output files and each result is written
       with ``write_dict(..., "key")`` to ``vocab_file1`` / ``vocab_file2``.
    4. ``compare_chat_reply`` is called with ``compare_count``.
    5. The elapsed wall-clock time is reported via ``time_lib.elapsed_time``.

    Returns ``None``.  When ``movie_dir`` matches none of the known corpus
    directories, a message is printed and the function returns early.
    """
    print("Extracting file")
    started = time.time()
    delete_all_file_dir(out_dir_path, type_d)

    # Note: the Cornell separator takes its arguments in a different order
    # than the two subtitle separators in this variant.
    if movie_dir == cornell_movie_dir:
        separated = dialouge_seperator_cornell_movie(
            input_file_path, symbol_seq, dialouge_count,
            output_file1, output_file2)
    elif movie_dir == open_subtitles_dir:
        separated = dialouge_seperator_open_subtitle(
            input_file_path, output_file1, output_file2, dialouge_count)
    elif movie_dir == movie_subtitles_dir:
        separated = dialouge_seperator_movie_subtitle(
            input_file_path, output_file1, output_file2, dialouge_count)
    else:
        print("directory match not found")
        return
    output_file1, output_file2 = separated

    print(output_file1, output_file2)

    # Build both vocabularies before writing either one.
    vocabularies = (get_vocab(output_file1), get_vocab(output_file2))
    for vocab_path, vocab in zip((vocab_file1, vocab_file2), vocabularies):
        write_dict(vocab_path, vocab, "key")

    compare_chat_reply(output_file1, output_file2, compare_count)

    finished = time.time()
    time_lib.elapsed_time(started, finished)
def pre_process_dialouge_seq_2_seq(raw_dialogue_file_path, pre_processed_dir,
                                   movie_dir_name, dialogue_file1,
                                   dialogue_file2, train_file1, train_file2,
                                   test_file1, test_file2, max_dialouge_count,
                                   max_dialouge_count_train_test,
                                   train_percentile, compare_count, symbol_seq,
                                   type_d, max_len, type_rand,
                                   max_test_dev_count):
    """Separate a raw dialogue corpus and split it for the seq2seq pipeline.

    Steps, in order:

    1. ``delete_all_file_dir(pre_processed_dir, type_d)`` is called
       (presumably clears earlier output; confirm against that helper).
    2. The separator matching ``movie_dir_name`` is run on
       ``raw_dialogue_file_path``; only the Cornell separator receives
       ``max_len``.
    3. ``train_test_split_seq2_seq`` is run on the two dialogue files.
    4. ``compare_chat_reply`` is called with ``compare_count`` and
       ``type_rand``.
    5. The elapsed wall-clock time is reported via ``time_lib.elapsed_time``.

    Returns ``None``.  When ``movie_dir_name`` matches none of the known
    corpus directories, a message is printed and the function returns early.
    """
    print("Extracting file")
    started = time.time()

    delete_all_file_dir(pre_processed_dir, type_d)

    if movie_dir_name == cornell_movie_dir:
        separated = dialouge_seperator_cornell_movie(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            symbol_seq, max_dialouge_count, max_len)
    elif movie_dir_name == open_subtitles_dir:
        separated = dialouge_seperator_open_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    elif movie_dir_name == movie_subtitles_dir:
        separated = dialouge_seperator_movie_subtitle(
            raw_dialogue_file_path, dialogue_file1, dialogue_file2,
            max_dialouge_count)
    else:
        print("directory match not found")
        return
    dialogue_file1, dialogue_file2 = separated

    # The seq2seq splitter is used here; the GNMT-style train_test_split
    # call (which also takes dev file paths) is not run in this variant.
    split_paths = train_test_split_seq2_seq(
        dialogue_file1, dialogue_file2, max_dialouge_count_train_test,
        train_percentile, train_file1, train_file2, test_file1, test_file2,
        max_test_dev_count)
    # The splitter is expected to hand back exactly six paths.
    (train_file1, train_file2, test_file1, test_file2,
     dev_file1, dev_file2) = split_paths

    compare_chat_reply(dialogue_file1, dialogue_file2, compare_count,
                       type_rand)

    finished = time.time()
    time_lib.elapsed_time(started, finished)
def pre_process_dialouge(raw_dialogue_file_path, pre_processed_dir,
                         movie_dir_name, dialogue_file1, dialogue_file2,
                         train_file1, train_file2, test_file1, test_file2,
                         dev_file1, dev_file2, vocab_file1, vocab_file2,
                         max_dialouge_count, max_dialouge_count_train_test,
                         train_percentile, symbol_seq, type_d, max_len,
                         max_test_dev_count):
    """Separate the Cornell dialogue corpus and split it into train/test/dev.

    ``delete_all_file_dir(pre_processed_dir, type_d)`` is called first
    (presumably clears earlier output; confirm against that helper).  The
    Cornell separator is then run unconditionally; ``movie_dir_name``,
    ``vocab_file1`` and ``vocab_file2`` are accepted but not used by this
    variant.  The elapsed wall-clock time is reported via
    ``time_lib.elapsed_time``.

    Returns:
        The 6-tuple ``(train_file1, train_file2, test_file1, test_file2,
        dev_file1, dev_file2)`` as produced by ``train_test_split``.
    """
    print("Extracting file")
    started = time.time()

    delete_all_file_dir(pre_processed_dir, type_d)

    separated = dialouge_seperator_cornell_movie(
        raw_dialogue_file_path, dialogue_file1, dialogue_file2, symbol_seq,
        max_dialouge_count, max_len)
    source_path, target_path = separated

    split_paths = train_test_split(
        source_path, target_path, max_dialouge_count_train_test,
        train_percentile, train_file1, train_file2, test_file1, test_file2,
        dev_file1, dev_file2, max_test_dev_count)
    # Unpack so that anything other than six paths fails here, and so the
    # caller always receives a tuple.
    (train_path1, train_path2, test_path1, test_path2,
     dev_path1, dev_path2) = split_paths

    finished = time.time()
    time_lib.elapsed_time(started, finished)

    return (train_path1, train_path2, test_path1, test_path2,
            dev_path1, dev_path2)
# Example #5
# 0
def all(out_dir_path, input_file_path, output_file1, output_file2, vocab_file1,
        vocab_file2, dialouge_count, type_d, compare_count=100):
    """Run the OpenSubtitles pre-processing pipeline end to end.

    Steps, in order:

    1. ``delete_all_file_dir(out_dir_path, type_d)`` is called (presumably
       clears earlier output; confirm against that helper).
    2. ``dialouge_seperator_open_subtitle`` is run on ``input_file_path``
       and yields the two output file paths, which are printed.
    3. ``get_vocab`` is run on both output files and each result is written
       with ``write_dict(..., "key")`` to ``vocab_file1`` / ``vocab_file2``.
    4. ``compare_chat_reply`` is called with ``compare_count``.
    5. The elapsed wall-clock time is reported via ``time_lib.elapsed_time``.

    Args:
        out_dir_path: Directory passed to ``delete_all_file_dir``.
        input_file_path: Raw corpus file handed to the separator.
        output_file1: First output path passed to the separator.
        output_file2: Second output path passed to the separator.
        vocab_file1: Destination for the vocabulary of the first output.
        vocab_file2: Destination for the vocabulary of the second output.
        dialouge_count: Count limit forwarded to the separator.
        type_d: Second argument forwarded to ``delete_all_file_dir``.
        compare_count: Count forwarded to ``compare_chat_reply``; defaults
            to 100, the value that used to be hard-coded.

    Returns:
        None.

    Note:
        The function name shadows the builtin ``all``; it is kept because
        it is this function's public name.
    """
    start = time.time()
    delete_all_file_dir(out_dir_path, type_d)

    # Read from the caller-supplied path.  The previous code referenced the
    # free name ``filename`` and silently ignored ``input_file_path``.
    output_file1, output_file2 = dialouge_seperator_open_subtitle(
        input_file_path, output_file1, output_file2, dialouge_count)
    print(output_file1, output_file2)

    vocab_dict1 = get_vocab(output_file1)
    vocab_dict2 = get_vocab(output_file2)

    write_dict(vocab_file1, vocab_dict1, "key")
    write_dict(vocab_file2, vocab_dict2, "key")

    compare_chat_reply(output_file1, output_file2, compare_count)

    end = time.time()
    time_lib.elapsed_time(start, end)
# Script fragment: runs the Cornell separator, writes both vocabularies and
# compares replies, timing the whole run.
# NOTE(review): relies on `filename` and `output_file1` being assigned
# earlier in the file (not visible in this section) — confirm.
output_file2 = "processed_dialouge/cornell movie/train.vi"
vocab_file1 = "processed_dialouge/cornell movie/vocab.en"
vocab_file2 = "processed_dialouge/cornell movie/vocab.vi"
#dialouge_count = 300
# NOTE(review): dialouge_count is assigned here, but the separator call below
# passes the literal 300 instead of this variable — confirm which is intended.
dialouge_count = -1
# Passed to the Cornell separator; presumably the field delimiter used in the
# raw corpus lines — verify against the separator implementation.
symbol_seq = '+++$+++ '

start = time.time()
# The separator returns the two output file paths, which replace the inputs.
output_file1, output_file2 = dialouge_seperator_cornell_movie(filename, symbol_seq, 300, output_file1, output_file2)
vocab_dict1 = get_vocab(output_file1)
vocab_dict2 = get_vocab(output_file2)
write_dict(vocab_file1, vocab_dict1, "key")
write_dict(vocab_file2, vocab_dict2, "key")
compare_chat_reply(output_file1, output_file2, 10)
end = time.time()
# Report the wall-clock duration of the run.
time_lib.elapsed_time(start, end)

'''
filename = "corpus/extracted/movie_subtitles_en.txt"
output_file1 = "processed_dialouge/movie_subtitles/train.en"
output_file2 = "processed_dialouge/movie_subtitles/train.vi"
vocab_file1 = "processed_dialouge/movie_subtitles/vocab.en"
vocab_file2 = "processed_dialouge/movie_subtitles/vocab.vi"
#dialouge_count = 300
dialouge_count = -1


start = time.time()
output_file1, output_file2 = dialouge_seperator_movie_subtitle(filename, output_file1, output_file2, dialouge_count)
print(output_file1, output_file2)
vocab_dict1 = get_vocab(output_file1)