def create_frequent_words_only_corpus_creator(
        input_file_path: str, output_file_path: str,
        collapse_word_case_str: str, vocabulary_size_str: str):
    """Build a ``FrequentWordsOnlyCorpusCreator`` from string arguments.

    Args:
        input_file_path: Passed both to the word frequency table factory
            and to the corpus creator.
        output_file_path: Passed through to the corpus creator.
        collapse_word_case_str: Textual boolean, converted with
            ``Utils.str2bool`` and handed to the frequency table factory.
        vocabulary_size_str: Textual integer, converted with ``int``.

    Returns:
        The ``FrequentWordsOnlyCorpusCreator`` built from the input path,
        the output path, the word frequency table and the vocabulary size.

    Raises:
        ValueError: If ``vocabulary_size_str`` is not a valid integer.
    """
    # Convert both textual arguments up front so that a malformed value is
    # reported before the frequency table is built.
    should_collapse_case = Utils.str2bool(collapse_word_case_str)
    size_limit = int(vocabulary_size_str)

    frequency_table = WordFrequencyTable.create_word_frequency_table(
        input_file_path, should_collapse_case)

    return FrequentWordsOnlyCorpusCreator(
        input_file_path,
        output_file_path,
        frequency_table,
        size_limit,
    )
def create_lines_with_explicit_word_separator_file_creator(
        input_file_path: str, output_file_path: str,
        integrate_word_separator_with_words_string: str):
    """Build a ``LinesWithExplicitWordSeparatorFileCreator``.

    Args:
        input_file_path: Passed through to the file creator.
        output_file_path: Passed through to the file creator.
        integrate_word_separator_with_words_string: Textual boolean,
            converted with ``Utils.str2bool`` before being passed on.

    Returns:
        A ``LinesWithExplicitWordSeparatorFileCreator`` configured with the
        class-level ``EXPLICIT_WORD_SEPARATOR_SYMBOL`` as its separator.
    """
    integrate_flag = Utils.str2bool(
        integrate_word_separator_with_words_string)

    # The separator symbol is always the constant declared on the class.
    separator_symbol = (
        LinesWithExplicitWordSeparatorFileCreator
        .EXPLICIT_WORD_SEPARATOR_SYMBOL
    )

    return LinesWithExplicitWordSeparatorFileCreator(
        input_file_path,
        output_file_path,
        separator_symbol,
        integrate_flag,
    )
def main():
    """Run the vocabulary word coverage analysis from the command line.

    Expects exactly three arguments after the program name, in this order:
    LANGUAGE_MODEL_TRAINING_FILE_PATH, TEST_SET_FILE_PATH and
    COLLAPSE_CASING (the last is converted with ``Utils.str2bool``).

    Raises:
        RuntimeError: If ``sys.argv`` does not contain exactly four entries.
    """
    # NOTE: a disabled, earlier variant of this entry point took a single
    # INPUT_FILE_PATH argument and called
    # WordFrequencyTable.test_word_frequency_table(input_file_path).
    expected_argv_length = 4  # program name + three arguments
    if len(sys.argv) != expected_argv_length:
        raise RuntimeError(
            "Error: vocabulary_word_coverage_analysis "
            "LANGUAGE_MODEL_TRAINING_FILE_PATH TEST_SET_FILE_PATH "
            "COLLAPSE_CASING")

    _, training_path, test_set_path, collapse_casing_argument = sys.argv
    collapse_casing = Utils.str2bool(collapse_casing_argument)

    analysis = (
        VocabularyWordCoverageAnalysis
        .create_vocabulary_word_coverage_analysis(
            training_path, test_set_path, collapse_casing)
    )
    analysis.make_coverage_for_vocabulary_sizes_table()
def main():
    """Run the IAM database fragments remover from the command line.

    Echoes every command-line argument, validates the argument count and
    then builds a ``FilteredLobCorpusCreator`` that writes the filtered
    output file.

    Expects exactly seven arguments after the program name, in this order:
    IAM_LINES_FILE_PATH, IAM_DATABASE_LINE_IMAGES_ROOT_FOLDER_PATH,
    IAM_ORIGINAL_FILES_DIRECTORY_PATH, CORPUS_OUTPUT_FILE_PATH,
    PERMUTATION_FILE_PATH, VOCABULARY_FILE_PATH and
    KEEP_NEWLINES_WITHIN_FRAGMENTS (converted with ``Utils.str2bool``).

    Raises:
        RuntimeError: If ``sys.argv`` does not contain exactly eight entries.
    """
    # Echo the received arguments (1-based, program name excluded).
    for position, argument in enumerate(sys.argv[1:], start=1):
        print("sys.argv[" + str(position) + "]: " + argument)

    argv_length = len(sys.argv)
    if argv_length != 8:
        print("number of arguments: " + str(argv_length))
        raise RuntimeError(
            "Error - usage: "
            "iam_database_fragments_remover IAM_LINES_FILE_PATH "
            "IAM_DATABASE_LINE_IMAGES_ROOT_FOLDER_PATH "
            "IAM_ORIGINAL_FILES_DIRECTORY_PATH "
            "CORPUS_OUTPUT_FILE_PATH "
            "PERMUTATION_FILE_PATH "
            "VOCABULARY_FILE_PATH "
            "KEEP_NEWLINES_WITHIN_FRAGMENTS")

    (
        _,
        lines_file,
        line_images_root,
        original_files_directory,
        corpus_output_file,
        permutation_file,
        vocabulary_file,
        keep_newlines_argument,
    ) = sys.argv

    print("iam_lines_file_path: " + lines_file)
    print("iam_database_line_images_root_folder_path: " + line_images_root)

    keep_newlines = Utils.str2bool(keep_newlines_argument)

    # The factory's positional order differs from the command-line order:
    # the permutation and vocabulary files come before the original files
    # directory and the corpus output file.
    corpus_creator = FilteredLobCorpusCreator.create_filterd_lob_corpus_creator(
        lines_file,
        line_images_root,
        permutation_file,
        vocabulary_file,
        original_files_directory,
        corpus_output_file,
        keep_newlines,
    )
    corpus_creator.create_iam_validation_and_test_fragments_filtered_output_file()