コード例 #1
0
 def create_frequent_words_only_corpus_creator(
         input_file_path: str, output_file_path: str,
         collapse_word_case_str: str, vocabulary_size_str: str):
     """Build a FrequentWordsOnlyCorpusCreator from string-valued arguments.

     Args:
         input_file_path: Path handed to both the word frequency table
             factory and the corpus creator.
         output_file_path: Path handed to the corpus creator.
         collapse_word_case_str: Textual flag, converted with
             ``Utils.str2bool`` and passed to the frequency table factory.
         vocabulary_size_str: Textual integer, converted with ``int``.

     Returns:
         The FrequentWordsOnlyCorpusCreator constructed from the paths, the
         word frequency table and the parsed vocabulary size.

     Raises:
         ValueError: If ``vocabulary_size_str`` is not a valid integer.
     """
     # Parse the textual arguments first, in the same order as they are
     # declared, so a malformed flag is reported before a malformed size.
     fold_case = Utils.str2bool(collapse_word_case_str)
     max_vocabulary = int(vocabulary_size_str)

     frequencies = WordFrequencyTable.create_word_frequency_table(
         input_file_path, fold_case)

     return FrequentWordsOnlyCorpusCreator(
         input_file_path, output_file_path, frequencies, max_vocabulary)
    def create_lines_with_explicit_word_separator_file_creator(
            input_file_path: str, output_file_path: str,
            integrate_word_separator_with_words_string: str):
        """Build a LinesWithExplicitWordSeparatorFileCreator from strings.

        Args:
            input_file_path: Path forwarded to the creator.
            output_file_path: Path forwarded to the creator.
            integrate_word_separator_with_words_string: Textual flag,
                converted with ``Utils.str2bool`` before being forwarded.

        Returns:
            A LinesWithExplicitWordSeparatorFileCreator configured with the
            class-level ``EXPLICIT_WORD_SEPARATOR_SYMBOL`` and the parsed
            flag.
        """
        integrate_flag = Utils.str2bool(
            integrate_word_separator_with_words_string)

        # The separator symbol is always the constant defined on the
        # creator class itself; callers cannot override it here.
        creator_cls = LinesWithExplicitWordSeparatorFileCreator
        return creator_cls(
            input_file_path,
            output_file_path,
            creator_cls.EXPLICIT_WORD_SEPARATOR_SYMBOL,
            integrate_flag)
コード例 #3
0
def main():
    """Run the vocabulary word coverage analysis from the command line.

    Expects exactly three command-line arguments after the program name:
    the language model training file path, the test set file path and a
    textual flag (converted with ``Utils.str2bool``) for collapsing casing.

    Raises:
        RuntimeError: If the number of command-line arguments is wrong.
    """
    # NOTE: an earlier, disabled entry point took a single INPUT_FILE_PATH
    # argument and ran WordFrequencyTable.test_word_frequency_table on it.

    expected_argv_length = 4  # program name + three arguments
    if len(sys.argv) != expected_argv_length:
        usage_message = (
            "Error: vocabulary_word_coverage_analysis "
            "LANGUAGE_MODEL_TRAINING_FILE_PATH TEST_SET_FILE_PATH "
            "COLLAPSE_CASING")
        raise RuntimeError(usage_message)

    training_path, test_set_path, casing_flag_text = sys.argv[1:]
    collapse_casing = Utils.str2bool(casing_flag_text)

    analysis = (
        VocabularyWordCoverageAnalysis.create_vocabulary_word_coverage_analysis(
            training_path, test_set_path, collapse_casing))
    analysis.make_coverage_for_vocabulary_sizes_table()
コード例 #4
0
def main():
    """Create the filtered LOB corpus output file from the command line.

    Echoes every command-line argument to stdout, then expects exactly
    seven arguments after the program name, in this order: IAM lines file
    path, IAM database line images root folder path, IAM original files
    directory path, corpus output file path, permutation file path,
    vocabulary file path, and a textual keep-newlines flag (converted with
    ``Utils.str2bool``).

    Raises:
        RuntimeError: If the number of command-line arguments is wrong.
    """
    # Echo the raw arguments before validating, so a bad invocation can be
    # diagnosed from the output.
    for position, argument in enumerate(sys.argv[1:], start=1):
        print(f"sys.argv[{position}]: {argument}")

    argv_length = len(sys.argv)
    if argv_length != 8:
        print(f"number of arguments: {argv_length}")
        usage_message = (
            "Error - usage: "
            "iam_database_fragments_remover IAM_LINES_FILE_PATH "
            "IAM_DATABASE_LINE_IMAGES_ROOT_FOLDER_PATH "
            "IAM_ORIGINAL_FILES_DIRECTORY_PATH "
            "CORPUS_OUTPUT_FILE_PATH "
            "PERMUTATION_FILE_PATH "
            "VOCABULARY_FILE_PATH "
            "KEEP_NEWLINES_WITHIN_FRAGMENTS")
        raise RuntimeError(usage_message)

    (lines_file,
     line_images_root,
     original_files_dir,
     corpus_output_file,
     permutation_file,
     vocabulary_file,
     keep_newlines_text) = sys.argv[1:]

    print(f"iam_lines_file_path: {lines_file}")
    print(f"iam_database_line_images_root_folder_path: {line_images_root}")

    keep_newlines = Utils.str2bool(keep_newlines_text)

    # The factory takes its arguments in a different order than the command
    # line supplies them: permutation and vocabulary paths come before the
    # original-files directory and the output path.
    corpus_creator = FilteredLobCorpusCreator.create_filterd_lob_corpus_creator(
        lines_file, line_images_root,
        permutation_file, vocabulary_file,
        original_files_dir, corpus_output_file,
        keep_newlines)
    corpus_creator.create_iam_validation_and_test_fragments_filtered_output_file()