def __apply_dictionary(dictionary, xml_path, output_path):
    """
    Wrap every dictionary key found in the xml files of a folder in its tag.

    Each line of every ``.xml`` file in ``xml_path`` is scanned for the keys of
    ``dictionary``; each occurrence of a key is replaced by
    ``\\n<value>key</value>\\n``. The result is written, under the same file
    name, to the folder returned by ``__prepare_xml_output_folder``.

    :param dictionary: Mapping of search text (key) to tag name (value)
    :param xml_path: The folder containing the xml files to read
    :param output_path: The base output folder handed to
        ``__prepare_xml_output_folder``
    :return: A tuple ``(output folder, number of files written)``
    """
    info("\tApplying dictionary to xml files.")
    output_path = __prepare_xml_output_folder(output_path)
    num_tei_files = 0

    # Loop over each file in the folder
    for folder_file in os.listdir(xml_path):
        # Only xml files are processed
        if not folder_file.endswith(".xml"):
            continue

        info("\t\tFound " + folder_file)
        source_file = os.path.join(xml_path, folder_file)
        target_file = os.path.join(output_path, folder_file)

        # Read the whole input BEFORE opening the output for writing: the
        # input and output folders may be the same, and opening with "w"
        # truncates the file, which would otherwise wipe the input unread.
        with open(source_file, encoding='utf8') as fp:
            lines = fp.readlines()

        with open(target_file, "w", encoding='utf8') as write_file:
            for line in lines:
                # Replacements are applied key by key, so a later key can
                # also match inside text produced by an earlier replacement.
                for key, value in dictionary.items():
                    # An empty key is contained in every string and
                    # str.replace("", ...) would insert the tag between every
                    # character, so empty keys are skipped.
                    if key and key in line:
                        line = line.replace(key, "\n<" + value + ">" + key + "</" + value + ">\n")
                write_file.write(line)

        ok("\t\tWrote new file " + target_file + "\n")
        num_tei_files += 1

    ok("\tFinished applying dictionary to all xml files.")
    ok("\tThe processed files have been output to the folder \"%s\"." % output_path)
    return output_path, num_tei_files
def __check_dictionary_folder(dictionary_folder):
    """
    Make sure the dictionary folder holds at least one txt file.

    Logs the number of ``.txt`` files found. When there are none, a warning
    is logged and the program exits with status 0.

    :param dictionary_folder: The folder expected to contain txt dictionaries
    """
    txt_total = sum(
        1 for name in os.listdir(dictionary_folder) if name.endswith(".txt")
    )

    if txt_total:
        info("\tFound %i txt dictionary files in the folder \"%s\".\n" % (txt_total, dictionary_folder))
        return

    # Nothing to apply: this is not treated as a failure, hence exit code 0.
    warning("\tThe folder \"%s\" contains no txt files!" % dictionary_folder)
    info("Nothing more to do. Exiting without applying dictionary files.")
    sys.exit(0)
def __prepare_xml_output_folder(processed_folder):
    """
    Return the ``tei-prep/`` sub-folder of a folder, creating it if missing.

    :param processed_folder: The base output folder, with or without a
        trailing slash
    :return: The path of the ``tei-prep/`` sub-folder, ending in a slash
    """
    # Append the sub-folder without producing a doubled slash.
    if processed_folder[-1:] == "/":
        tei_folder = processed_folder + "tei-prep/"
    else:
        tei_folder = processed_folder + "/tei-prep/"

    # Nothing to do when the folder is already there.
    if os.path.isdir(tei_folder):
        return tei_folder

    info("\t\tDirectory \"%s\" doesn't yet exist. Trying to create it." % tei_folder)
    assert isinstance(tei_folder, str)
    try:
        os.mkdir(tei_folder)
    except OSError:
        # Without an output folder nothing can be written, so give up.
        error("\t\tCreation of the output directory \"%s\" failed" % tei_folder)
        sys.exit(1)

    ok("\t\tSuccessfully created the output directory \"%s\".\n" % tei_folder)
    return tei_folder
def main():
    """
    Command line entry point: convert docx files to xml and apply dictionaries.

    Parses the ``-i`` (input folder), ``-d`` (dictionary folder) and ``-o``
    (output folder) options, converts the docx files with
    ``docx_to_xml.convert_to_xml``, applies the dictionaries with
    ``apply_dict.process_dictionary`` and logs a short summary, including the
    elapsed time.
    """
    start_time = time.time()

    info("Converting files...")

    # Get Command line arguments. The validators are passed directly as the
    # argparse ``type`` callables; argparse also runs them on string defaults.
    parser = argparse.ArgumentParser(description='Prepare docx files for usage as tei.')
    parser.add_argument("-i", dest="unprocessed_folder", required=False, default="./input/",
                        help="the folder containing docx files for input; the default is \"./input/\".", metavar="INFOLDER",
                        type=is_valid_input_folder)

    parser.add_argument("-d", dest="dictionary", required=False, default="./dictionaries/",
                        help="the folder containing dictionary files for xml transformation; the default is \"./dictionaries/\".", metavar="DICTFOLDER",
                        type=is_valid_dictionary_folder)

    parser.add_argument("-o", dest="processed_folder", required=False, default="./output/",
                        help="the folder where the processed files will be placed; the default is \"./output/\".", metavar="OUTFOLDER",
                        type=is_valid_output_folder)

    args = parser.parse_args()

    # Parse the docx files
    xml_dir = docx_to_xml.convert_to_xml(unprocessed_folder=args.unprocessed_folder, processed_folder=args.processed_folder)

    # Apply the dictionary to the xml files
    num_dictionary_entries, tei_path, num_tei_files = apply_dict.process_dictionary(args.dictionary, xml_dir, args.processed_folder)
    ok("Conversion and processing of files is complete.")
    info("All tasks completed in %f seconds." % (time.time() - start_time))
    info("%i dictionary entries were applied to %i files." % (num_dictionary_entries, num_tei_files))
    info("The fully processed files are available in \"%s\"." % tei_path)
def is_valid_output_folder(arg):
    """
    Normalise an output folder path and create the folder when it is missing.

    :param arg: The output folder path, with or without a trailing slash
    :return: The folder path, ending in a slash
    """
    # Make sure the path ends in exactly the slash the callers rely on.
    folder = arg if arg[-1:] == '/' else arg + "/"

    if os.path.isdir(folder):
        return folder

    info("\tThe output folder \"%s\" does not exist yet, trying to create it." % folder)
    assert isinstance(folder, str)
    try:
        os.mkdir(folder)
    except OSError:
        # No output folder means nothing can be written: stop here.
        error("\tCreation of the output folder \"%s\" failed" % folder)
        sys.exit(1)

    ok("\tSuccessfully created the output folder \"%s\"." % folder)
    return folder
def convert_to_xml(unprocessed_folder, processed_folder):
    # type: (str, str) -> str
    """
    Convert every docx file in a folder to a simple xml file.

    Each paragraph of a document becomes one output line:

    * a paragraph matching the folio pattern (like ``f.1a``) becomes a
      ``<pb n="..."/>`` milestone, keeping only letters, digits and periods;
    * a paragraph matching the column pattern (like ``Col. 2``) becomes a
      ``<cb n="..."/>`` milestone, keeping only digits;
    * any other paragraph is wrapped in ``<ab>...</ab>``.

    When a paragraph matches both patterns it is treated as a folio.

    :param unprocessed_folder: The folder containing unprocessed docx files
    :param processed_folder: The base output folder for the processed xml
        files
    :return: The folder the xml files were written to
    :rtype: str
    """
    info("\nConverting the docx files to xml.")
    processed_folder = __prepare_xml_output_folder(processed_folder)

    # Note: "|" has no special meaning inside a character class and "A-z"
    # also spans the punctuation between "Z" and "a", so the classes are
    # spelled out explicitly.
    folio_designation = re.compile(r'.*[fp].*\..*[0-9].*')
    column_designation = re.compile(r'^.*[Cc]ol.*\..*[0-9].*$')
    leave_alpha_num = re.compile(r'[^A-Za-z0-9.]')
    leave_num = re.compile(r'[^0-9]')

    # Get all files in the folder
    for folder_file in os.listdir(unprocessed_folder):
        # Only docx files are processed
        if not folder_file.endswith(".docx"):
            continue

        info("\tFound " + folder_file)
        output_file = os.path.join(processed_folder, folder_file[0:-5] + ".xml")

        # Load the document first so that a file that cannot be opened does
        # not leave an empty output file behind.
        doc = Document(os.path.join(unprocessed_folder, folder_file))

        with open(output_file, "w", encoding='utf8') as write_file:
            for para in doc.paragraphs:
                line = para.text.strip()  # Strip any whitespace
                # NOTE(review): the text is written verbatim; characters such
                # as "&" or "<" are not escaped.
                if folio_designation.match(line):
                    # Folio break: keep only the designation itself
                    folio = leave_alpha_num.sub('', line)
                    write_file.write("<pb n=\"" + folio + "\"/>\n")
                elif column_designation.match(line):
                    # Column break: keep only the column number
                    column = leave_num.sub('', line)
                    write_file.write("<cb n=\"" + column + "\"/>\n")
                else:
                    # Neither a folio nor a column break, so it is text
                    write_file.write("<ab>" + line + "</ab>\n")

        ok("\tWrote new file " + output_file + "\n")

    return processed_folder
def is_valid_input_folder(arg):
    """
    Check that the input folder exists and contains at least one docx file.

    Logs an error and exits with status 1 when the folder is missing or holds
    no ``.docx`` files.

    :param arg: The input folder path, with or without a trailing slash
    :return: The folder path, ending in a slash
    """
    folder = arg if arg[-1:] == '/' else arg + "/"

    if not os.path.isdir(folder):
        assert isinstance(folder, str)
        error("\tThe folder \"%s\" does not exist!" % folder)
        sys.exit(1)

    # Count the docx files that are available for conversion.
    docx_total = len([name for name in os.listdir(folder) if name.endswith(".docx")])

    if docx_total == 0:
        error("\tThe folder \"%s\" contains no docx files!" % folder)
        sys.exit(1)

    info("\tFound %i docx files in the folder \"%s\".\n" % (docx_total, folder))
    return folder
def __build_dict(dict_folder):
    """
    Build one dictionary from all txt dictionary files in a folder.

    Every line of every ``.txt`` file is expected to have the form
    ``text,tag``. A line is accepted when it has exactly two comma separated
    items, both are non-empty after stripping whitespace, and the tag
    consists only of ASCII letters and digits. Any other line is reported
    with a warning and skipped. When the same text occurs more than once, the
    last tag wins.

    :param dict_folder: The folder containing the txt dictionary files
    :return: A dict mapping each text to its tag name
    """
    info("\tBuilding complete dictionary.")
    __check_dictionary_folder(dict_folder)
    dictionary = {}
    # "A-Za-z" rather than "A-z": the latter also spans the punctuation
    # characters between "Z" and "a" and would let them into tag names.
    non_alpha_num = re.compile(r"[^A-Za-z0-9]")

    # Loop over each file in the folder
    for folder_file in os.listdir(dict_folder):

        # Only txt files are processed
        if not folder_file.endswith(".txt"):
            continue

        # Start an entry count for this file
        file_entries = 0
        info("\t\tFound " + folder_file)

        # Open the dictionary file
        with open(os.path.join(dict_folder, folder_file), encoding='utf8') as fp:
            for line_count, raw_line in enumerate(fp, start=1):
                line = raw_line.strip()  # Strip any accidental whitespace
                entry = [part.strip() for part in line.split(',')]  # Split on the ","

                # An empty text would match everywhere when the dictionary is
                # applied and an empty tag would produce "<>...</>", so both
                # items must be non-empty in addition to the format checks.
                is_valid = (
                    len(entry) == 2
                    and entry[0] != ""
                    and entry[1] != ""
                    and not non_alpha_num.search(entry[1])
                )

                if is_valid:
                    # Create the new entry (any old one is overwritten)
                    dictionary[entry[0]] = entry[1]
                    file_entries += 1
                else:  # Notify the user of the error
                    warning("\t\tThere is an error in line " + str(line_count) + " of " + folder_file + ".")
                    error(line)

        ok("\t\tFinished parsing " + folder_file + ": " + str(file_entries) + " entries.\n")
    ok("\tFinished loading dictionaries. " + str(len(dictionary)) + " entries in total.\n")
    return dictionary
def process_dictionary(dictionary_path, xml_path, output_path):
    """
    Build the dictionary from a folder and apply it to the xml files.

    :param dictionary_path: The folder containing the dictionary files
    :param xml_path: The folder containing the xml files to process
    :param output_path: The base output folder for the processed files
    :return: A tuple ``(number of dictionary entries, output folder,
        number of files written)``
    """
    info("Processing the dictionaries.")
    entries = __build_dict(dictionary_path)
    tei_folder, file_total = __apply_dictionary(entries, xml_path, output_path)
    entry_total = len(entries)
    return entry_total, tei_folder, file_total