Ejemplo n.º 1
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):
    """Count phones, biphones and triphones in a wordlist and write them out.

    Each non-comment line of the wordlist is split on whitespace: the first
    field is the word and the second, if it parses as an integer, is its
    frequency (otherwise the frequency defaults to 1).  The word is
    case-folded, wrapped in "#" word-boundary symbols, and every character is
    counted as one phone.

    For phones, biphones and triphones a tab-separated ``.txt`` file and a
    ``.json`` file are written into a "phon" folder, located next to
    ``filename`` when it is given and under ``datafolder/language`` otherwise.

    Args:
        language: language name; used for the default output folder.
        corpus: corpus name; shown in the "wordlist not found" notice.
        datafolder: root data folder; used for the default output folder.
        filename: explicit input file; if given, output goes next to it.
        maxwordtokens: token limit, forwarded to the wordlist lookup and to
            ``ngrams.main``.
        use_corpus: if true and the wordlist is missing, ``ngrams.main`` is
            run (presumably to generate it -- confirm against ngrams.py);
            if false, the program exits instead.

    All six arguments are also forwarded to ``get_wordlist_path_corpus_stem``,
    which decides the wordlist path and the stem used in output filenames.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if not use_corpus:
            # The message needs a placeholder, otherwise the offending path
            # is silently dropped by str.format().
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

        warning = " ({} tokens)".format(maxwordtokens) if maxwordtokens else ""
        print("\nWordlist for {}{} not found.\n"
              "ngrams.py is now run.\n".format(corpus, warning))
        ngrams.main(language=language, corpus=corpus,
                    datafolder=datafolder, filename=filename,
                    maxwordtokens=maxwordtokens)

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for raw_line in f:
            # Strip BEFORE testing for emptiness: a raw blank line is "\n",
            # which is truthy and would make the unpacking below fail.
            line = raw_line.strip().casefold()

            if not line or line.startswith("#"):
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                # no frequency column, or not an integer
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries
            lenPhones = len(phones)

            # Slide a three-character window over the word.  Each step
            # accounts for the window's last phone and last biphone; the
            # first window additionally accounts for its two leading phones
            # and its leading biphone, so nothing is counted twice.
            for i in range(lenPhones - 2):

                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq

                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    # (txt output path, counter) for each n-gram size, in output order
    ngram_outputs = (
        (outfilenamePhones, phoneDict),
        (outfilenameBiphones, biphoneDict),
        (outfilenameTriphones, triphoneDict),
    )

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    for txt_path, counts in ngram_outputs:
        ngrams_sorted = sorted_alphabetized(counts.items(),
                                            key=lambda x: x[1], reverse=True)
        with txt_path.open('w') as f:
            print(intro_string, file=f)
            print("# type count: {}".format(len(ngrams_sorted)), file=f)
            print("# token count: {}".format(sum(counts.values())), file=f)
            for (ngram, freq) in ngrams_sorted:
                print(ngram + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    json_paths = []
    for txt_path, counts in ngram_outputs:
        json_path = changeFilenameSuffix(txt_path, '.json')
        with json_path.open('w') as f:
            json_pdump(counts, f, key=lambda x: x[1], reverse=True)
        json_paths.append(json_path)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                *json_paths)
Ejemplo n.º 2
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         maxwordtokens=0,
         use_corpus=True):
    """Count phones, biphones and triphones in a wordlist and write them out.

    Each non-comment line of the wordlist is split on whitespace: the first
    field is the word and the second, if it parses as an integer, is its
    frequency (otherwise the frequency defaults to 1).  The word is
    case-folded, wrapped in "#" word-boundary symbols, and every character is
    counted as one phone.

    For phones, biphones and triphones a tab-separated ``.txt`` file and a
    ``.json`` file are written into a "phon" folder, located next to
    ``filename`` when it is given and under ``datafolder/language`` otherwise.

    Args:
        language: language name; used for the default output folder.
        corpus: corpus name; shown in the "wordlist not found" notice.
        datafolder: root data folder; used for the default output folder.
        filename: explicit input file; if given, output goes next to it.
        maxwordtokens: token limit, forwarded to the wordlist lookup and to
            ``ngrams.main``.
        use_corpus: if true and the wordlist is missing, ``ngrams.main`` is
            run (presumably to generate it -- confirm against ngrams.py);
            if false, the program exits instead.

    All six arguments are also forwarded to ``get_wordlist_path_corpus_stem``,
    which decides the wordlist path and the stem used in output filenames.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if not use_corpus:
            # The message needs a placeholder, otherwise the offending path
            # is silently dropped by str.format().
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

        warning = " ({} tokens)".format(maxwordtokens) if maxwordtokens else ""
        print("\nWordlist for {}{} not found.\n"
              "ngrams.py is now run.\n".format(corpus, warning))
        ngrams.main(language=language,
                    corpus=corpus,
                    datafolder=datafolder,
                    filename=filename,
                    maxwordtokens=maxwordtokens)

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for raw_line in f:
            # Strip BEFORE testing for emptiness: a raw blank line is "\n",
            # which is truthy and would make the unpacking below fail.
            line = raw_line.strip().casefold()

            if not line or line.startswith("#"):
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                # no frequency column, or not an integer
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries
            lenPhones = len(phones)

            # Slide a three-character window over the word.  Each step
            # accounts for the window's last phone and last biphone; the
            # first window additionally accounts for its two leading phones
            # and its leading biphone, so nothing is counted twice.
            for i in range(lenPhones - 2):

                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq

                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    # (txt output path, counter) for each n-gram size, in output order
    ngram_outputs = (
        (outfilenamePhones, phoneDict),
        (outfilenameBiphones, biphoneDict),
        (outfilenameTriphones, triphoneDict),
    )

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    for txt_path, counts in ngram_outputs:
        ngrams_sorted = sorted_alphabetized(counts.items(),
                                            key=lambda x: x[1],
                                            reverse=True)
        with txt_path.open('w') as f:
            print(intro_string, file=f)
            print("# type count: {}".format(len(ngrams_sorted)), file=f)
            print("# token count: {}".format(sum(counts.values())), file=f)
            for (ngram, freq) in ngrams_sorted:
                print(ngram + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    json_paths = []
    for txt_path, counts in ngram_outputs:
        json_path = changeFilenameSuffix(txt_path, '.json')
        with json_path.open('w') as f:
            json_pdump(counts, f, key=lambda x: x[1], reverse=True)
        json_paths.append(json_path)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:", outfilenamePhones, outfilenameBiphones,
                outfilenameTriphones, *json_paths)
Ejemplo n.º 3
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MinimumAffixLength=1,
         SF_threshold=3,
         maxwordtokens=0,
         use_corpus=True):
    """Run the tries.py pipeline on a wordlist and write its output files.

    The wordlist is read with ``read_word_freq``; its sorted word types are
    processed left-to-right, and the sorted reversed words right-to-left.
    For each direction the words are broken up (``findBreaksInWords`` /
    ``BreakUpEachWord``), successors/predecessors are computed, and the
    results are written as ``.txt`` and ``.json`` files, plus a signatures
    file, into a "tries" folder.  That folder is located next to ``filename``
    when it is given and under ``datafolder/language`` otherwise.

    Args:
        language: language name; used for the default output folder.
        corpus: corpus name; shown in the "wordlist not found" notice.
        datafolder: root data folder; used for the default output folder.
        filename: explicit input file; if given, output goes next to it.
        MinimumStemLength: passed to ``findBreaksInWords``.
        MinimumAffixLength: accepted for interface compatibility; not used
            anywhere in this function.
        SF_threshold: passed to ``OutputSuccessors``.
        maxwordtokens: token limit, forwarded to the wordlist lookup and to
            ``ngrams.main``.
        use_corpus: if true and the wordlist is missing, ``ngrams.main`` is
            run (presumably to generate it -- confirm against ngrams.py);
            if false, the program exits instead.
    """
    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    #        read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if not use_corpus:
            # The message needs a placeholder, otherwise the offending path
            # is silently dropped by str.format().
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

        warning = " ({} tokens)".format(maxwordtokens) if maxwordtokens else ""
        print("\nWordlist for {}{} not found.\n"
              "ngrams.py is now run.\n".format(corpus, warning))
        ngrams.main(language=language,
                    corpus=corpus,
                    datafolder=datafolder,
                    filename=filename,
                    maxwordtokens=maxwordtokens)

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    # reversed spellings, re-sorted, drive the right-to-left analysis
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    #        output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")

    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")

    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    #        Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    #        Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    #        Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    # predecessors are the successors of the reversed words
    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    # JSON files are written inside `with` blocks so that each handle is
    # flushed and closed as soon as the dump finishes.
    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    with outfile_SF_name_json.open("w") as f:
        json_pdump(successors, f)

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    with outfile_PF_name_json.open("w") as f:
        json_pdump(predecessors, f)

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    #        Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name,
               reversedwordlist,
               WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    with outfile_trieLtoR_name_json.open("w") as f:
        json_pdump(WordsBrokenLtoR, f)

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    with outfile_trieRtoL_name_json.open("w") as f:
        json_pdump(WordsBrokenRtoL, f)

    stdout_list("Output files:", outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name, outfile_SF_name_json,
                outfile_PF_name_json, outfile_trieLtoR_name_json,
                outfile_trieRtoL_name_json)
Ejemplo n.º 4
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MinimumAffixLength=1, SF_threshold=3,
         maxwordtokens=0, use_corpus=True):
    """Run the tries.py pipeline on a wordlist and write its output files.

    The wordlist is read with ``read_word_freq``; its sorted word types are
    processed left-to-right, and the sorted reversed words right-to-left.
    For each direction the words are broken up (``findBreaksInWords`` /
    ``BreakUpEachWord``), successors/predecessors are computed, and the
    results are written as ``.txt`` and ``.json`` files, plus a signatures
    file, into a "tries" folder.  That folder is located next to ``filename``
    when it is given and under ``datafolder/language`` otherwise.

    Args:
        language: language name; used for the default output folder.
        corpus: corpus name; shown in the "wordlist not found" notice.
        datafolder: root data folder; used for the default output folder.
        filename: explicit input file; if given, output goes next to it.
        MinimumStemLength: passed to ``findBreaksInWords``.
        MinimumAffixLength: accepted for interface compatibility; not used
            anywhere in this function.
        SF_threshold: passed to ``OutputSuccessors``.
        maxwordtokens: token limit, forwarded to the wordlist lookup and to
            ``ngrams.main``.
        use_corpus: if true and the wordlist is missing, ``ngrams.main`` is
            run (presumably to generate it -- confirm against ngrams.py);
            if false, the program exits instead.
    """
    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    #        read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if not use_corpus:
            # The message needs a placeholder, otherwise the offending path
            # is silently dropped by str.format().
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

        warning = " ({} tokens)".format(maxwordtokens) if maxwordtokens else ""
        print("\nWordlist for {}{} not found.\n"
              "ngrams.py is now run.\n".format(corpus, warning))
        ngrams.main(language=language, corpus=corpus,
                    datafolder=datafolder, filename=filename,
                    maxwordtokens=maxwordtokens)

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    # reversed spellings, re-sorted, drive the right-to-left analysis
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    #        output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")

    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")

    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    #        Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    #        Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    #        Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    # predecessors are the successors of the reversed words
    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    # JSON files are written inside `with` blocks so that each handle is
    # flushed and closed as soon as the dump finishes.
    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    with outfile_SF_name_json.open("w") as f:
        json_pdump(successors, f)

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    with outfile_PF_name_json.open("w") as f:
        json_pdump(predecessors, f)

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    #        Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name, reversedwordlist, WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    with outfile_trieLtoR_name_json.open("w") as f:
        json_pdump(WordsBrokenLtoR, f)

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    with outfile_trieRtoL_name_json.open("w") as f:
        json_pdump(WordsBrokenRtoL, f)

    stdout_list("Output files:", outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name,
                outfile_SF_name_json, outfile_PF_name_json,
                outfile_trieLtoR_name_json,
                outfile_trieRtoL_name_json)
Ejemplo n.º 5
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MaximumAffixLength=3,
         MinimumNumberofSigUses=5,
         maxwordtokens=0,
         use_corpus=True):
    """Run the lxa5.py pipeline on a wordlist and write its output files.

    The wordlist is read with ``read_word_freq`` and its sorted word types
    are fed through a chain of ``Make*`` helpers that build the
    stem/signature/word/affix mappings listed in the comment block below.
    Those mappings, plus three word lists derived from them, are written
    into an "lxa" folder, located next to ``filename`` when it is given and
    under ``datafolder/language`` otherwise.

    Args:
        language: language name; selects prefixing vs. suffixing analysis
            and is used for the default output folder.
        corpus: corpus name; shown in the "wordlist not found" notice.
        datafolder: root data folder; used for the default output folder.
        filename: explicit input file; if given, output goes next to it.
        MinimumStemLength: passed to ``MakeBiSignatures``.
        MaximumAffixLength: passed to ``MakeBiSignatures`` and
            ``MakeSigToStems``.
        MinimumNumberofSigUses: passed to ``MakeStemToWords`` and
            ``MakeSigToStems``.
        maxwordtokens: token limit, forwarded to the wordlist lookup and to
            ``ngrams.main``.
        use_corpus: if true and the wordlist is missing, ``ngrams.main`` is
            run (presumably to generate it -- confirm against ngrams.py);
            if false, the program exits instead.
    """
    print("\n*****************************************************\n"
          "Running the lxa5.py program now...\n")

    # -------------------------------------------------------------------------#
    #       decide suffixing or prefixing
    # -------------------------------------------------------------------------#

    # Informational only: nothing below reads suffix_languages, because any
    # language not listed in prefix_languages is treated as suffixing.
    suffix_languages = {
        "english", "french", "hungarian", "turkish", "russian", "german",
        "spanish", 'test'
    }
    prefix_languages = {"swahili"}

    # True -> suffixal analysis, False -> prefixal analysis
    FindSuffixesFlag = str(language).casefold() not in prefix_languages

    wordlist_path, corpus_stem = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if not use_corpus:
            # The message needs a placeholder, otherwise the offending path
            # is silently dropped by str.format().
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

        warning = " ({} tokens)".format(maxwordtokens) if maxwordtokens else ""
        print("\nWordlist for {}{} not found.\n"
              "ngrams.py is now run.\n".format(corpus, warning))
        ngrams.main(language=language,
                    corpus=corpus,
                    datafolder=datafolder,
                    filename=filename,
                    maxwordtokens=maxwordtokens)

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())

    if filename:
        outfolder = Path(Path(filename).parent, "lxa")
    else:
        outfolder = Path(datafolder, language, 'lxa')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # TODO -- filenames not yet used in main()
    # Built with Path so the folder and file name are joined with a
    # separator (plain string concatenation glued them together).
    outfile_Signatures_name = Path(outfolder, corpus_stem + "_Signatures.txt")
    outfile_SigTransforms_name = Path(outfolder,
                                      corpus_stem + "_SigTransforms.txt")
    outfile_FSA_name = Path(outfolder, corpus_stem + "_FSA.txt")
    outfile_FSA_graphics_name = Path(outfolder,
                                     corpus_stem + "_FSA_graphics.png")

    # -------------------------------------------------------------------------#
    #   create: BisigToTuple
    #                  (key: tuple of bisig | value: set of (stem, word1, word2)
    #           StemToWords (key: stem | value: set of words)
    #           SigToStems  (key: tuple of sig | value: set of stems )
    #           StemToSig   (key: str of stem  | value: tuple of sig )
    #           WordToSigs  (key: str of word  | value: set of sigs )
    #           AffixToSigs (key: str of affix | value: set of sigs )
    # -------------------------------------------------------------------------#

    BisigToTuple = MakeBiSignatures(wordlist, MinimumStemLength,
                                    MaximumAffixLength, FindSuffixesFlag)
    print("BisigToTuple ready", flush=True)

    StemToWords = MakeStemToWords(BisigToTuple, MinimumNumberofSigUses)
    print("StemToWords ready", flush=True)

    SigToStems = MakeSigToStems(StemToWords, MaximumAffixLength,
                                MinimumNumberofSigUses, FindSuffixesFlag)
    print("SigToStems ready", flush=True)

    StemToSig = MakeStemToSig(SigToStems)
    print("StemToSig ready", flush=True)

    WordToSigs = MakeWordToSigs(StemToWords, StemToSig)
    print("WordToSigs ready", flush=True)

    WordToSigtransforms = MakeWordToSigtransforms(WordToSigs)
    print("WordToSigtransforms ready", flush=True)

    AffixToSigs = MakeAffixToSigs(SigToStems)
    print("AffixToSigs ready", flush=True)

    # -------------------------------------------------------------------------#
    #   generate graphs for several dicts
    # -------------------------------------------------------------------------#
    #    GenerateGraphFromDict(StemToWords, outfolder, 'StemToWords.gexf')
    #    GenerateGraphFromDict(SigToStems, outfolder, 'SigToStems.gexf')
    #    GenerateGraphFromDict(WordToSigs, outfolder, 'WordToSigs.gexf')
    #    GenerateGraphFromDict(StemToSig, outfolder, 'StemToSig.gexf')
    # -------------------------------------------------------------------------#

    # -------------------------------------------------------------------------#
    #      output stem file
    # -------------------------------------------------------------------------#

    stemfilename = Path(outfolder, '{}_StemToWords.txt'.format(corpus_stem))
    OutputLargeDict(stemfilename,
                    StemToWords,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    howmanyperline=5)

    print('===> stem file generated:', stemfilename, flush=True)

    # -------------------------------------------------------------------------#
    #      output affix file
    # -------------------------------------------------------------------------#

    affixfilename = Path(outfolder, '{}_AffixToSigs.txt'.format(corpus_stem))
    OutputLargeDict(affixfilename,
                    AffixToSigs,
                    min_cell_width=25,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureValues=True)
    print('===> affix file generated:', affixfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output SigToStems
    # -------------------------------------------------------------------------#

    SigToStems_outfilename = Path(outfolder, corpus_stem + "_SigToStems.txt")
    OutputLargeDict(SigToStems_outfilename,
                    SigToStems,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureKeys=True)

    # JSON files are written inside `with` blocks so that each handle is
    # flushed and closed as soon as the dump finishes.
    SigToStems_outfilename_json = changeFilenameSuffix(SigToStems_outfilename,
                                                       ".json")
    with SigToStems_outfilename_json.open("w") as f:
        json_pdump(SigToStems,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)

    print('===> output file generated:', SigToStems_outfilename, flush=True)
    print('===> output file generated:',
          SigToStems_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigs
    # -------------------------------------------------------------------------#

    WordToSigs_outfilename = Path(outfolder, corpus_stem + "_WordToSigs.txt")
    OutputLargeDict(WordToSigs_outfilename,
                    WordToSigs,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    SignatureValues=True)

    WordToSigs_outfilename_json = changeFilenameSuffix(WordToSigs_outfilename,
                                                       ".json")
    with WordToSigs_outfilename_json.open("w") as f:
        json_pdump(WordToSigs,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)

    print('===> output file generated:', WordToSigs_outfilename, flush=True)
    print('===> output file generated:',
          WordToSigs_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigtransforms
    # -------------------------------------------------------------------------#

    WordToSigtransforms_outfilename = Path(
        outfolder, corpus_stem + "_WordToSigtransforms.txt")
    OutputLargeDict(WordToSigtransforms_outfilename,
                    WordToSigtransforms,
                    min_cell_width=25,
                    sigtransforms=True,
                    key=lambda x: len(x[1]),
                    reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename,
          flush=True)

    WordToSigtransforms_outfilename_json = changeFilenameSuffix(
        WordToSigtransforms_outfilename, ".json")
    with WordToSigtransforms_outfilename_json.open("w") as f:
        json_pdump(WordToSigtransforms,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the most freq word types not in any induced paradigms {the, of..}
    # -------------------------------------------------------------------------#

    wordFreqDict_sorted = sorted_alphabetized(wordFreqDict.items(),
                                              key=lambda x: x[1],
                                              reverse=True)

    mostFreqWordsNotInSigs_outfilename = Path(
        outfolder, corpus_stem + "_mostFreqWordsNotInSigs.txt")

    with mostFreqWordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            # Only the leading run of words outside WordToSigs is written:
            # stop at the first word that does appear in WordToSigs.
            if word in WordToSigs:
                break
            print(word, freq, file=f)

    print('===> output file generated:',
          mostFreqWordsNotInSigs_outfilename,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types in induced paradigms
    # -------------------------------------------------------------------------#

    WordsInSigs_outfilename = Path(outfolder, corpus_stem + "_WordsInSigs.txt")

    with WordsInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:', WordsInSigs_outfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types NOT in induced paradigms
    # -------------------------------------------------------------------------#

    WordsNotInSigs_outfilename = Path(outfolder,
                                      corpus_stem + "_WordsNotInSigs.txt")

    with WordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:',
          WordsNotInSigs_outfilename,
          flush=True)