Example 1
def output_WordToSharedContextsOfNeighbors(outfilenameSharedcontexts,
                                           WordToSharedContextsOfNeighbors,
                                           worddict, contextdict,
                                           nWordsForAnalysis):

    _worddict = {v:k for k,v in worddict.items()} # from index to word
    _contextdict = {v:k for k,v in contextdict.items()} # from index to context tuple

    with outfilenameSharedcontexts.open("w") as f:
        for word_idx in range(nWordsForAnalysis):

            ContextToNeighbors = WordToSharedContextsOfNeighbors[word_idx] # a dict

            if not ContextToNeighbors:
                continue

            ContextToNeighbors = sorted_alphabetized(
                ContextToNeighbors.items(),
                key=lambda x: len(x[1]), reverse=True,
                subkey=lambda x: x[1])

            # ContextToNeighbors is now a list of tuples, not a dict anymore

            word = _worddict[word_idx]

            print("{} {} ({})".format(word_idx+1, word,
                                      len(ContextToNeighbors)), file=f)

            for context_idx, neighbor_indices in ContextToNeighbors:
                context = " ".join(_contextdict[context_idx])
                neighbors = " ".join([_worddict[i] for i in neighbor_indices])

                print("          {:20} | {}".format(context, neighbors), file=f)

            print(file=f)
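
A note on the helper used throughout these examples: sorted_alphabetized is defined elsewhere in the project. A minimal sketch of its plausible behavior, assuming it sorts primarily by `key` and breaks ties alphabetically (optionally via `subkey`), relying on the stability of Python's sort:

def sorted_alphabetized(iterable, key=None, reverse=False, subkey=None):
    # Hypothetical sketch; the project's real helper may differ.
    if subkey is None:
        subkey = lambda x: x  # default tie-breaker: the item itself
    # Pre-sort by the tie-breaker; since Python's sort is stable, items
    # that compare equal under `key` keep this order in the final sort.
    return sorted(sorted(iterable, key=subkey), key=key, reverse=reverse)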
Example 2
def output_ImportantContextToWords(outfilename, ImportantContextToWords,
                                   contextdict, worddict):

    _contextdict = {v:k for k,v in contextdict.items()} # from index to context tuple
    _worddict = {v:k for k,v in worddict.items()} # from index to word
    ImportantContextToWords_sorted = sorted_alphabetized(
        ImportantContextToWords.items(),
        key=lambda x: len(x[1]), reverse=True)

    context_str_list = [" ".join(_contextdict[context_index])
                        for context_index, v in ImportantContextToWords_sorted]
    max_key_length = max(len(x) for x in context_str_list)

    WordToCount_list = [WordToCount for _, WordToCount in ImportantContextToWords_sorted]

    with outfilename.open("w") as f:
        for context_str, WordToCount in zip(context_str_list, WordToCount_list):
            print("{} {}".format(context_str.ljust(max_key_length),
                                 len(WordToCount)), file=f)
        print(file=f)

        for context_str, WordToCount in zip(context_str_list, WordToCount_list):
            if not WordToCount:
                continue

            print("\n===============================================\n", file=f)
            print("{} {}".format(context_str.ljust(max_key_length),
                                 len(WordToCount)), file=f)
            print(file=f)

            WordToCount_sorted = sorted_alphabetized(
                WordToCount.items(), key=lambda x: x[1], reverse=True)

            # don't use "count" as a variable (it's the name of a function in python)
            max_word_length = max([len(_worddict[word_no])
                                   for word_no, c in WordToCount_sorted])

            for word_no, c in WordToCount_sorted:
                print("        {} {}".format(
                          _worddict[word_no].ljust(max_word_length), c), file=f)
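
A toy invocation of output_ImportantContextToWords with hypothetical data (and assuming a sorted_alphabetized like the sketch above is in scope), to show the expected argument shapes:

from pathlib import Path

contextdict = {("the", "_"): 0}              # context tuple -> context index
worddict = {"cat": 0, "dog": 1}              # word -> word index
ImportantContextToWords = {0: {0: 7, 1: 3}}  # context index -> {word index: count}

output_ImportantContextToWords(Path("contexts.txt"),
                               ImportantContextToWords,
                               contextdict, worddict)
# contexts.txt lists "the _ 2" (two word types), then a block with
# "cat 7" and "dog 3".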
Example 3
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):

    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(language, corpus,
                                datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist ""\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for line in f:
            line = line.strip().casefold()

            # skip blank lines and comments (strip first: raw lines keep
            # their trailing "\n", so the emptiness check would otherwise
            # never fire, and a blank line would crash the unpacking below)
            if not line or line.startswith("#"):
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones) # add word boundaries
            lenPhones = len(phones)

            for i in range(lenPhones-2):

                phone1 = phones[i]
                phone2 = phones[i+1]
                phone3 = phones[i+2]

                phoneDict[phone3] += freq

                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)

    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)

    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    with outfilenamePhones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(phonesSorted)), file=f)
        print("# token count: {}".format(str(sum(phoneDict.values()))), file=f)
        for (phone, freq) in phonesSorted:
            print(phone + sep + str(freq), file=f)

    with outfilenameBiphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(biphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(biphoneDict.values()))),
                                                                        file=f)
        for (biphone, freq) in biphonesSorted:
            print(biphone + sep + str(freq), file=f)

    with outfilenameTriphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(triphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(triphoneDict.values()))),
                                                                        file=f)
        for (triphone, freq) in triphonesSorted:
            print(triphone + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    outfilenamePhones_json = changeFilenameSuffix(outfilenamePhones, '.json')
    with outfilenamePhones_json.open('w') as f:
        json_pdump(phoneDict, f, key=lambda x:x[1], reverse=True)

    outfilenameBiphones_json = changeFilenameSuffix(outfilenameBiphones, '.json')
    with outfilenameBiphones_json.open('w') as f:
        json_pdump(biphoneDict, f, key=lambda x:x[1], reverse=True)

    outfilenameTriphones_json = changeFilenameSuffix(outfilenameTriphones, '.json')
    with outfilenameTriphones_json.open('w') as f:
        json_pdump(triphoneDict, f, key=lambda x:x[1], reverse=True)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
        outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
        outfilenamePhones_json, outfilenameBiphones_json, outfilenameTriphones_json)
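
The heart of main() is the boundary-padded sliding window over each word. A self-contained illustration with toy frequencies and no file I/O:

from collections import Counter

phoneDict, biphoneDict, triphoneDict = Counter(), Counter(), Counter()
sep = "\t"

for word, freq in [("cat", 5), ("at", 2)]:  # toy wordlist
    phones = "#{}#".format(word)  # add word boundaries, as in main()
    for i in range(len(phones) - 2):
        p1, p2, p3 = phones[i], phones[i + 1], phones[i + 2]
        phoneDict[p3] += freq
        if i == 0:  # count the first two phones and the first biphone
            phoneDict[p1] += freq
            phoneDict[p2] += freq
            biphoneDict[p1 + sep + p2] += freq
        biphoneDict[p2 + sep + p3] += freq
        triphoneDict[p1 + sep + p2 + sep + p3] += freq

print(phoneDict["a"])                             # 7 (occurs in both words)
print(triphoneDict["#" + sep + "a" + sep + "t"])  # 2 (only in "at")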
Example 4
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0):

    print("\n*****************************************************\n"
          "Running the ngrams.py program now...\n")

    if filename:
        infilename = Path(filename)
        outfolder = Path(infilename.parent, "ngrams")
        outfolderDx1 = Path(infilename.parent, "dx1")
        corpus = infilename.name
    else:
        infilename = Path(datafolder, language, corpus)
        outfolder = Path(datafolder, language, "ngrams")
        outfolderDx1 = Path(datafolder, language, "dx1")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outfolderDx1.exists():
        outfolderDx1.mkdir(parents=True)

    if maxwordtokens:
        corpusName = Path(corpus).stem + "_{}-tokens".format(maxwordtokens)
    else:
        corpusName = Path(corpus).stem

    outfilenameWords = Path(outfolder, corpusName + "_words.txt")
    outfilenameBigrams = Path(outfolder, corpusName + "_bigrams.txt")
    outfilenameTrigrams = Path(outfolder, corpusName + "_trigrams.txt")
    outfilenameDx1 = Path(outfolderDx1, corpusName + ".dx1")

    wordDict = Counter()
    trigramDict = Counter()
    bigramDict = Counter()
    sep = "\t"
    corpusCurrentSize = 0 # running word token count

    print('Reading the corpus file now...')

    with infilename.open() as f:
        for line in f:
            line = line.strip().casefold()

            # skip blank lines (strip first: raw lines keep their trailing
            # "\n", so checking the unstripped line never fires)
            if not line:
                continue

            # TODO: modify/combine these with "scrubbing", cf. Alchemist and Lxa4
            line = line.replace(".", " . ")
            line = line.replace(",", " , ")
            line = line.replace(";", " ; ")
            line = line.replace("!", " ! ")
            line = line.replace("?", " ? ")
            line = line.replace(":", " : ")
            line = line.replace(")", " ) ")
            line = line.replace("(", " ( ")

            words = line.split()
            lenWords = len(words)

            corpusCurrentSize += lenWords

            for i in range(lenWords-2):

                word1 = words[i]
                word2 = words[i+1]
                word3 = words[i+2]

                wordDict[word3] += 1

                if i == 0:
                    wordDict[word1] += 1
                    wordDict[word2] += 1
                    bigram = word1 + sep + word2
                    bigramDict[bigram] += 1

                bigram = word2 + sep + word3
                trigram = word1 + sep + word2 + sep + word3

                trigramDict[trigram] += 1
                bigramDict[bigram] += 1

            if maxwordtokens and corpusCurrentSize > maxwordtokens:
                break

    print("\nCompleted counting words, bigrams, and trigrams.")
    print("Token count: {}".format(corpusCurrentSize))

    intro_string = "# data source: {}\n# token count: {}".format(str(infilename),
                                                                   corpusCurrentSize)

    wordsSorted = sorted_alphabetized(wordDict.items(),
                                      key=lambda x: x[1], reverse=True)

    bigramsSorted = sorted_alphabetized(bigramDict.items(),
                                        key=lambda x: x[1], reverse=True)

    trigramsSorted = sorted_alphabetized(trigramDict.items(),
                                         key=lambda x: x[1], reverse=True)

    # print txt outputs
    with outfilenameWords.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(wordsSorted)), file=f)
        for (word, freq) in wordsSorted:
            print(word + sep + str(freq), file=f)

    with outfilenameBigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(bigramsSorted)), file=f)
        for (bigram, freq) in bigramsSorted:
            print(bigram + sep + str(freq), file=f)

    with outfilenameTrigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(trigramsSorted)), file=f)
        for (trigram, freq) in trigramsSorted:
            print(trigram + sep + str(freq), file=f)

    # print dx1 output: each line is the word, its frequency, and the
    # word spelled out letter by letter
    with outfilenameDx1.open('w') as f:
        for (word, freq) in wordsSorted:
            print(word, freq, ' '.join(word), file=f)

    # print json outputs
    with changeFilenameSuffix(outfilenameWords, ".json").open('w') as f:
        json_pdump(dict(wordsSorted), f)

    with changeFilenameSuffix(outfilenameBigrams, ".json").open('w') as f:
        json_pdump(dict(bigramsSorted), f)

    with changeFilenameSuffix(outfilenameTrigrams, ".json").open('w') as f:
        json_pdump(dict(trigramsSorted), f)

    print('wordlist, bigram and trigram files ready')
    print('dx1 file ready')

    stdout_list("Output files:", outfilenameWords,
                outfilenameBigrams, outfilenameTrigrams, outfilenameDx1,
                changeFilenameSuffix(outfilenameWords, ".json"),
                changeFilenameSuffix(outfilenameBigrams, ".json"),
                changeFilenameSuffix(outfilenameTrigrams, ".json"))
Example 5
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MaximumAffixLength=3,
         MinimumNumberofSigUses=5,
         maxwordtokens=0,
         use_corpus=True):

    print("\n*****************************************************\n"
          "Running the lxa5.py program now...\n")

    # -------------------------------------------------------------------------#
    #       decide suffixing or prefixing
    # -------------------------------------------------------------------------#

    suffix_languages = {
        "english", "french", "hungarian", "turkish", "russian", "german",
        "spanish", 'test'
    }
    prefix_languages = {"swahili"}

    if str(language).casefold() in prefix_languages:
        FindSuffixesFlag = False  # prefixal
    else:
        FindSuffixesFlag = True  # suffixal

    wordlist_path, corpus_stem = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist "
                     "\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())

    if filename:
        outfolder = Path(Path(filename).parent, "lxa")
    else:
        outfolder = Path(datafolder, language, 'lxa')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # TODO -- filenames not yet used in main()
    outfile_Signatures_name = str(
        Path(outfolder, corpus_stem + "_Signatures.txt"))
    outfile_SigTransforms_name = str(
        Path(outfolder, corpus_stem + "_SigTransforms.txt"))
    outfile_FSA_name = str(Path(outfolder, corpus_stem + "_FSA.txt"))
    outfile_FSA_graphics_name = str(
        Path(outfolder, corpus_stem + "_FSA_graphics.png"))

    # -------------------------------------------------------------------------#
    #   create: BisigToTuple
    #                  (key: tuple of bisig | value: set of (stem, word1, word2))
    #           StemToWords (key: stem | value: set of words)
    #           SigToStems  (key: tuple of sig | value: set of stems )
    #           StemToSig   (key: str of stem  | value: tuple of sig )
    #           WordToSigs  (key: str of word  | value: set of sigs )
    #           AffixToSigs (key: str of affix | value: set of sigs )
    # -------------------------------------------------------------------------#

    BisigToTuple = MakeBiSignatures(wordlist, MinimumStemLength,
                                    MaximumAffixLength, FindSuffixesFlag)
    print("BisigToTuple ready", flush=True)

    StemToWords = MakeStemToWords(BisigToTuple, MinimumNumberofSigUses)
    print("StemToWords ready", flush=True)

    SigToStems = MakeSigToStems(StemToWords, MaximumAffixLength,
                                MinimumNumberofSigUses, FindSuffixesFlag)
    print("SigToStems ready", flush=True)

    StemToSig = MakeStemToSig(SigToStems)
    print("StemToSig ready", flush=True)

    WordToSigs = MakeWordToSigs(StemToWords, StemToSig)
    print("WordToSigs ready", flush=True)

    WordToSigtransforms = MakeWordToSigtransforms(WordToSigs)
    print("WordToSigtransforms ready", flush=True)

    AffixToSigs = MakeAffixToSigs(SigToStems)
    print("AffixToSigs ready", flush=True)

    # -------------------------------------------------------------------------#
    #   generate graphs for several dicts
    # -------------------------------------------------------------------------#
    #    GenerateGraphFromDict(StemToWords, outfolder, 'StemToWords.gexf')
    #    GenerateGraphFromDict(SigToStems, outfolder, 'SigToStems.gexf')
    #    GenerateGraphFromDict(WordToSigs, outfolder, 'WordToSigs.gexf')
    #    GenerateGraphFromDict(StemToSig, outfolder, 'StemToSig.gexf')
    # -------------------------------------------------------------------------#

    # -------------------------------------------------------------------------#
    #      output stem file
    # -------------------------------------------------------------------------#

    stemfilename = Path(outfolder, '{}_StemToWords.txt'.format(corpus_stem))
    OutputLargeDict(stemfilename,
                    StemToWords,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    howmanyperline=5)

    print('===> stem file generated:', stemfilename, flush=True)

    # -------------------------------------------------------------------------#
    #      output affix file
    # -------------------------------------------------------------------------#

    affixfilename = Path(outfolder, '{}_AffixToSigs.txt'.format(corpus_stem))
    OutputLargeDict(affixfilename,
                    AffixToSigs,
                    min_cell_width=25,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureValues=True)
    print('===> affix file generated:', affixfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output SigToStems
    # -------------------------------------------------------------------------#

    SigToStems_outfilename = Path(outfolder, corpus_stem + "_SigToStems.txt")
    OutputLargeDict(SigToStems_outfilename,
                    SigToStems,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureKeys=True)

    SigToStems_outfilename_json = changeFilenameSuffix(SigToStems_outfilename,
                                                       ".json")
    with SigToStems_outfilename_json.open("w") as f:
        json_pdump(SigToStems, f, key=lambda x: len(x[1]), reverse=True)

    print('===> output file generated:', SigToStems_outfilename, flush=True)
    print('===> output file generated:',
          SigToStems_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigs
    # -------------------------------------------------------------------------#

    WordToSigs_outfilename = Path(outfolder, corpus_stem + "_WordToSigs.txt")
    OutputLargeDict(WordToSigs_outfilename,
                    WordToSigs,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    SignatureValues=True)

    WordToSigs_outfilename_json = changeFilenameSuffix(WordToSigs_outfilename,
                                                       ".json")
    with WordToSigs_outfilename_json.open("w") as f:
        json_pdump(WordToSigs, f, key=lambda x: len(x[1]), reverse=True)

    print('===> output file generated:', WordToSigs_outfilename, flush=True)
    print('===> output file generated:',
          WordToSigs_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigtransforms
    # -------------------------------------------------------------------------#

    WordToSigtransforms_outfilename = Path(
        outfolder, corpus_stem + "_WordToSigtransforms.txt")
    OutputLargeDict(WordToSigtransforms_outfilename,
                    WordToSigtransforms,
                    min_cell_width=25,
                    sigtransforms=True,
                    key=lambda x: len(x[1]),
                    reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename,
          flush=True)

    WordToSigtransforms_outfilename_json = changeFilenameSuffix(
        WordToSigtransforms_outfilename, ".json")
    with WordToSigtransforms_outfilename_json.open("w") as f:
        json_pdump(WordToSigtransforms, f,
                   key=lambda x: len(x[1]), reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the most freq word types not in any induced paradigms {the, of..}
    # -------------------------------------------------------------------------#

    wordFreqDict_sorted = sorted_alphabetized(wordFreqDict.items(),
                                              key=lambda x: x[1],
                                              reverse=True)

    mostFreqWordsNotInSigs_outfilename = Path(
        outfolder, corpus_stem + "_mostFreqWordsNotInSigs.txt")

    with mostFreqWordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)
            else:
                # stop at the first word that is in a signature: only the
                # top of the frequency-sorted list is wanted here
                break

    print('===> output file generated:',
          mostFreqWordsNotInSigs_outfilename,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types in induced paradigms
    # -------------------------------------------------------------------------#

    WordsInSigs_outfilename = Path(outfolder, corpus_stem + "_WordsInSigs.txt")

    with WordsInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:', WordsInSigs_outfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types NOT in induced paradigms
    # -------------------------------------------------------------------------#

    WordsNotInSigs_outfilename = Path(outfolder,
                                      corpus_stem + "_WordsNotInSigs.txt")

    with WordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:',
          WordsNotInSigs_outfilename,
          flush=True)
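
The Make* helpers are defined elsewhere in the project, but the comment block near the top of main() documents their shapes (e.g. SigToStems: signature tuple -> set of stems; StemToSig: stem -> signature tuple). Under that reading, MakeStemToSig is plausibly a plain inversion; a hypothetical sketch:

def MakeStemToSig(SigToStems):
    # Invert SigToStems so each stem points back at its signature.
    # Assumes every stem belongs to exactly one signature, as the
    # documented value type (a single tuple) suggests.
    StemToSig = {}
    for sig, stems in SigToStems.items():
        for stem in stems:
            StemToSig[stem] = sig
    return StemToSig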