def output_WordToSharedContextsOfNeighbors(outfilenameSharedcontexts,
                                           WordToSharedContextsOfNeighbors,
                                           worddict, contextdict,
                                           nWordsForAnalysis):
    """Write, for each analyzed word, the contexts shared with its
    neighbors together with the neighbor words found in each context.

    outfilenameSharedcontexts -- Path of the output text file
    WordToSharedContextsOfNeighbors -- maps word index to a dict of
        {context index: collection of neighbor word indices}
    worddict -- maps word string to index (inverted here for output)
    contextdict -- maps context tuple to index (inverted here for output)
    nWordsForAnalysis -- number of word indices to iterate over
    """
    index_to_word = {idx: w for w, idx in worddict.items()}
    index_to_context = {idx: c for c, idx in contextdict.items()}

    with outfilenameSharedcontexts.open("w") as f:
        for word_idx in range(nWordsForAnalysis):
            context_map = WordToSharedContextsOfNeighbors[word_idx]  # a dict
            if not context_map:
                continue

            # Convert the dict into a list of (context index, neighbor
            # indices) pairs, most widely shared contexts first.
            context_pairs = sorted_alphabetized(
                context_map.items(),
                key=lambda x: len(x[1]), reverse=True,
                subkey=lambda x: x[1])

            print("{} {} ({})".format(word_idx + 1,
                                      index_to_word[word_idx],
                                      len(context_pairs)), file=f)

            for context_idx, neighbor_indices in context_pairs:
                context = " ".join(index_to_context[context_idx])
                neighbors = " ".join(index_to_word[i]
                                     for i in neighbor_indices)
                print(" {:20} | {}".format(context, neighbors), file=f)
            print(file=f)
def output_ImportantContextToWords(outfilename, ImportantContextToWords,
                                   contextdict, worddict):
    """Write the important contexts and the words attested in each.

    The file starts with a summary (one line per context: the context
    string padded to a common width, then its word count), followed by
    one detail section per non-empty context listing every word with its
    count.

    outfilename -- Path of the output text file
    ImportantContextToWords -- maps context index to {word index: count}
    contextdict -- maps context tuple to index (inverted here for output)
    worddict -- maps word string to index (inverted here for output)
    """
    index_to_context = {idx: c for c, idx in contextdict.items()}
    index_to_word = {idx: w for w, idx in worddict.items()}

    entries = sorted_alphabetized(ImportantContextToWords.items(),
                                  key=lambda x: len(x[1]), reverse=True)

    context_strings = [" ".join(index_to_context[context_index])
                       for context_index, _ in entries]
    max_key_length = max(len(s) for s in context_strings)
    word_count_dicts = [word_to_count for _, word_to_count in entries]

    with outfilename.open("w") as f:
        # summary block: one line per context
        for context_str, word_to_count in zip(context_strings,
                                              word_count_dicts):
            print("{} {}".format(context_str.ljust(max_key_length),
                                 len(word_to_count)), file=f)

        print(file=f)

        # detail sections, one per non-empty context
        for context_str, word_to_count in zip(context_strings,
                                              word_count_dicts):
            if not word_to_count:
                continue

            print("\n===============================================\n",
                  file=f)
            print("{} {}".format(context_str.ljust(max_key_length),
                                 len(word_to_count)), file=f)
            print(file=f)

            # "c" rather than "count" to avoid shadowing the common
            # sequence-method name
            words_sorted = sorted_alphabetized(word_to_count.items(),
                                               key=lambda x: x[1],
                                               reverse=True)
            max_word_length = max(len(index_to_word[word_no])
                                  for word_no, c in words_sorted)

            for word_no, c in words_sorted:
                print(" {} {}".format(
                    index_to_word[word_no].ljust(max_word_length), c),
                    file=f)
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):
    """Count phones, biphones, and triphones in a wordlist and write
    .txt and .json frequency files into a "phon" output folder.

    Either *filename* points at a wordlist file directly, or the
    wordlist is located from *language*, *corpus*, and *datafolder*
    (running ngrams.py first if it does not exist yet and *use_corpus*
    is true). *maxwordtokens* caps the corpus size used when the
    wordlist must be generated.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # BUG FIX: the message had no "{}" placeholder, so
            # .format(infilename) silently dropped the offending path.
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    biphoneDict = Counter()
    triphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        # stream the file instead of materializing it with readlines()
        for line in f:
            # BUG FIX: strip *before* testing. Lines from a file keep
            # their trailing newline, so the old "if not line" guard
            # never fired and a blank line crashed the unpacking below
            # with ValueError.
            line = line.strip().casefold()
            if not line or line.startswith("#"):
                continue

            phones, *rest = line.split()

            # second column, if present and numeric, is the frequency
            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries
            lenPhones = len(phones)

            # slide a trigram window over the phone string
            for i in range(lenPhones - 2):
                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq
                if i == 0:
                    # count the first two phones and the first biphone
                    # exactly once per word
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)
    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)
    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    # ----------------------------------------------------------------- #
    # generate .txt output files
    # ----------------------------------------------------------------- #
    with outfilenamePhones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(phonesSorted)), file=f)
        print("# token count: {}".format(str(sum(phoneDict.values()))),
              file=f)
        for (phone, freq) in phonesSorted:
            print(phone + sep + str(freq), file=f)

    with outfilenameBiphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(biphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(biphoneDict.values()))),
              file=f)
        for (biphone, freq) in biphonesSorted:
            print(biphone + sep + str(freq), file=f)

    with outfilenameTriphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(triphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(triphoneDict.values()))),
              file=f)
        for (triphone, freq) in triphonesSorted:
            print(triphone + sep + str(freq), file=f)

    # ----------------------------------------------------------------- #
    # generate .json output files
    # ----------------------------------------------------------------- #
    outfilenamePhones_json = changeFilenameSuffix(outfilenamePhones, '.json')
    with outfilenamePhones_json.open('w') as f:
        json_pdump(phoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameBiphones_json = changeFilenameSuffix(outfilenameBiphones,
                                                    '.json')
    with outfilenameBiphones_json.open('w') as f:
        json_pdump(biphoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameTriphones_json = changeFilenameSuffix(outfilenameTriphones,
                                                     '.json')
    with outfilenameTriphones_json.open('w') as f:
        json_pdump(triphoneDict, f, key=lambda x: x[1], reverse=True)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                outfilenamePhones_json, outfilenameBiphones_json,
                outfilenameTriphones_json)
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):
    """Count phones, biphones, and triphones in a wordlist and write
    .txt and .json frequency files into a "phon" output folder.

    Either *filename* points at a wordlist file directly, or the
    wordlist is located from *language*, *corpus*, and *datafolder*
    (running ngrams.py first if it does not exist yet and *use_corpus*
    is true). *maxwordtokens* caps the corpus size used when the
    wordlist must be generated.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # BUG FIX: the message had no "{}" placeholder, so
            # .format(infilename) silently dropped the offending path.
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    biphoneDict = Counter()
    triphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        # stream the file instead of materializing it with readlines()
        for line in f:
            # BUG FIX: strip *before* testing. Lines from a file keep
            # their trailing newline, so the old "if not line" guard
            # never fired and a blank line crashed the unpacking below
            # with ValueError.
            line = line.strip().casefold()
            if not line or line.startswith("#"):
                continue

            phones, *rest = line.split()

            # second column, if present and numeric, is the frequency
            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries
            lenPhones = len(phones)

            # slide a trigram window over the phone string
            for i in range(lenPhones - 2):
                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq
                if i == 0:
                    # count the first two phones and the first biphone
                    # exactly once per word
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)
    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)
    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    # ----------------------------------------------------------------- #
    # generate .txt output files
    # ----------------------------------------------------------------- #
    with outfilenamePhones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(phonesSorted)), file=f)
        print("# token count: {}".format(str(sum(phoneDict.values()))),
              file=f)
        for (phone, freq) in phonesSorted:
            print(phone + sep + str(freq), file=f)

    with outfilenameBiphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(biphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(biphoneDict.values()))),
              file=f)
        for (biphone, freq) in biphonesSorted:
            print(biphone + sep + str(freq), file=f)

    with outfilenameTriphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(triphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(triphoneDict.values()))),
              file=f)
        for (triphone, freq) in triphonesSorted:
            print(triphone + sep + str(freq), file=f)

    # ----------------------------------------------------------------- #
    # generate .json output files
    # ----------------------------------------------------------------- #
    outfilenamePhones_json = changeFilenameSuffix(outfilenamePhones, '.json')
    with outfilenamePhones_json.open('w') as f:
        json_pdump(phoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameBiphones_json = changeFilenameSuffix(outfilenameBiphones,
                                                    '.json')
    with outfilenameBiphones_json.open('w') as f:
        json_pdump(biphoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameTriphones_json = changeFilenameSuffix(outfilenameTriphones,
                                                     '.json')
    with outfilenameTriphones_json.open('w') as f:
        json_pdump(triphoneDict, f, key=lambda x: x[1], reverse=True)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                outfilenamePhones_json, outfilenameBiphones_json,
                outfilenameTriphones_json)
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0):
    """Count word, bigram, and trigram frequencies in a corpus and write
    .txt, .json, and .dx1 output files.

    Either *filename* points at a corpus file directly, or the corpus is
    located from *language*, *corpus*, and *datafolder*. When
    *maxwordtokens* is nonzero, reading stops once the running token
    count exceeds it.
    """
    print("\n*****************************************************\n"
          "Running the ngrams.py program now...\n")

    if filename:
        infilename = Path(filename)
        outfolder = Path(infilename.parent, "ngrams")
        outfolderDx1 = Path(infilename.parent, "dx1")
        corpus = infilename.name
    else:
        infilename = Path(datafolder, language, corpus)
        outfolder = Path(datafolder, language, "ngrams")
        outfolderDx1 = Path(datafolder, language, "dx1")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)
    if not outfolderDx1.exists():
        outfolderDx1.mkdir(parents=True)

    if maxwordtokens:
        corpusName = Path(corpus).stem + "_{}-tokens".format(maxwordtokens)
    else:
        corpusName = Path(corpus).stem

    outfilenameWords = Path(outfolder, corpusName + "_words.txt")
    outfilenameBigrams = Path(outfolder, corpusName + "_bigrams.txt")
    outfilenameTrigrams = Path(outfolder, corpusName + "_trigrams.txt")
    outfilenameDx1 = Path(outfolderDx1, corpusName + ".dx1")

    wordDict = Counter()
    bigramDict = Counter()
    trigramDict = Counter()
    sep = "\t"
    corpusCurrentSize = 0  # running word token count

    # Pad punctuation with spaces so each mark tokenizes as its own
    # "word" -- one C-level pass instead of eight chained .replace calls.
    # TODO: modify/combine these with "scrubbing", cf. Alchemist and Lxa4
    punct_padding = str.maketrans({c: " {} ".format(c) for c in ".,;!?:()"})

    print('Reading the corpus file now...')

    with infilename.open() as f:
        # stream the file instead of materializing it with readlines()
        for line in f:
            # BUG FIX: strip *before* testing emptiness; lines from a
            # file keep their trailing newline, so the old guard never
            # fired.
            line = line.strip().casefold()
            if not line:
                continue

            line = line.translate(punct_padding)

            words = line.split()
            lenWords = len(words)
            corpusCurrentSize += lenWords

            # BUG FIX: the trigram window below is empty for lines with
            # fewer than 3 tokens, so their words were counted in the
            # token total but never entered wordDict. Handle them here.
            if lenWords == 1:
                wordDict[words[0]] += 1
            elif lenWords == 2:
                wordDict[words[0]] += 1
                wordDict[words[1]] += 1
                bigramDict[words[0] + sep + words[1]] += 1

            # slide a trigram window over the line
            for i in range(lenWords - 2):
                word1 = words[i]
                word2 = words[i + 1]
                word3 = words[i + 2]

                wordDict[word3] += 1
                if i == 0:
                    # count the first two words and the first bigram
                    # exactly once per line
                    wordDict[word1] += 1
                    wordDict[word2] += 1
                    bigram = word1 + sep + word2
                    bigramDict[bigram] += 1

                bigram = word2 + sep + word3
                trigram = word1 + sep + word2 + sep + word3

                trigramDict[trigram] += 1
                bigramDict[bigram] += 1

            if maxwordtokens and corpusCurrentSize > maxwordtokens:
                break

    print("\nCompleted counting words, bigrams, and trigrams.")
    print("Token count: {}".format(corpusCurrentSize))

    intro_string = "# data source: {}\n# token count: {}".format(
        str(infilename), corpusCurrentSize)

    wordsSorted = sorted_alphabetized(wordDict.items(),
                                      key=lambda x: x[1], reverse=True)
    bigramsSorted = sorted_alphabetized(bigramDict.items(),
                                        key=lambda x: x[1], reverse=True)
    trigramsSorted = sorted_alphabetized(trigramDict.items(),
                                         key=lambda x: x[1], reverse=True)

    # print txt outputs
    with outfilenameWords.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(wordsSorted)), file=f)
        for (word, freq) in wordsSorted:
            print(word + sep + str(freq), file=f)

    with outfilenameBigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(bigramsSorted)), file=f)
        for (bigram, freq) in bigramsSorted:
            print(bigram + sep + str(freq), file=f)

    with outfilenameTrigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(trigramsSorted)), file=f)
        for (trigram, freq) in trigramsSorted:
            print(trigram + sep + str(freq), file=f)

    # print dx1 output: word, frequency, then the word spelled out
    with outfilenameDx1.open('w') as f:
        for (word, freq) in wordsSorted:
            print(word, freq, ' '.join(word), file=f)

    # print json outputs
    with changeFilenameSuffix(outfilenameWords, ".json").open('w') as f:
        json_pdump(dict(wordsSorted), f)

    with changeFilenameSuffix(outfilenameBigrams, ".json").open('w') as f:
        json_pdump(dict(bigramsSorted), f)

    with changeFilenameSuffix(outfilenameTrigrams, ".json").open('w') as f:
        json_pdump(dict(trigramsSorted), f)

    print('wordlist, bigram and trigram files ready')
    print('dx1 file ready')

    stdout_list("Output files:",
                outfilenameWords, outfilenameBigrams, outfilenameTrigrams,
                outfilenameDx1,
                changeFilenameSuffix(outfilenameWords, ".json"),
                changeFilenameSuffix(outfilenameBigrams, ".json"),
                changeFilenameSuffix(outfilenameTrigrams, ".json"))
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0):
    """Count word, bigram, and trigram frequencies in a corpus and write
    .txt, .json, and .dx1 output files.

    Either *filename* points at a corpus file directly, or the corpus is
    located from *language*, *corpus*, and *datafolder*. When
    *maxwordtokens* is nonzero, reading stops once the running token
    count exceeds it.
    """
    print("\n*****************************************************\n"
          "Running the ngrams.py program now...\n")

    if filename:
        infilename = Path(filename)
        outfolder = Path(infilename.parent, "ngrams")
        outfolderDx1 = Path(infilename.parent, "dx1")
        corpus = infilename.name
    else:
        infilename = Path(datafolder, language, corpus)
        outfolder = Path(datafolder, language, "ngrams")
        outfolderDx1 = Path(datafolder, language, "dx1")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)
    if not outfolderDx1.exists():
        outfolderDx1.mkdir(parents=True)

    if maxwordtokens:
        corpusName = Path(corpus).stem + "_{}-tokens".format(maxwordtokens)
    else:
        corpusName = Path(corpus).stem

    outfilenameWords = Path(outfolder, corpusName + "_words.txt")
    outfilenameBigrams = Path(outfolder, corpusName + "_bigrams.txt")
    outfilenameTrigrams = Path(outfolder, corpusName + "_trigrams.txt")
    outfilenameDx1 = Path(outfolderDx1, corpusName + ".dx1")

    wordDict = Counter()
    bigramDict = Counter()
    trigramDict = Counter()
    sep = "\t"
    corpusCurrentSize = 0  # running word token count

    # Pad punctuation with spaces so each mark tokenizes as its own
    # "word" -- one C-level pass instead of eight chained .replace calls.
    # TODO: modify/combine these with "scrubbing", cf. Alchemist and Lxa4
    punct_padding = str.maketrans({c: " {} ".format(c) for c in ".,;!?:()"})

    print('Reading the corpus file now...')

    with infilename.open() as f:
        # stream the file instead of materializing it with readlines()
        for line in f:
            # BUG FIX: strip *before* testing emptiness; lines from a
            # file keep their trailing newline, so the old guard never
            # fired.
            line = line.strip().casefold()
            if not line:
                continue

            line = line.translate(punct_padding)

            words = line.split()
            lenWords = len(words)
            corpusCurrentSize += lenWords

            # BUG FIX: the trigram window below is empty for lines with
            # fewer than 3 tokens, so their words were counted in the
            # token total but never entered wordDict. Handle them here.
            if lenWords == 1:
                wordDict[words[0]] += 1
            elif lenWords == 2:
                wordDict[words[0]] += 1
                wordDict[words[1]] += 1
                bigramDict[words[0] + sep + words[1]] += 1

            # slide a trigram window over the line
            for i in range(lenWords - 2):
                word1 = words[i]
                word2 = words[i + 1]
                word3 = words[i + 2]

                wordDict[word3] += 1
                if i == 0:
                    # count the first two words and the first bigram
                    # exactly once per line
                    wordDict[word1] += 1
                    wordDict[word2] += 1
                    bigram = word1 + sep + word2
                    bigramDict[bigram] += 1

                bigram = word2 + sep + word3
                trigram = word1 + sep + word2 + sep + word3

                trigramDict[trigram] += 1
                bigramDict[bigram] += 1

            if maxwordtokens and corpusCurrentSize > maxwordtokens:
                break

    print("\nCompleted counting words, bigrams, and trigrams.")
    print("Token count: {}".format(corpusCurrentSize))

    intro_string = "# data source: {}\n# token count: {}".format(
        str(infilename), corpusCurrentSize)

    wordsSorted = sorted_alphabetized(wordDict.items(),
                                      key=lambda x: x[1], reverse=True)
    bigramsSorted = sorted_alphabetized(bigramDict.items(),
                                        key=lambda x: x[1], reverse=True)
    trigramsSorted = sorted_alphabetized(trigramDict.items(),
                                         key=lambda x: x[1], reverse=True)

    # print txt outputs
    with outfilenameWords.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(wordsSorted)), file=f)
        for (word, freq) in wordsSorted:
            print(word + sep + str(freq), file=f)

    with outfilenameBigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(bigramsSorted)), file=f)
        for (bigram, freq) in bigramsSorted:
            print(bigram + sep + str(freq), file=f)

    with outfilenameTrigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(trigramsSorted)), file=f)
        for (trigram, freq) in trigramsSorted:
            print(trigram + sep + str(freq), file=f)

    # print dx1 output: word, frequency, then the word spelled out
    with outfilenameDx1.open('w') as f:
        for (word, freq) in wordsSorted:
            print(word, freq, ' '.join(word), file=f)

    # print json outputs
    with changeFilenameSuffix(outfilenameWords, ".json").open('w') as f:
        json_pdump(dict(wordsSorted), f)

    with changeFilenameSuffix(outfilenameBigrams, ".json").open('w') as f:
        json_pdump(dict(bigramsSorted), f)

    with changeFilenameSuffix(outfilenameTrigrams, ".json").open('w') as f:
        json_pdump(dict(trigramsSorted), f)

    print('wordlist, bigram and trigram files ready')
    print('dx1 file ready')

    stdout_list("Output files:",
                outfilenameWords, outfilenameBigrams, outfilenameTrigrams,
                outfilenameDx1,
                changeFilenameSuffix(outfilenameWords, ".json"),
                changeFilenameSuffix(outfilenameBigrams, ".json"),
                changeFilenameSuffix(outfilenameTrigrams, ".json"))
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MaximumAffixLength=3, MinimumNumberofSigUses=5,
         maxwordtokens=0, use_corpus=True):
    """Run the Linguistica signature-learning pipeline and write results.

    Builds the chain of morphological data structures (bisignatures,
    stems, signatures, signature transforms, affixes) from a wordlist
    and writes one output file per structure into an "lxa" folder.

    Either *filename* points at a wordlist, or *language*/*corpus*/
    *datafolder* locate it (running ngrams.py first if it does not
    exist yet and *use_corpus* is true). The Minimum*/Maximum*
    parameters bound the morphological search.
    """
    print("\n*****************************************************\n"
          "Running the lxa5.py program now...\n")

    # ----------------------------------------------------------------- #
    # decide suffixing or prefixing
    # ----------------------------------------------------------------- #
    # suffix_languages is kept for documentation; only membership in
    # prefix_languages actually drives the decision (default: suffixal).
    suffix_languages = {"english", "french", "hungarian", "turkish",
                        "russian", "german", "spanish", 'test'}
    prefix_languages = {"swahili"}

    if str(language).casefold() in prefix_languages:
        FindSuffixesFlag = False  # prefixal
    else:
        FindSuffixesFlag = True  # suffixal

    wordlist_path, corpus_stem = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # BUG FIX: the message had no "{}" placeholder, so
            # .format(wordlist_path) silently dropped the path.
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())

    if filename:
        outfolder = Path(Path(filename).parent, "lxa")
    else:
        outfolder = Path(datafolder, language, 'lxa')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # TODO -- filenames not yet used in main()
    # BUG FIX: the old "str(outfolder) + corpus_stem" concatenation
    # dropped the path separator (".../lxamycorpus_Signatures.txt");
    # build real paths instead.
    outfile_Signatures_name = str(
        Path(outfolder, corpus_stem + "_Signatures.txt"))
    outfile_SigTransforms_name = str(
        Path(outfolder, corpus_stem + "_SigTransforms.txt"))
    outfile_FSA_name = str(Path(outfolder, corpus_stem + "_FSA.txt"))
    outfile_FSA_graphics_name = str(
        Path(outfolder, corpus_stem + "_FSA_graphics.png"))

    # ----------------------------------------------------------------- #
    # create: BisigToTuple
    #         (key: tuple of bisig | value: set of (stem, word1, word2)
    #         StemToWords  (key: stem         | value: set of words)
    #         SigToStems   (key: tuple of sig | value: set of stems)
    #         StemToSig    (key: str of stem  | value: tuple of sig)
    #         WordToSigs   (key: str of word  | value: set of sigs)
    #         AffixToSigs  (key: str of affix | value: set of sigs)
    # ----------------------------------------------------------------- #
    BisigToTuple = MakeBiSignatures(wordlist, MinimumStemLength,
                                    MaximumAffixLength, FindSuffixesFlag)
    print("BisigToTuple ready", flush=True)

    StemToWords = MakeStemToWords(BisigToTuple, MinimumNumberofSigUses)
    print("StemToWords ready", flush=True)

    SigToStems = MakeSigToStems(StemToWords, MaximumAffixLength,
                                MinimumNumberofSigUses, FindSuffixesFlag)
    print("SigToStems ready", flush=True)

    StemToSig = MakeStemToSig(SigToStems)
    print("StemToSig ready", flush=True)

    WordToSigs = MakeWordToSigs(StemToWords, StemToSig)
    print("WordToSigs ready", flush=True)

    WordToSigtransforms = MakeWordToSigtransforms(WordToSigs)
    print("WordToSigtransforms ready", flush=True)

    AffixToSigs = MakeAffixToSigs(SigToStems)
    print("AffixToSigs ready", flush=True)

    # ----------------------------------------------------------------- #
    # output stem file
    # ----------------------------------------------------------------- #
    stemfilename = Path(outfolder, '{}_StemToWords.txt'.format(corpus_stem))
    OutputLargeDict(stemfilename, StemToWords, key=lambda x: len(x[1]),
                    reverse=True, min_cell_width=25, howmanyperline=5)
    print('===> stem file generated:', stemfilename, flush=True)

    # ----------------------------------------------------------------- #
    # output affix file
    # ----------------------------------------------------------------- #
    affixfilename = Path(outfolder, '{}_AffixToSigs.txt'.format(corpus_stem))
    OutputLargeDict(affixfilename, AffixToSigs, min_cell_width=25,
                    key=lambda x: len(x[1]), reverse=True, howmanyperline=5,
                    SignatureValues=True)
    print('===> affix file generated:', affixfilename, flush=True)

    # ----------------------------------------------------------------- #
    # output SigToStems
    # ----------------------------------------------------------------- #
    SigToStems_outfilename = Path(outfolder, corpus_stem + "_SigToStems.txt")
    OutputLargeDict(SigToStems_outfilename, SigToStems,
                    key=lambda x: len(x[1]), reverse=True, howmanyperline=5,
                    SignatureKeys=True)
    SigToStems_outfilename_json = changeFilenameSuffix(SigToStems_outfilename,
                                                       ".json")
    # BUG FIX: open the json files via "with" -- the old code passed
    # ....open("w") directly into json_pdump and leaked the handles.
    with SigToStems_outfilename_json.open("w") as f:
        json_pdump(SigToStems, f, key=lambda x: len(x[1]), reverse=True)

    print('===> output file generated:', SigToStems_outfilename, flush=True)
    print('===> output file generated:', SigToStems_outfilename_json,
          flush=True)

    # ----------------------------------------------------------------- #
    # output WordToSigs
    # ----------------------------------------------------------------- #
    WordToSigs_outfilename = Path(outfolder, corpus_stem + "_WordToSigs.txt")
    OutputLargeDict(WordToSigs_outfilename, WordToSigs,
                    key=lambda x: len(x[1]), reverse=True, min_cell_width=25,
                    SignatureValues=True)
    WordToSigs_outfilename_json = changeFilenameSuffix(WordToSigs_outfilename,
                                                       ".json")
    with WordToSigs_outfilename_json.open("w") as f:
        json_pdump(WordToSigs, f, key=lambda x: len(x[1]), reverse=True)

    print('===> output file generated:', WordToSigs_outfilename, flush=True)
    print('===> output file generated:', WordToSigs_outfilename_json,
          flush=True)

    # ----------------------------------------------------------------- #
    # output WordToSigtransforms
    # ----------------------------------------------------------------- #
    WordToSigtransforms_outfilename = Path(
        outfolder, corpus_stem + "_WordToSigtransforms.txt")
    OutputLargeDict(WordToSigtransforms_outfilename, WordToSigtransforms,
                    min_cell_width=25, sigtransforms=True,
                    key=lambda x: len(x[1]), reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename, flush=True)

    WordToSigtransforms_outfilename_json = changeFilenameSuffix(
        WordToSigtransforms_outfilename, ".json")
    with WordToSigtransforms_outfilename_json.open("w") as f:
        json_pdump(WordToSigtransforms, f,
                   key=lambda x: len(x[1]), reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename_json, flush=True)

    # ----------------------------------------------------------------- #
    # output the most freq word types not in any induced paradigms
    # {the, of...}
    # ----------------------------------------------------------------- #
    wordFreqDict_sorted = sorted_alphabetized(wordFreqDict.items(),
                                              key=lambda x: x[1],
                                              reverse=True)

    mostFreqWordsNotInSigs_outfilename = Path(
        outfolder, corpus_stem + "_mostFreqWordsNotInSigs.txt")
    with mostFreqWordsNotInSigs_outfilename.open('w') as f:
        # stop at the first word that *is* in a signature: only the
        # prefix of the frequency list that escaped all paradigms is
        # wanted here
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)
            else:
                break
    print('===> output file generated:',
          mostFreqWordsNotInSigs_outfilename, flush=True)

    # ----------------------------------------------------------------- #
    # output the word types in induced paradigms
    # ----------------------------------------------------------------- #
    WordsInSigs_outfilename = Path(outfolder,
                                   corpus_stem + "_WordsInSigs.txt")
    with WordsInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word in WordToSigs:
                print(word, freq, file=f)
    print('===> output file generated:', WordsInSigs_outfilename, flush=True)

    # ----------------------------------------------------------------- #
    # output the word types NOT in induced paradigms
    # ----------------------------------------------------------------- #
    WordsNotInSigs_outfilename = Path(outfolder,
                                      corpus_stem + "_WordsNotInSigs.txt")
    with WordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)
    print('===> output file generated:', WordsNotInSigs_outfilename,
          flush=True)