def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):

    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist \"{}\"\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, "phon")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        lines = f.readlines()

    for line in lines:
        if not line or line.startswith("#"):
            continue

        line = line.strip().casefold()
        phones, *rest = line.split()

        try:
            freq = int(rest[0])
        except (ValueError, IndexError):
            freq = 1

        phones = "#{}#".format(phones)  # add word boundaries
        lenPhones = len(phones)

        for i in range(lenPhones - 2):
            phone1 = phones[i]
            phone2 = phones[i + 1]
            phone3 = phones[i + 2]

            phoneDict[phone3] += freq

            if i == 0:
                phoneDict[phone1] += freq
                phoneDict[phone2] += freq
                biphone = phone1 + sep + phone2
                biphoneDict[biphone] += freq

            biphone = phone2 + sep + phone3
            triphone = phone1 + sep + phone2 + sep + phone3

            triphoneDict[triphone] += freq
            biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)
    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)
    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    with outfilenamePhones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(phonesSorted)), file=f)
        print("# token count: {}".format(sum(phoneDict.values())), file=f)
        for (phone, freq) in phonesSorted:
            print(phone + sep + str(freq), file=f)

    with outfilenameBiphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(biphonesSorted)), file=f)
        print("# token count: {}".format(sum(biphoneDict.values())), file=f)
        for (biphone, freq) in biphonesSorted:
            print(biphone + sep + str(freq), file=f)

    with outfilenameTriphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(triphonesSorted)), file=f)
        print("# token count: {}".format(sum(triphoneDict.values())), file=f)
        for (triphone, freq) in triphonesSorted:
            print(triphone + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    outfilenamePhones_json = changeFilenameSuffix(outfilenamePhones, '.json')
    with outfilenamePhones_json.open('w') as f:
        json_pdump(phoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameBiphones_json = changeFilenameSuffix(outfilenameBiphones, '.json')
    with outfilenameBiphones_json.open('w') as f:
        json_pdump(biphoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameTriphones_json = changeFilenameSuffix(outfilenameTriphones, '.json')
    with outfilenameTriphones_json.open('w') as f:
        json_pdump(triphoneDict, f, key=lambda x: x[1], reverse=True)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                outfilenamePhones_json, outfilenameBiphones_json,
                outfilenameTriphones_json)
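#--------------------------------------------------------------------------#
# A minimal, self-contained sketch (not called anywhere above; the function
# name is illustrative only) of the sliding-window count used in main():
# each three-phone window contributes its final phone, final biphone, and
# its triphone, so the first window (i == 0) must also contribute the
# word-initial phones and biphone that no later window covers.
#--------------------------------------------------------------------------#

def _demo_count_phones(word, freq=1, sep="\t"):
    from collections import Counter
    phones, biphones, triphones = Counter(), Counter(), Counter()
    s = "#{}#".format(word)  # add word boundaries, as in main()
    for i in range(len(s) - 2):
        p1, p2, p3 = s[i], s[i + 1], s[i + 2]
        phones[p3] += freq
        if i == 0:
            phones[p1] += freq
            phones[p2] += freq
            biphones[p1 + sep + p2] += freq
        biphones[p2 + sep + p3] += freq
        triphones[p1 + sep + p2 + sep + p3] += freq
    return phones, biphones, triphones

# _demo_count_phones("cat") counts phones {'#': 2, 'c': 1, 'a': 1, 't': 1},
# biphones #c, ca, at, t#, and triphones #ca, cat, at#.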
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtypes=1000, nNeighbors=9, nEigenvectors=11,
         create_WordToContexts=False, create_ContextToWords=False,
         mincontexts=3, usesigtransforms=True):

    print("\n*****************************************************\n"
          "Running the manifold.py program now...\n")

    if filename:
        corpusStem = Path(filename).stem
        infolder = Path(Path(filename).parent, 'ngrams')
        outfolder = Path(Path(filename).parent, 'neighbors')
        outcontextsfolder = Path(Path(filename).parent, 'word_contexts')
    else:
        corpusStem = Path(corpus).stem
        infolder = Path(datafolder, language, 'ngrams')
        outfolder = Path(datafolder, language, 'neighbors')
        outcontextsfolder = Path(datafolder, language, 'word_contexts')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outcontextsfolder.exists():
        outcontextsfolder.mkdir(parents=True)

    infileWordsname = Path(infolder, corpusStem + '_words.txt')
    infileBigramsname = Path(infolder, corpusStem + '_bigrams.txt')
    infileTrigramsname = Path(infolder, corpusStem + '_trigrams.txt')

    if (not infileWordsname.exists()) or \
       (not infileBigramsname.exists()) or \
       (not infileTrigramsname.exists()):
        print("Error in locating n-gram data files.\n"
              "The program now creates them.\n")
        ngrams.main(language=language, corpus=corpus,
                    datafolder=datafolder, filename=filename)

    if usesigtransforms:
        if filename:
            infolderlxa = Path(Path(filename).parent, 'lxa')
        else:
            infolderlxa = Path(datafolder, language, 'lxa')

        sigtransform_json_fname = Path(infolderlxa,
                                       corpusStem + "_WordToSigtransforms.json")
        try:
            WordToSigtransforms = json_pload(sigtransform_json_fname.open())
        except FileNotFoundError:
            print("The file \"{}\" is not found.\n"
                  "The program now creates it.\n".format(sigtransform_json_fname))
            lxa5.main(language=language, corpus=corpus,
                      datafolder=datafolder, filename=filename)
            WordToSigtransforms = json_pload(sigtransform_json_fname.open())
        # WordToSigtransforms just read into the program; to be used soon...

    print('Reading word list...', flush=True)
    mywords = GetMyWords(infileWordsname, corpus)

    print("Word file is", infileWordsname, flush=True)
    print("Number of neighbors to find for each word type:", nNeighbors)
    print('Corpus has', len(mywords), 'word types', flush=True)

    lenMywords = len(mywords)
    if lenMywords > maxwordtypes:
        nWordsForAnalysis = maxwordtypes
    else:
        nWordsForAnalysis = lenMywords
    print('number of words for analysis adjusted to', nWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:nWordsForAnalysis]
    # same mapping as {w: analyzedwordlist.index(w) ...}, but in linear time
    worddict = {w: i for i, w in enumerate(analyzedwordlist)}

    corpusName = corpusStem + '_' + str(nWordsForAnalysis) + \
                 '_' + str(nNeighbors)

    outfilenameNeighbors = Path(outfolder, corpusName + "_neighbors.txt")
    outfilenameSharedcontexts = Path(outfolder,
                                     corpusName + "_shared_contexts.txt")
    outfilenameNeighborGraph = Path(outfolder, corpusName + "_neighbors.gexf")
    outfilenameImportantContextToWords = Path(
        outfolder, corpusName + "_ImportantContextToWords.txt")
    outWordToContexts_json = Path(outcontextsfolder,
                                  corpusName + "_WordToContexts.json")
    outContextToWords_json = Path(outcontextsfolder,
                                  corpusName + "_ContextToWords.json")

    print("Reading bigrams/trigrams and computing context array...",
          flush=True)

    context_array, contextdict, \
        WordToContexts, ContextToWords = GetContextArray(nWordsForAnalysis,
                                                         worddict,
                                                         infileBigramsname,
                                                         infileTrigramsname,
                                                         mincontexts)

    print("Computing shared context master matrix...", flush=True)
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    print("Computing diameter...", flush=True)
    Diameter = Normalize(nWordsForAnalysis, CountOfSharedContexts)

    print("Computing incidence graph...", flush=True)
    incidencegraph = compute_incidence_graph(nWordsForAnalysis, Diameter,
                                             CountOfSharedContexts)
    del CountOfSharedContexts

    print("Computing mylaplacian...", flush=True)
    mylaplacian = compute_laplacian(nWordsForAnalysis, Diameter,
                                    incidencegraph)
    del Diameter
    del incidencegraph

    print("Computing eigenvectors...", flush=True)
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    print('Computing distances between words...', flush=True)
    # take first N columns of eigenvector matrix
    coordinates = myeigenvectors[:, :nEigenvectors]
    wordsdistance = compute_words_distance(nWordsForAnalysis, coordinates)
    del coordinates

    print('Computing nearest neighbors now...', flush=True)
    closestNeighbors = compute_closest_neighbors(wordsdistance, nNeighbors)

    WordToNeighbors_by_str = OrderedDict()
    WordToNeighbors = dict()

    for wordno in range(nWordsForAnalysis):
        line = closestNeighbors[wordno]
        word_idx, neighbors_idx = line[0], line[1:]
        word = analyzedwordlist[word_idx]
        neighbors = [analyzedwordlist[idx] for idx in neighbors_idx]
        WordToNeighbors_by_str[word] = neighbors
        WordToNeighbors[word_idx] = neighbors_idx

    del closestNeighbors

    with outfilenameNeighbors.open('w') as f:
        print("# language: {}\n# corpus: {}\n"
              "# Number of word types analyzed: {}\n"
              "# Number of neighbors: {}\n".format(language, corpus,
                                                   nWordsForAnalysis,
                                                   nNeighbors), file=f)
        for word, neighbors in WordToNeighbors_by_str.items():
            print(word, " ".join(neighbors), file=f)

    neighbor_graph = GetMyGraph(WordToNeighbors_by_str)

    # output manifold as gexf data file
    nx.write_gexf(neighbor_graph, str(outfilenameNeighborGraph))

    # output manifold as json for d3 visualization
    manifold_json_data = json_graph.node_link_data(neighbor_graph)
    outfilenameManifoldJson = Path(outfolder, corpusName + "_manifold.json")
    json.dump(manifold_json_data, outfilenameManifoldJson.open("w"), indent=2)

    WordToNeighbors_json = changeFilenameSuffix(outfilenameNeighbors, ".json")
    json_pdump(WordToNeighbors_by_str, WordToNeighbors_json.open("w"),
               asis=True)

    print("Computing shared contexts among neighbors...", flush=True)
    WordToSharedContextsOfNeighbors, \
        ImportantContextToWords = compute_WordToSharedContextsOfNeighbors(
            nWordsForAnalysis, WordToContexts, WordToNeighbors, ContextToWords,
            nNeighbors, mincontexts)

    output_WordToSharedContextsOfNeighbors(outfilenameSharedcontexts,
                                           WordToSharedContextsOfNeighbors,
                                           worddict, contextdict,
                                           nWordsForAnalysis)

    output_ImportantContextToWords(outfilenameImportantContextToWords,
                                   ImportantContextToWords,
                                   contextdict, worddict)

    outputfilelist = [outfilenameNeighbors,
                      outfilenameNeighborGraph,
                      WordToNeighbors_json,
                      outfilenameSharedcontexts,
                      outfilenameImportantContextToWords,
                      outfilenameManifoldJson]

    if create_WordToContexts:
        outputfilelist.append(outWordToContexts_json)
        json_pdump(WordToContexts, outWordToContexts_json.open("w"),
                   key=lambda x: len(x[1]), reverse=True)

    if create_ContextToWords:
        outputfilelist.append(outContextToWords_json)
        json_pdump(ContextToWords, outContextToWords_json.open("w"),
                   key=lambda x: len(x[1]), reverse=True)

    stdout_list("Output files:", *outputfilelist)
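#--------------------------------------------------------------------------#
# A minimal numpy sketch of the pipeline main() runs above: shared-context
# counts from a words-by-contexts matrix, a graph Laplacian, a spectral
# embedding from the leading eigenvectors, and Euclidean nearest neighbors.
# It is an illustration under simplified assumptions (dense arrays, an
# unnormalized Laplacian, zeroed diagonal), not the module's own Normalize /
# compute_laplacian / GetEigenvectors implementations; the function name is
# illustrative only.
#--------------------------------------------------------------------------#

def _demo_spectral_neighbors(word_context_matrix,
                             n_eigenvectors=2, n_neighbors=2):
    import numpy as np
    C = np.asarray(word_context_matrix, dtype=float)
    shared = C.dot(C.T)                    # counts of shared contexts
    np.fill_diagonal(shared, 0)            # ignore contexts shared with self
    D = np.diag(shared.sum(axis=1))        # degree matrix
    L = D - shared                         # unnormalized graph Laplacian
    eigenvalues, eigenvectors = np.linalg.eigh(L)   # ascending eigenvalues
    coordinates = eigenvectors[:, :n_eigenvectors]  # first N columns
    diff = coordinates[:, None, :] - coordinates[None, :, :]
    distance = np.sqrt((diff ** 2).sum(axis=-1))    # pairwise distances
    # for each word, indices of its nearest neighbors; position 0 of each
    # sorted row is the word itself (distance 0) and is skipped
    return [np.argsort(row)[1:n_neighbors + 1].tolist() for row in distance]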
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MinimumAffixLength=1, SF_threshold=3,
         maxwordtokens=0, use_corpus=True):

    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    # read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)
    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist \"{}\"\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    # output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")
    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")
    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    # Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)
    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    # Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    # Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)
    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    json_pdump(successors, outfile_SF_name_json.open("w"))

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    json_pdump(predecessors, outfile_PF_name_json.open("w"))

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    # Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)
    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name, reversedwordlist, WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    json_pdump(WordsBrokenLtoR, outfile_trieLtoR_name_json.open("w"))

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    json_pdump(WordsBrokenRtoL, outfile_trieRtoL_name_json.open("w"))

    stdout_list("Output files:",
                outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name,
                outfile_SF_name_json, outfile_PF_name_json,
                outfile_trieLtoR_name_json, outfile_trieRtoL_name_json)
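#--------------------------------------------------------------------------#
# A minimal sketch of the successor-frequency idea behind findBreaksInWords
# and GetSuccessors above: for each prefix of each word, collect the distinct
# letters that can follow it anywhere in the wordlist. A prefix with many
# successors is a candidate stem boundary; running the same procedure on
# reversed words yields predecessor frequencies. The helper name is
# illustrative only, not part of this module.
#--------------------------------------------------------------------------#

def _demo_successor_frequencies(wordlist, min_stem_length=2):
    successors = {}
    for word in wordlist:
        for i in range(min_stem_length, len(word)):
            prefix, next_letter = word[:i], word[i]
            successors.setdefault(prefix, set()).add(next_letter)
    return successors

# For ["jump", "jumped", "jumping", "jumps"], the prefix "jump" has the
# successors {"e", "i", "s"}: a spike that suggests a break after "jump".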
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0):

    print("\n*****************************************************\n"
          "Running the ngrams.py program now...\n")

    if filename:
        infilename = Path(filename)
        outfolder = Path(infilename.parent, "ngrams")
        outfolderDx1 = Path(infilename.parent, "dx1")
        corpus = infilename.name
    else:
        infilename = Path(datafolder, language, corpus)
        outfolder = Path(datafolder, language, "ngrams")
        outfolderDx1 = Path(datafolder, language, "dx1")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outfolderDx1.exists():
        outfolderDx1.mkdir(parents=True)

    if maxwordtokens:
        corpusName = Path(corpus).stem + "_{}-tokens".format(maxwordtokens)
    else:
        corpusName = Path(corpus).stem

    outfilenameWords = Path(outfolder, corpusName + "_words.txt")
    outfilenameBigrams = Path(outfolder, corpusName + "_bigrams.txt")
    outfilenameTrigrams = Path(outfolder, corpusName + "_trigrams.txt")
    outfilenameDx1 = Path(outfolderDx1, corpusName + ".dx1")

    wordDict = Counter()
    trigramDict = Counter()
    bigramDict = Counter()
    sep = "\t"
    corpusCurrentSize = 0  # running word token count

    print('Reading the corpus file now...')

    with infilename.open() as f:
        for line in f:
            if not line:
                continue

            line = line.strip().casefold()

            # TODO: modify/combine these with "scrubbing",
            # cf. Alchemist and Lxa4
            line = line.replace(".", " . ")
            line = line.replace(",", " , ")
            line = line.replace(";", " ; ")
            line = line.replace("!", " ! ")
            line = line.replace("?", " ? ")
            line = line.replace(":", " : ")
            line = line.replace(")", " ) ")
            line = line.replace("(", " ( ")

            words = line.split()
            lenWords = len(words)
            corpusCurrentSize += lenWords

            for i in range(lenWords - 2):
                word1 = words[i]
                word2 = words[i + 1]
                word3 = words[i + 2]

                wordDict[word3] += 1

                if i == 0:
                    wordDict[word1] += 1
                    wordDict[word2] += 1
                    bigram = word1 + sep + word2
                    bigramDict[bigram] += 1

                bigram = word2 + sep + word3
                trigram = word1 + sep + word2 + sep + word3

                trigramDict[trigram] += 1
                bigramDict[bigram] += 1

            if maxwordtokens and corpusCurrentSize > maxwordtokens:
                break

    print("\nCompleted counting words, bigrams, and trigrams.")
    print("Token count: {}".format(corpusCurrentSize))

    intro_string = "# data source: {}\n# token count: {}".format(
        str(infilename), corpusCurrentSize)

    wordsSorted = sorted_alphabetized(wordDict.items(),
                                      key=lambda x: x[1], reverse=True)
    bigramsSorted = sorted_alphabetized(bigramDict.items(),
                                        key=lambda x: x[1], reverse=True)
    trigramsSorted = sorted_alphabetized(trigramDict.items(),
                                         key=lambda x: x[1], reverse=True)

    # print txt outputs
    with outfilenameWords.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(wordsSorted)), file=f)
        for (word, freq) in wordsSorted:
            print(word + sep + str(freq), file=f)

    with outfilenameBigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(bigramsSorted)), file=f)
        for (bigram, freq) in bigramsSorted:
            print(bigram + sep + str(freq), file=f)

    with outfilenameTrigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(trigramsSorted)), file=f)
        for (trigram, freq) in trigramsSorted:
            print(trigram + sep + str(freq), file=f)

    # print dx1 output; each line is "word freq w o r d"
    with outfilenameDx1.open('w') as f:
        for (word, freq) in wordsSorted:
            print(word, freq, ' '.join(word), file=f)

    # print json outputs
    with changeFilenameSuffix(outfilenameWords, ".json").open('w') as f:
        json_pdump(dict(wordsSorted), f)

    with changeFilenameSuffix(outfilenameBigrams, ".json").open('w') as f:
        json_pdump(dict(bigramsSorted), f)

    with changeFilenameSuffix(outfilenameTrigrams, ".json").open('w') as f:
        json_pdump(dict(trigramsSorted), f)

    print('wordlist, bigram and trigram files ready')
    print('dx1 file ready')

    stdout_list("Output files:",
                outfilenameWords, outfilenameBigrams, outfilenameTrigrams,
                outfilenameDx1,
                changeFilenameSuffix(outfilenameWords, ".json"),
                changeFilenameSuffix(outfilenameBigrams, ".json"),
                changeFilenameSuffix(outfilenameTrigrams, ".json"))
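#--------------------------------------------------------------------------#
# sorted_alphabetized() is imported from elsewhere and used throughout this
# file; a presumed minimal equivalent is sketched below for reference. The
# assumption: items are ordered by the key (here, descending frequency),
# with ties broken alphabetically. Two passes work because Python's sort is
# stable, so the alphabetical ordering survives within equal-frequency ties.
#--------------------------------------------------------------------------#

def _demo_sorted_alphabetized(items, key, reverse=False):
    alphabetized = sorted(items)  # secondary order: alphabetical
    return sorted(alphabetized, key=key, reverse=reverse)  # primary: key

# _demo_sorted_alphabetized({"b": 2, "a": 2, "c": 5}.items(),
#                           key=lambda x: x[1], reverse=True)
# returns [("c", 5), ("a", 2), ("b", 2)]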