Exemple #1
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         maxwordtokens=0,
         use_corpus=True):
    """Count phones, biphones and triphones in a wordlist and write them out.

    The wordlist path and the corpus stem come from
    ``get_wordlist_path_corpus_stem``, to which all six arguments are
    forwarded unchanged.  If that wordlist does not exist and ``use_corpus``
    is true, ``ngrams.main`` is run first (presumably this creates the
    wordlist -- confirm against ngrams.py); if ``use_corpus`` is false the
    program exits with an error message naming the missing file.

    Wordlist format as read here: lines starting with "#" and blank lines
    are skipped; the first whitespace-separated field is the word and the
    optional second field is its integer frequency (1 is used when the field
    is missing or not an integer).  Each word is case-folded, wrapped in "#"
    word-boundary symbols, and every character is treated as one phone.

    Output goes to ``<parent of filename>/phon`` when ``filename`` is given,
    otherwise to ``<datafolder>/<language>/phon``.  Three ``.txt`` files and
    three ``.json`` files (phones, biphones, triphones) are written.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The message needs a placeholder, otherwise .format() silently
            # discards the path and the user never sees which file is missing.
            sys.exit("\nThe specified wordlist\n{}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for raw_line in f:
            if raw_line.startswith("#"):
                continue

            line = raw_line.strip().casefold()

            # A blank line still contains "\n" before stripping, so the
            # emptiness test has to come after strip(); otherwise the
            # unpacking below fails on an empty split() result.
            if not line:
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries

            # Slide a three-phone window over the word.  The first window
            # also accounts for the two leading phones and the leading
            # biphone, which no later window would cover.
            for i in range(len(phones) - 2):
                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq

                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphoneDict[phone1 + sep + phone2] += freq

                triphoneDict[phone1 + sep + phone2 + sep + phone3] += freq
                biphoneDict[phone2 + sep + phone3] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    # (txt output path, counter) pairs, in the order the files are written
    outputs = (
        (outfilenamePhones, phoneDict),
        (outfilenameBiphones, biphoneDict),
        (outfilenameTriphones, triphoneDict),
    )

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    for outpath, counter in outputs:
        itemsSorted = sorted_alphabetized(counter.items(),
                                          key=lambda x: x[1],
                                          reverse=True)
        with outpath.open('w') as f:
            print(intro_string, file=f)
            print("# type count: {}".format(len(itemsSorted)), file=f)
            print("# token count: {}".format(sum(counter.values())), file=f)
            for (ngram, freq) in itemsSorted:
                print(ngram + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    json_paths = []
    for outpath, counter in outputs:
        json_path = changeFilenameSuffix(outpath, '.json')
        with json_path.open('w') as f:
            json_pdump(counter, f, key=lambda x: x[1], reverse=True)
        json_paths.append(json_path)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:", outfilenamePhones, outfilenameBiphones,
                outfilenameTriphones, *json_paths)
Exemple #2
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):
    """Tally phone, biphone and triphone frequencies for a wordlist.

    All arguments are passed to ``get_wordlist_path_corpus_stem`` to obtain
    the wordlist path and the corpus stem used in output file names.  When
    the wordlist is missing, ``ngrams.main`` is invoked if ``use_corpus`` is
    true (presumably to generate it -- confirm against ngrams.py); otherwise
    the program exits, reporting the missing path.

    Each non-comment, non-blank wordlist line supplies a word (first field)
    and an optional integer frequency (second field, default 1).  Words are
    case-folded and surrounded by "#" boundary symbols; every character
    counts as one phone.

    Results are written as ``.txt`` and ``.json`` files into a ``phon``
    folder next to ``filename`` if given, else under
    ``<datafolder>/<language>``.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(language, corpus,
                                datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # Without a "{}" placeholder the path passed to format() would
            # be dropped from the message.
            sys.exit("\nThe specified wordlist\n{}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for line in f:
            if line.startswith("#"):
                continue

            line = line.strip().casefold()

            # Lines read from a file always carry their newline, so a blank
            # line is only detectable after stripping.  Skipping it here
            # prevents a ValueError in the unpacking below.
            if not line:
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones) # add word boundaries
            lenPhones = len(phones)

            for i in range(lenPhones-2):

                phone1 = phones[i]
                phone2 = phones[i+1]
                phone3 = phones[i+2]

                phoneDict[phone3] += freq

                # the first window must also count its two leading phones
                # and the leading biphone
                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)

    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)

    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    with outfilenamePhones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(phonesSorted)), file=f)
        print("# token count: {}".format(sum(phoneDict.values())), file=f)
        for (phone, freq) in phonesSorted:
            print(phone + sep + str(freq), file=f)

    with outfilenameBiphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(biphonesSorted)), file=f)
        print("# token count: {}".format(sum(biphoneDict.values())), file=f)
        for (biphone, freq) in biphonesSorted:
            print(biphone + sep + str(freq), file=f)

    with outfilenameTriphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(triphonesSorted)), file=f)
        print("# token count: {}".format(sum(triphoneDict.values())), file=f)
        for (triphone, freq) in triphonesSorted:
            print(triphone + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    outfilenamePhones_json = changeFilenameSuffix(outfilenamePhones, '.json')
    with outfilenamePhones_json.open('w') as f:
        json_pdump(phoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameBiphones_json = changeFilenameSuffix(outfilenameBiphones,
                                                    '.json')
    with outfilenameBiphones_json.open('w') as f:
        json_pdump(biphoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameTriphones_json = changeFilenameSuffix(outfilenameTriphones,
                                                     '.json')
    with outfilenameTriphones_json.open('w') as f:
        json_pdump(triphoneDict, f, key=lambda x: x[1], reverse=True)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                outfilenamePhones_json, outfilenameBiphones_json,
                outfilenameTriphones_json)
Exemple #3
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MinimumAffixLength=1, SF_threshold=3,
         maxwordtokens=0, use_corpus=True):
    """Run the tries.py pipeline on a wordlist and write its output files.

    The wordlist path and corpus stem come from
    ``get_wordlist_path_corpus_stem``.  If the wordlist is missing,
    ``ngrams.main`` is run when ``use_corpus`` is true (presumably creating
    it -- confirm against ngrams.py); otherwise the program exits, naming
    the missing file.

    The words read by ``read_word_freq`` are processed twice: once as a
    sorted list and once as a sorted list of the reversed words.  For each
    list the function calls ``findBreaksInWords`` (with
    ``MinimumStemLength``), ``BreakUpEachWord`` and ``GetSuccessors``, then
    writes the successor/predecessor tables (``OutputSuccessors``, given
    ``SF_threshold``), the signatures (``OutputSignatures1``) and both tries
    (``OutputTrie``) as text files, plus JSON dumps of the successor,
    predecessor and trie data.

    Output goes to ``<parent of filename>/tries`` when ``filename`` is given,
    otherwise to ``<datafolder>/<language>/tries``.

    NOTE(review): ``MinimumAffixLength`` is accepted but not used anywhere
    in this function.
    """
    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    #        read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(language, corpus,
                                datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The "{}" placeholder is required for the path to show up in
            # the message at all.
            sys.exit("\nThe specified wordlist\n{}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    #        output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")

    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")

    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    #        Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    #        Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    #        Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    # predecessors are the successors computed over the reversed words
    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    # Each JSON file is opened in a with-block so the handle is flushed and
    # closed deterministically instead of being left to garbage collection.
    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    with outfile_SF_name_json.open("w") as f:
        json_pdump(successors, f)

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    with outfile_PF_name_json.open("w") as f:
        json_pdump(predecessors, f)

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    #        Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name, reversedwordlist, WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    with outfile_trieLtoR_name_json.open("w") as f:
        json_pdump(WordsBrokenLtoR, f)

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    with outfile_trieRtoL_name_json.open("w") as f:
        json_pdump(WordsBrokenRtoL, f)

    stdout_list("Output files:", outfile_SF_name, outfile_PF_name,
                                 outfile_trieLtoR_name, outfile_trieRtoL_name,
                                 outfile_Signatures_name,
                                 outfile_SF_name_json, outfile_PF_name_json,
                                 outfile_trieLtoR_name_json,
                                 outfile_trieRtoL_name_json)
Exemple #4
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         maxwordtypes=1000,
         nNeighbors=9,
         nEigenvectors=11,
         create_WordToContexts=False,
         create_ContextToWords=False,
         mincontexts=3,
         usesigtransforms=True):
    """Run the manifold.py pipeline: find nearest neighbors for word types.

    Input n-gram files (``*_words.txt``, ``*_bigrams.txt``,
    ``*_trigrams.txt``) are looked up in an ``ngrams`` folder next to
    ``filename`` if given, else under ``<datafolder>/<language>``; if any is
    missing, ``ngrams.main`` is run first.  Outputs are written to sibling
    ``neighbors`` and ``word_contexts`` folders.

    Parameters as used in this function:

    - ``maxwordtypes``: upper bound on how many words (the first keys
      returned by ``GetMyWords``) are analyzed.
    - ``nNeighbors``: passed to ``compute_closest_neighbors`` and
      ``compute_WordToSharedContextsOfNeighbors``; also part of the output
      file names.
    - ``nEigenvectors``: number of leading eigenvector-matrix columns used
      as word coordinates.
    - ``mincontexts``: passed to ``GetContextArray`` and
      ``compute_WordToSharedContextsOfNeighbors``.
    - ``create_WordToContexts`` / ``create_ContextToWords``: when true, the
      corresponding mapping is also dumped as JSON.
    - ``usesigtransforms``: when true, the ``*_WordToSigtransforms.json``
      file is loaded from the ``lxa`` folder, running ``lxa5.main`` first if
      it is missing.  The loaded data is not used further in this function.
    """
    print("\n*****************************************************\n"
          "Running the manifold.py program now...\n")

    if filename:
        parentfolder = Path(filename).parent
        corpusStem = Path(filename).stem
        infolder = Path(parentfolder, 'ngrams')
        outfolder = Path(parentfolder, 'neighbors')
        outcontextsfolder = Path(parentfolder, 'word_contexts')
    else:
        corpusStem = Path(corpus).stem
        infolder = Path(datafolder, language, 'ngrams')
        outfolder = Path(datafolder, language, 'neighbors')
        outcontextsfolder = Path(datafolder, language, 'word_contexts')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outcontextsfolder.exists():
        outcontextsfolder.mkdir(parents=True)

    infileWordsname = Path(infolder, corpusStem + '_words.txt')
    infileBigramsname = Path(infolder, corpusStem + '_bigrams.txt')
    infileTrigramsname = Path(infolder, corpusStem + '_trigrams.txt')

    if not (infileWordsname.exists() and
            infileBigramsname.exists() and
            infileTrigramsname.exists()):
        print("Error in locating n-gram data files.\n"
              "The program now creates them.\n")
        ngrams.main(language=language,
                    corpus=corpus,
                    datafolder=datafolder,
                    filename=filename)

    if usesigtransforms:
        if filename:
            infolderlxa = Path(Path(filename).parent, 'lxa')
        else:
            infolderlxa = Path(datafolder, language, 'lxa')
        sigtransform_json_fname = Path(
            infolderlxa, corpusStem + "_WordToSigtransforms.json")
        try:
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)
        except FileNotFoundError:
            print("The file \"{}\" is not found.\n"
                  "The program now creates it.\n".format(
                      sigtransform_json_fname))
            lxa5.main(language=language,
                      corpus=corpus,
                      datafolder=datafolder,
                      filename=filename)
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)

    # WordToSigtransforms just read into the program; to be used soon...

    print('Reading word list...', flush=True)
    mywords = GetMyWords(infileWordsname, corpus)

    print("Word file is", infileWordsname, flush=True)
    print("Number of neighbors to find for each word type: ", nNeighbors)
    print('Corpus has', len(mywords), 'word types', flush=True)

    nWordsForAnalysis = min(len(mywords), maxwordtypes)
    print('number of words for analysis adjusted to', nWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:nWordsForAnalysis]
    # The words are dict keys and therefore unique, so enumerate() yields
    # the same index as list.index() without the quadratic scan.
    worddict = {w: idx for idx, w in enumerate(analyzedwordlist)}

    corpusName = corpusStem + '_' + str(nWordsForAnalysis) + '_' + str(
        nNeighbors)

    outfilenameNeighbors = Path(outfolder, corpusName + "_neighbors.txt")

    outfilenameSharedcontexts = Path(outfolder,
                                     corpusName + "_shared_contexts.txt")

    outfilenameNeighborGraph = Path(outfolder, corpusName + "_neighbors.gexf")

    outfilenameImportantContextToWords = Path(
        outfolder, corpusName + "_ImportantContextToWords.txt")

    outWordToContexts_json = Path(outcontextsfolder,
                                  corpusName + "_WordToContexts.json")

    outContextToWords_json = Path(outcontextsfolder,
                                  corpusName + "_ContextToWords.json")

    print("Reading bigrams/trigrams and computing context array...",
          flush=True)

    (context_array, contextdict,
     WordToContexts, ContextToWords) = GetContextArray(
         nWordsForAnalysis, worddict, infileBigramsname, infileTrigramsname,
         mincontexts)

    # Intermediate results are deleted as soon as they are no longer needed
    # so that they can be freed before the next large computation.
    print("Computing shared context master matrix...", flush=True)
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    print("Computing diameter...", flush=True)
    Diameter = Normalize(nWordsForAnalysis, CountOfSharedContexts)

    print("Computing incidence graph...", flush=True)
    incidencegraph = compute_incidence_graph(nWordsForAnalysis, Diameter,
                                             CountOfSharedContexts)
    del CountOfSharedContexts

    print("Computing mylaplacian...", flush=True)
    mylaplacian = compute_laplacian(nWordsForAnalysis, Diameter,
                                    incidencegraph)
    del Diameter
    del incidencegraph

    print("Computing eigenvectors...", flush=True)
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    print('Computing distances between words...', flush=True)
    # take first N columns of eigenvector matrix
    coordinates = myeigenvectors[:, :nEigenvectors]
    wordsdistance = compute_words_distance(nWordsForAnalysis, coordinates)
    del coordinates

    print('Computing nearest neighbors now... ', flush=True)
    closestNeighbors = compute_closest_neighbors(wordsdistance, nNeighbors)

    WordToNeighbors_by_str = OrderedDict()  # word -> neighbor words
    WordToNeighbors = dict()                # word index -> neighbor indices

    for wordno in range(nWordsForAnalysis):
        line = closestNeighbors[wordno]
        # first entry of each row is the word itself, the rest its neighbors
        word_idx, neighbors_idx = line[0], line[1:]
        word = analyzedwordlist[word_idx]
        neighbors = [analyzedwordlist[idx] for idx in neighbors_idx]
        WordToNeighbors_by_str[word] = neighbors
        WordToNeighbors[word_idx] = neighbors_idx

    del closestNeighbors

    with outfilenameNeighbors.open('w') as f:
        print("# language: {}\n# corpus: {}\n"
              "# Number of word types analyzed: {}\n"
              "# Number of neighbors: {}\n".format(language, corpus,
                                                   nWordsForAnalysis,
                                                   nNeighbors),
              file=f)

        for word, neighbors in WordToNeighbors_by_str.items():
            print(word, " ".join(neighbors), file=f)

    neighbor_graph = GetMyGraph(WordToNeighbors_by_str)

    # output manifold as gexf data file
    nx.write_gexf(neighbor_graph, str(outfilenameNeighborGraph))

    # output manifold as json for d3 visualization
    manifold_json_data = json_graph.node_link_data(neighbor_graph)
    outfilenameManifoldJson = Path(outfolder, corpusName + "_manifold.json")
    with outfilenameManifoldJson.open("w") as f:
        json.dump(manifold_json_data, f, indent=2)

    WordToNeighbors_json = changeFilenameSuffix(outfilenameNeighbors, ".json")
    with WordToNeighbors_json.open("w") as f:
        json_pdump(WordToNeighbors_by_str, f, asis=True)

    print("Computing shared contexts among neighbors...", flush=True)
    (WordToSharedContextsOfNeighbors,
     ImportantContextToWords) = compute_WordToSharedContextsOfNeighbors(
         nWordsForAnalysis, WordToContexts, WordToNeighbors, ContextToWords,
         nNeighbors, mincontexts)

    output_WordToSharedContextsOfNeighbors(outfilenameSharedcontexts,
                                           WordToSharedContextsOfNeighbors,
                                           worddict, contextdict,
                                           nWordsForAnalysis)

    output_ImportantContextToWords(outfilenameImportantContextToWords,
                                   ImportantContextToWords, contextdict,
                                   worddict)

    outputfilelist = [
        outfilenameNeighbors, outfilenameNeighborGraph, WordToNeighbors_json,
        outfilenameSharedcontexts, outfilenameImportantContextToWords,
        outfilenameManifoldJson
    ]

    if create_WordToContexts:
        outputfilelist.append(outWordToContexts_json)
        with outWordToContexts_json.open("w") as f:
            json_pdump(WordToContexts, f,
                       key=lambda x: len(x[1]),
                       reverse=True)

    if create_ContextToWords:
        outputfilelist.append(outContextToWords_json)
        with outContextToWords_json.open("w") as f:
            json_pdump(ContextToWords, f,
                       key=lambda x: len(x[1]),
                       reverse=True)

    stdout_list("Output files:", *outputfilelist)
Exemple #5
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0):
    """Count words, bigrams and trigrams in a corpus file and write them out.

    The corpus is ``filename`` if given, otherwise
    ``<datafolder>/<language>/<corpus>``.  Each line is case-folded, the
    punctuation marks ``. , ; ! ? : ) (`` are split off as separate tokens,
    and n-grams are counted within the line (they never span a line break).

    If ``maxwordtokens`` is non-zero, reading stops after the first line
    that brings the running token count above it, and the value is embedded
    in the output file names.

    Outputs: ``*_words``, ``*_bigrams`` and ``*_trigrams`` as ``.txt`` and
    ``.json`` in an ``ngrams`` folder, plus a ``.dx1`` file in a ``dx1``
    folder, both located next to the corpus file.
    """
    print("\n*****************************************************\n"
          "Running the ngrams.py program now...\n")

    if filename:
        infilename = Path(filename)
        outfolder = Path(infilename.parent, "ngrams")
        outfolderDx1 = Path(infilename.parent, "dx1")
        corpus = infilename.name
    else:
        infilename = Path(datafolder, language, corpus)
        outfolder = Path(datafolder, language, "ngrams")
        outfolderDx1 = Path(datafolder, language, "dx1")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outfolderDx1.exists():
        outfolderDx1.mkdir(parents=True)

    if maxwordtokens:
        corpusName = Path(corpus).stem + "_{}-tokens".format(maxwordtokens)
    else:
        corpusName = Path(corpus).stem

    outfilenameWords = Path(outfolder, corpusName + "_words.txt")
    outfilenameBigrams = Path(outfolder, corpusName + "_bigrams.txt")
    outfilenameTrigrams = Path(outfolder, corpusName + "_trigrams.txt")
    outfilenameDx1 = Path(outfolderDx1, corpusName + ".dx1")

    wordDict = Counter()
    trigramDict = Counter()
    bigramDict = Counter()
    sep = "\t"
    corpusCurrentSize = 0 # running word token count

    # TODO: modify/combine these with "scrubbing", cf. Alchemist and Lxa4
    # Pad each punctuation mark with spaces so that split() below turns it
    # into a token of its own.
    punctuation_table = str.maketrans(
        {mark: " {} ".format(mark) for mark in ".,;!?:)("})

    print('Reading the corpus file now...')

    with infilename.open() as f:
        # Iterate lazily so that an early break on maxwordtokens does not
        # require reading the whole corpus into memory first.
        for line in f:
            line = line.strip().casefold().translate(punctuation_table)

            words = line.split()
            lenWords = len(words)

            corpusCurrentSize += lenWords

            if lenWords < 3:
                # Lines too short to hold a trigram never enter the window
                # loop below, so their words (and their single bigram, if
                # any) have to be counted here or they would be lost.
                for word in words:
                    wordDict[word] += 1
                if lenWords == 2:
                    bigramDict[words[0] + sep + words[1]] += 1

            for i in range(lenWords-2):

                word1 = words[i]
                word2 = words[i+1]
                word3 = words[i+2]

                wordDict[word3] += 1

                # the first window must also count its two leading words
                # and the leading bigram
                if i == 0:
                    wordDict[word1] += 1
                    wordDict[word2] += 1
                    bigram = word1 + sep + word2
                    bigramDict[bigram] += 1

                bigram = word2 + sep + word3
                trigram = word1 + sep + word2 + sep + word3

                trigramDict[trigram] += 1
                bigramDict[bigram] += 1

            if maxwordtokens and corpusCurrentSize > maxwordtokens:
                break

    print("\nCompleted counting words, bigrams, and trigrams.")
    print("Token count: {}".format(corpusCurrentSize))

    intro_string = "# data source: {}\n# token count: {}".format(str(infilename),
                                                                   corpusCurrentSize)

    wordsSorted = sorted_alphabetized(wordDict.items(),
                                      key=lambda x: x[1], reverse=True)

    bigramsSorted = sorted_alphabetized(bigramDict.items(),
                                        key=lambda x: x[1], reverse=True)

    trigramsSorted = sorted_alphabetized(trigramDict.items(),
                                         key=lambda x: x[1], reverse=True)

    # print txt outputs
    with outfilenameWords.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(wordsSorted)), file=f)
        for (word, freq) in wordsSorted:
            print(word + sep + str(freq), file=f)

    with outfilenameBigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(bigramsSorted)), file=f)
        for (bigram, freq) in bigramsSorted:
            print(bigram + sep + str(freq), file=f)

    with outfilenameTrigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(trigramsSorted)), file=f)
        for (trigram, freq) in trigramsSorted:
            print(trigram + sep + str(freq), file=f)

    # print dx1 output: word, frequency, then the word spelled out with
    # spaces between its characters
    with outfilenameDx1.open('w') as f:
        for (word, freq) in wordsSorted:
            print(word, freq, ' '.join(word), file=f)

    # print json outputs
    outfilenameWords_json = changeFilenameSuffix(outfilenameWords, ".json")
    with outfilenameWords_json.open('w') as f:
        json_pdump(dict(wordsSorted), f)

    outfilenameBigrams_json = changeFilenameSuffix(outfilenameBigrams, ".json")
    with outfilenameBigrams_json.open('w') as f:
        json_pdump(dict(bigramsSorted), f)

    outfilenameTrigrams_json = changeFilenameSuffix(outfilenameTrigrams,
                                                    ".json")
    with outfilenameTrigrams_json.open('w') as f:
        json_pdump(dict(trigramsSorted), f)

    print('wordlist, bigram and trigram files ready')
    print('dx1 file ready')

    stdout_list("Output files:", outfilenameWords,
                outfilenameBigrams, outfilenameTrigrams, outfilenameDx1,
                outfilenameWords_json, outfilenameBigrams_json,
                outfilenameTrigrams_json)
Exemple #6
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         maxwordtokens=0):
    """Write word, bigram and trigram frequency lists for a corpus.

    The corpus is read line by line. Each line is stripped and case-folded,
    the punctuation marks ``. , ; ! ? : ) (`` are padded with spaces so that
    they become tokens of their own, and the line is split on whitespace.
    N-grams are counted within a line only; they never span a line break.

    Output files are named after the corpus stem (with ``_<N>-tokens``
    appended when ``maxwordtokens`` is non-zero):

    * ``ngrams/<name>_words.txt``, ``<name>_bigrams.txt`` and
      ``<name>_trigrams.txt``: two ``#`` header lines with the data source
      and token count, one with the type count, then one tab-separated
      ``ngram<TAB>frequency`` entry per line, plus ``.json`` counterparts;
    * ``dx1/<name>.dx1``: word, frequency and the word's characters
      separated by spaces.

    Args:
        language: Sub-folder of ``datafolder`` holding the corpus; only
            used when ``filename`` is not given.
        corpus: Corpus file name inside ``datafolder/language``; replaced
            by the base name of ``filename`` when ``filename`` is given.
        datafolder: Root data folder; only used when ``filename`` is not
            given.
        filename: Path to the corpus file. When given, the ``ngrams`` and
            ``dx1`` folders are created next to this file.
        maxwordtokens: If non-zero, reading stops after the first line
            that brings the running token count above this value.
    """
    print("\n*****************************************************\n"
          "Running the ngrams.py program now...\n")

    if filename:
        infilename = Path(filename)
        outfolder = Path(infilename.parent, "ngrams")
        outfolderDx1 = Path(infilename.parent, "dx1")
        corpus = infilename.name
    else:
        infilename = Path(datafolder, language, corpus)
        outfolder = Path(datafolder, language, "ngrams")
        outfolderDx1 = Path(datafolder, language, "dx1")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outfolderDx1.exists():
        outfolderDx1.mkdir(parents=True)

    if maxwordtokens:
        corpusName = Path(corpus).stem + "_{}-tokens".format(maxwordtokens)
    else:
        corpusName = Path(corpus).stem

    outfilenameWords = Path(outfolder, corpusName + "_words.txt")
    outfilenameBigrams = Path(outfolder, corpusName + "_bigrams.txt")
    outfilenameTrigrams = Path(outfolder, corpusName + "_trigrams.txt")
    outfilenameDx1 = Path(outfolderDx1, corpusName + ".dx1")

    wordDict = Counter()
    bigramDict = Counter()
    trigramDict = Counter()
    sep = "\t"
    corpusCurrentSize = 0  # running word token count

    print('Reading the corpus file now...')

    with infilename.open() as f:
        # Iterate the file lazily so an early break on maxwordtokens does
        # not require loading the whole corpus into memory first.
        for line in f:
            line = line.strip().casefold()

            # TODO: modify/combine these with "scrubbing", cf. Alchemist and Lxa4
            for mark in ".,;!?:)(":
                line = line.replace(mark, " " + mark + " ")

            words = line.split()
            corpusCurrentSize += len(words)

            # Count each n-gram order on its own. Driving everything from
            # the trigram window would silently drop the words and bigrams
            # of lines that have fewer than three tokens.
            wordDict.update(words)

            for word1, word2 in zip(words, words[1:]):
                bigramDict[word1 + sep + word2] += 1

            for word1, word2, word3 in zip(words, words[1:], words[2:]):
                trigramDict[word1 + sep + word2 + sep + word3] += 1

            if maxwordtokens and corpusCurrentSize > maxwordtokens:
                break

    print("\nCompleted counting words, bigrams, and trigrams.")
    print("Token count: {}".format(corpusCurrentSize))

    intro_string = "# data source: {}\n# token count: {}".format(
        str(infilename), corpusCurrentSize)

    wordsSorted = sorted_alphabetized(wordDict.items(),
                                      key=lambda x: x[1],
                                      reverse=True)

    bigramsSorted = sorted_alphabetized(bigramDict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

    trigramsSorted = sorted_alphabetized(trigramDict.items(),
                                         key=lambda x: x[1],
                                         reverse=True)

    # print txt outputs
    with outfilenameWords.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(wordsSorted)), file=f)
        for (word, freq) in wordsSorted:
            print(word + sep + str(freq), file=f)

    with outfilenameBigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(bigramsSorted)), file=f)
        for (bigram, freq) in bigramsSorted:
            print(bigram + sep + str(freq), file=f)

    with outfilenameTrigrams.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(trigramsSorted)), file=f)
        for (trigram, freq) in trigramsSorted:
            print(trigram + sep + str(freq), file=f)

    # print dx1 output
    with outfilenameDx1.open('w') as f:
        for (word, freq) in wordsSorted:
            print(word, freq, ' '.join(word), file=f)

    # print json outputs
    outfilenameWordsJson = changeFilenameSuffix(outfilenameWords, ".json")
    outfilenameBigramsJson = changeFilenameSuffix(outfilenameBigrams, ".json")
    outfilenameTrigramsJson = changeFilenameSuffix(outfilenameTrigrams,
                                                   ".json")

    with outfilenameWordsJson.open('w') as f:
        json_pdump(dict(wordsSorted), f)

    with outfilenameBigramsJson.open('w') as f:
        json_pdump(dict(bigramsSorted), f)

    with outfilenameTrigramsJson.open('w') as f:
        json_pdump(dict(trigramsSorted), f)

    print('wordlist, bigram and trigram files ready')
    print('dx1 file ready')

    stdout_list("Output files:", outfilenameWords, outfilenameBigrams,
                outfilenameTrigrams, outfilenameDx1,
                outfilenameWordsJson,
                outfilenameBigramsJson,
                outfilenameTrigramsJson)
Exemple #7
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MinimumAffixLength=1,
         SF_threshold=3,
         maxwordtokens=0,
         use_corpus=True):
    """Compute word breaks, successors/predecessors and tries for a wordlist.

    The wordlist path and corpus stem come from
    ``get_wordlist_path_corpus_stem``. If the wordlist does not exist and
    ``use_corpus`` is true, ``ngrams.main`` is invoked first; if it does not
    exist and ``use_corpus`` is false, the program exits with an error
    message. The words are processed both as they are (left-to-right) and
    reversed (right-to-left).

    The results are written to a ``tries`` folder, located next to
    ``filename`` when it is given and under ``datafolder/language``
    otherwise: ``_SF``, ``_PF``, ``_trieLtoR``, ``_trieRtoL`` as ``.txt``
    and ``.json``, and ``_Signatures.txt``.

    Args:
        language: Language name, used to locate the data folder.
        corpus: Corpus file name.
        datafolder: Root data folder.
        filename: Path to the input file; takes precedence over
            ``datafolder``/``language`` when choosing the output folder.
        MinimumStemLength: Passed to ``findBreaksInWords``.
        MinimumAffixLength: Accepted but not used in this function.
        SF_threshold: Passed to ``OutputSuccessors``.
        maxwordtokens: Passed to ``get_wordlist_path_corpus_stem`` and to
            ``ngrams.main``.
        use_corpus: Whether a missing wordlist may be derived from the
            corpus by running ``ngrams.main``.
    """
    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    #        read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The message needs a replacement field; without one the
            # path handed to format() is silently discarded.
            sys.exit("\nThe specified wordlist "
                     "\n{}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    #        output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")

    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")

    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    #        Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    #        Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    #        Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    # Each JSON file is opened in a with-block so the handle is flushed and
    # closed as soon as the dump is done.
    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    with outfile_SF_name_json.open("w") as f:
        json_pdump(successors, f)

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    with outfile_PF_name_json.open("w") as f:
        json_pdump(predecessors, f)

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    #        Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name,
               reversedwordlist,
               WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    with outfile_trieLtoR_name_json.open("w") as f:
        json_pdump(WordsBrokenLtoR, f)

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    with outfile_trieRtoL_name_json.open("w") as f:
        json_pdump(WordsBrokenRtoL, f)

    stdout_list("Output files:", outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name, outfile_SF_name_json,
                outfile_PF_name_json, outfile_trieLtoR_name_json,
                outfile_trieRtoL_name_json)
Exemple #8
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MaximumAffixLength=3,
         MinimumNumberofSigUses=5,
         maxwordtokens=0,
         use_corpus=True):
    """Induce signatures from a wordlist and write the resulting tables.

    The wordlist path and corpus stem come from
    ``get_wordlist_path_corpus_stem``. If the wordlist does not exist and
    ``use_corpus`` is true, ``ngrams.main`` is invoked first; if it does not
    exist and ``use_corpus`` is false, the program exits with an error
    message. Suffixes are searched for unless ``language`` (case-folded) is
    listed in ``prefix_languages``, in which case prefixes are.

    All output goes to an ``lxa`` folder, located next to ``filename`` when
    it is given and under ``datafolder/language`` otherwise. Files written:
    ``_StemToWords.txt``, ``_AffixToSigs.txt``, ``_SigToStems`` /
    ``_WordToSigs`` / ``_WordToSigtransforms`` (``.txt`` and ``.json``),
    ``_mostFreqWordsNotInSigs.txt``, ``_WordsInSigs.txt`` and
    ``_WordsNotInSigs.txt``.

    Args:
        language: Language name; selects suffixing vs. prefixing and the
            data folder.
        corpus: Corpus file name.
        datafolder: Root data folder.
        filename: Path to the input file; takes precedence over
            ``datafolder``/``language`` when choosing the output folder.
        MinimumStemLength: Passed to ``MakeBiSignatures``.
        MaximumAffixLength: Passed to ``MakeBiSignatures`` and
            ``MakeSigToStems``.
        MinimumNumberofSigUses: Passed to ``MakeStemToWords`` and
            ``MakeSigToStems``.
        maxwordtokens: Passed to ``get_wordlist_path_corpus_stem`` and to
            ``ngrams.main``.
        use_corpus: Whether a missing wordlist may be derived from the
            corpus by running ``ngrams.main``.
    """
    print("\n*****************************************************\n"
          "Running the lxa5.py program now...\n")

    # -------------------------------------------------------------------------#
    #       decide suffixing or prefixing
    # -------------------------------------------------------------------------#

    # NOTE: suffix_languages is informational only; every language that is
    # not in prefix_languages is treated as suffixing below.
    suffix_languages = {
        "english", "french", "hungarian", "turkish", "russian", "german",
        "spanish", 'test'
    }
    prefix_languages = {"swahili"}

    if str(language).casefold() in prefix_languages:
        FindSuffixesFlag = False  # prefixal
    else:
        FindSuffixesFlag = True  # suffixal

    wordlist_path, corpus_stem = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The message needs a replacement field; without one the
            # path handed to format() is silently discarded.
            sys.exit("\nThe specified wordlist "
                     "\n{}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())

    if filename:
        outfolder = Path(Path(filename).parent, "lxa")
    else:
        outfolder = Path(datafolder, language, 'lxa')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # TODO -- filenames not yet used in main()
    # Built with Path so the folder and file name are joined with a
    # separator (plain string concatenation glued them together).
    outfile_Signatures_name = Path(outfolder, corpus_stem + "_Signatures.txt")
    outfile_SigTransforms_name = Path(outfolder,
                                      corpus_stem + "_SigTransforms.txt")
    outfile_FSA_name = Path(outfolder, corpus_stem + "_FSA.txt")
    outfile_FSA_graphics_name = Path(outfolder,
                                     corpus_stem + "_FSA_graphics.png")

    # -------------------------------------------------------------------------#
    #   create: BisigToTuple
    #                  (key: tuple of bisig | value: set of (stem, word1, word2)
    #           StemToWords (key: stem | value: set of words)
    #           SigToStems  (key: tuple of sig | value: set of stems )
    #           StemToSig   (key: str of stem  | value: tuple of sig )
    #           WordToSigs  (key: str of word  | value: set of sigs )
    #           AffixToSigs (key: str of affix | value: set of sigs )
    # -------------------------------------------------------------------------#

    BisigToTuple = MakeBiSignatures(wordlist, MinimumStemLength,
                                    MaximumAffixLength, FindSuffixesFlag)
    print("BisigToTuple ready", flush=True)

    StemToWords = MakeStemToWords(BisigToTuple, MinimumNumberofSigUses)
    print("StemToWords ready", flush=True)

    SigToStems = MakeSigToStems(StemToWords, MaximumAffixLength,
                                MinimumNumberofSigUses, FindSuffixesFlag)
    print("SigToStems ready", flush=True)

    StemToSig = MakeStemToSig(SigToStems)
    print("StemToSig ready", flush=True)

    WordToSigs = MakeWordToSigs(StemToWords, StemToSig)
    print("WordToSigs ready", flush=True)

    WordToSigtransforms = MakeWordToSigtransforms(WordToSigs)
    print("WordToSigtransforms ready", flush=True)

    AffixToSigs = MakeAffixToSigs(SigToStems)
    print("AffixToSigs ready", flush=True)

    # -------------------------------------------------------------------------#
    #   generate graphs for several dicts
    # -------------------------------------------------------------------------#
    #    GenerateGraphFromDict(StemToWords, outfolder, 'StemToWords.gexf')
    #    GenerateGraphFromDict(SigToStems, outfolder, 'SigToStems.gexf')
    #    GenerateGraphFromDict(WordToSigs, outfolder, 'WordToSigs.gexf')
    #    GenerateGraphFromDict(StemToSig, outfolder, 'StemToSig.gexf')
    # -------------------------------------------------------------------------#

    # -------------------------------------------------------------------------#
    #      output stem file
    # -------------------------------------------------------------------------#

    stemfilename = Path(outfolder, '{}_StemToWords.txt'.format(corpus_stem))
    OutputLargeDict(stemfilename,
                    StemToWords,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    howmanyperline=5)

    print('===> stem file generated:', stemfilename, flush=True)

    # -------------------------------------------------------------------------#
    #      output affix file
    # -------------------------------------------------------------------------#

    affixfilename = Path(outfolder, '{}_AffixToSigs.txt'.format(corpus_stem))
    OutputLargeDict(affixfilename,
                    AffixToSigs,
                    min_cell_width=25,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureValues=True)
    print('===> affix file generated:', affixfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output SigToStems
    # -------------------------------------------------------------------------#

    SigToStems_outfilename = Path(outfolder, corpus_stem + "_SigToStems.txt")
    OutputLargeDict(SigToStems_outfilename,
                    SigToStems,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureKeys=True)

    # JSON files are opened in with-blocks so each handle is flushed and
    # closed as soon as its dump is done.
    SigToStems_outfilename_json = changeFilenameSuffix(SigToStems_outfilename,
                                                       ".json")
    with SigToStems_outfilename_json.open("w") as f:
        json_pdump(SigToStems,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)

    print('===> output file generated:', SigToStems_outfilename, flush=True)
    print('===> output file generated:',
          SigToStems_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigs
    # -------------------------------------------------------------------------#

    WordToSigs_outfilename = Path(outfolder, corpus_stem + "_WordToSigs.txt")
    OutputLargeDict(WordToSigs_outfilename,
                    WordToSigs,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    SignatureValues=True)

    WordToSigs_outfilename_json = changeFilenameSuffix(WordToSigs_outfilename,
                                                       ".json")
    with WordToSigs_outfilename_json.open("w") as f:
        json_pdump(WordToSigs,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)

    print('===> output file generated:', WordToSigs_outfilename, flush=True)
    print('===> output file generated:',
          WordToSigs_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigtransforms
    # -------------------------------------------------------------------------#

    WordToSigtransforms_outfilename = Path(
        outfolder, corpus_stem + "_WordToSigtransforms.txt")
    OutputLargeDict(WordToSigtransforms_outfilename,
                    WordToSigtransforms,
                    min_cell_width=25,
                    sigtransforms=True,
                    key=lambda x: len(x[1]),
                    reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename,
          flush=True)

    WordToSigtransforms_outfilename_json = changeFilenameSuffix(
        WordToSigtransforms_outfilename, ".json")
    with WordToSigtransforms_outfilename_json.open("w") as f:
        json_pdump(WordToSigtransforms,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the most freq word types not in any induced paradigms {the, of..}
    # -------------------------------------------------------------------------#

    wordFreqDict_sorted = sorted_alphabetized(wordFreqDict.items(),
                                              key=lambda x: x[1],
                                              reverse=True)

    mostFreqWordsNotInSigs_outfilename = Path(
        outfolder, corpus_stem + "_mostFreqWordsNotInSigs.txt")

    with mostFreqWordsNotInSigs_outfilename.open('w') as f:
        # Only the leading run of words outside WordToSigs is written; the
        # loop stops at the first word that does appear in WordToSigs.
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)
            else:
                break

    print('===> output file generated:',
          mostFreqWordsNotInSigs_outfilename,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types in induced paradigms
    # -------------------------------------------------------------------------#

    WordsInSigs_outfilename = Path(outfolder, corpus_stem + "_WordsInSigs.txt")

    with WordsInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:', WordsInSigs_outfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types NOT in induced paradigms
    # -------------------------------------------------------------------------#

    WordsNotInSigs_outfilename = Path(outfolder,
                                      corpus_stem + "_WordsNotInSigs.txt")

    with WordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:',
          WordsNotInSigs_outfilename,
          flush=True)
Exemple #9
0
def main(
    language=None,
    corpus=None,
    datafolder=None,
    filename=None,
    maxwordtypes=1000,
    nNeighbors=9,
    nEigenvectors=11,
    create_WordToContexts=False,
    create_ContextToWords=False,
    mincontexts=3,
    usesigtransforms=True,
):
    """Find the nearest neighbors of word types from their shared contexts.

    Reads the ``_words``, ``_bigrams`` and ``_trigrams`` files from the
    ``ngrams`` folder (running ``ngrams.main`` first if any is missing),
    builds a word-by-context array, and passes it through the shared-context
    count, normalization, incidence graph, Laplacian and eigenvector steps.
    Word distances are computed on the first ``nEigenvectors`` eigenvector
    columns and the ``nNeighbors`` closest words are kept for each word.

    Results are written to a ``neighbors`` folder (neighbor list as ``.txt``
    and ``.json``, a ``.gexf`` graph, a ``_manifold.json`` node-link dump,
    shared contexts and important contexts) and, on request, to a
    ``word_contexts`` folder. Folders sit next to ``filename`` when it is
    given and under ``datafolder/language`` otherwise.

    Args:
        language: Language name, used to locate the data folder.
        corpus: Corpus file name; its stem names the input/output files
            when ``filename`` is not given.
        datafolder: Root data folder.
        filename: Path to the input file; its stem names the input/output
            files when given.
        maxwordtypes: Upper bound on the number of word types analyzed; the
            first ``maxwordtypes`` keys returned by ``GetMyWords`` are used
            (presumably frequency-ordered -- confirm against GetMyWords).
        nNeighbors: Number of neighbors to keep for each word.
        nEigenvectors: Number of eigenvector columns used as coordinates.
        create_WordToContexts: Also dump ``WordToContexts`` as JSON.
        create_ContextToWords: Also dump ``ContextToWords`` as JSON.
        mincontexts: Passed to ``GetContextArray`` and
            ``compute_WordToSharedContextsOfNeighbors``.
        usesigtransforms: Load ``_WordToSigtransforms.json`` from the
            ``lxa`` folder, running ``lxa5.main`` first if it is missing.
    """
    print("\n*****************************************************\n" "Running the manifold.py program now...\n")

    if filename:
        corpusStem = Path(filename).stem
        infolder = Path(Path(filename).parent, "ngrams")
        outfolder = Path(Path(filename).parent, "neighbors")
        outcontextsfolder = Path(Path(filename).parent, "word_contexts")
    else:
        corpusStem = Path(corpus).stem
        infolder = Path(datafolder, language, "ngrams")
        outfolder = Path(datafolder, language, "neighbors")
        outcontextsfolder = Path(datafolder, language, "word_contexts")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outcontextsfolder.exists():
        outcontextsfolder.mkdir(parents=True)

    infileWordsname = Path(infolder, corpusStem + "_words.txt")
    infileBigramsname = Path(infolder, corpusStem + "_bigrams.txt")
    infileTrigramsname = Path(infolder, corpusStem + "_trigrams.txt")

    if (not infileWordsname.exists()) or (not infileBigramsname.exists()) or (not infileTrigramsname.exists()):
        print("Error in locating n-gram data files.\n" "The program now creates them.\n")
        ngrams.main(language=language, corpus=corpus, datafolder=datafolder, filename=filename)

    if usesigtransforms:
        if filename:
            infolderlxa = Path(Path(filename).parent, "lxa")
        else:
            infolderlxa = Path(datafolder, language, "lxa")
        sigtransform_json_fname = Path(infolderlxa, corpusStem + "_WordToSigtransforms.json")
        try:
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)
        except FileNotFoundError:
            print('The file "{}" is not found.\n' "The program now creates it.\n".format(sigtransform_json_fname))
            lxa5.main(language=language, corpus=corpus, datafolder=datafolder, filename=filename)
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)

    # WordToSigtransforms just read into the program; to be used soon...

    print("Reading word list...", flush=True)
    mywords = GetMyWords(infileWordsname, corpus)

    print("Word file is", infileWordsname, flush=True)
    print("Number of neighbors to find for each word type: ", nNeighbors)
    print("Corpus has", len(mywords), "word types", flush=True)

    lenMywords = len(mywords)
    if lenMywords > maxwordtypes:
        nWordsForAnalysis = maxwordtypes
    else:
        nWordsForAnalysis = lenMywords
    print("number of words for analysis adjusted to", nWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:nWordsForAnalysis]
    # Map each word to its position in one pass; list.index() inside the
    # comprehension would rescan the list for every word (quadratic).
    worddict = {w: idx for idx, w in enumerate(analyzedwordlist)}

    corpusName = corpusStem + "_" + str(nWordsForAnalysis) + "_" + str(nNeighbors)

    outfilenameNeighbors = Path(outfolder, corpusName + "_neighbors.txt")

    outfilenameSharedcontexts = Path(outfolder, corpusName + "_shared_contexts.txt")

    outfilenameNeighborGraph = Path(outfolder, corpusName + "_neighbors.gexf")

    outfilenameImportantContextToWords = Path(outfolder, corpusName + "_ImportantContextToWords.txt")

    outWordToContexts_json = Path(outcontextsfolder, corpusName + "_WordToContexts.json")

    outContextToWords_json = Path(outcontextsfolder, corpusName + "_ContextToWords.json")

    print("Reading bigrams/trigrams and computing context array...", flush=True)

    context_array, contextdict, WordToContexts, ContextToWords = GetContextArray(
        nWordsForAnalysis, worddict, infileBigramsname, infileTrigramsname, mincontexts
    )

    # Intermediate matrices are deleted as soon as the next stage has been
    # computed from them.
    print("Computing shared context master matrix...", flush=True)
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    print("Computing diameter...", flush=True)
    Diameter = Normalize(nWordsForAnalysis, CountOfSharedContexts)

    print("Computing incidence graph...", flush=True)
    incidencegraph = compute_incidence_graph(nWordsForAnalysis, Diameter, CountOfSharedContexts)
    del CountOfSharedContexts

    print("Computing mylaplacian...", flush=True)
    mylaplacian = compute_laplacian(nWordsForAnalysis, Diameter, incidencegraph)
    del Diameter
    del incidencegraph

    print("Computing eigenvectors...", flush=True)
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    print("Computing distances between words...", flush=True)
    # take first N columns of eigenvector matrix
    coordinates = myeigenvectors[:, :nEigenvectors]
    wordsdistance = compute_words_distance(nWordsForAnalysis, coordinates)
    del coordinates

    print("Computing nearest neighbors now... ", flush=True)
    closestNeighbors = compute_closest_neighbors(wordsdistance, nNeighbors)

    WordToNeighbors_by_str = OrderedDict()
    WordToNeighbors = dict()

    for wordno in range(nWordsForAnalysis):
        line = closestNeighbors[wordno]
        # First entry of each row is the word's own index, the rest are
        # the indices of its neighbors.
        word_idx, neighbors_idx = line[0], line[1:]
        word = analyzedwordlist[word_idx]
        neighbors = [analyzedwordlist[idx] for idx in neighbors_idx]
        WordToNeighbors_by_str[word] = neighbors
        WordToNeighbors[word_idx] = neighbors_idx

    del closestNeighbors

    with outfilenameNeighbors.open("w") as f:
        print(
            "# language: {}\n# corpus: {}\n"
            "# Number of word types analyzed: {}\n"
            "# Number of neighbors: {}\n".format(language, corpus, nWordsForAnalysis, nNeighbors),
            file=f,
        )

        for word, neighbors in WordToNeighbors_by_str.items():
            print(word, " ".join(neighbors), file=f)

    neighbor_graph = GetMyGraph(WordToNeighbors_by_str)

    # output manifold as gexf data file
    nx.write_gexf(neighbor_graph, str(outfilenameNeighborGraph))

    # output manifold as json for d3 visualization
    manifold_json_data = json_graph.node_link_data(neighbor_graph)
    outfilenameManifoldJson = Path(outfolder, corpusName + "_manifold.json")
    with outfilenameManifoldJson.open("w") as f:
        json.dump(manifold_json_data, f, indent=2)

    WordToNeighbors_json = changeFilenameSuffix(outfilenameNeighbors, ".json")
    with WordToNeighbors_json.open("w") as f:
        json_pdump(WordToNeighbors_by_str, f, asis=True)

    print("Computing shared contexts among neighbors...", flush=True)
    WordToSharedContextsOfNeighbors, ImportantContextToWords = compute_WordToSharedContextsOfNeighbors(
        nWordsForAnalysis, WordToContexts, WordToNeighbors, ContextToWords, nNeighbors, mincontexts
    )

    output_WordToSharedContextsOfNeighbors(
        outfilenameSharedcontexts, WordToSharedContextsOfNeighbors, worddict, contextdict, nWordsForAnalysis
    )

    output_ImportantContextToWords(outfilenameImportantContextToWords, ImportantContextToWords, contextdict, worddict)

    outputfilelist = [
        outfilenameNeighbors,
        outfilenameNeighborGraph,
        WordToNeighbors_json,
        outfilenameSharedcontexts,
        outfilenameImportantContextToWords,
        outfilenameManifoldJson,
    ]

    if create_WordToContexts:
        outputfilelist.append(outWordToContexts_json)
        with outWordToContexts_json.open("w") as f:
            json_pdump(WordToContexts, f, key=lambda x: len(x[1]), reverse=True)

    if create_ContextToWords:
        outputfilelist.append(outContextToWords_json)
        with outContextToWords_json.open("w") as f:
            json_pdump(ContextToWords, f, key=lambda x: len(x[1]), reverse=True)

    stdout_list("Output files:", *outputfilelist)