Example #1
def extractTopWordsOnly(authors, books, topWords, topWordsName, saveDir):
    saveDir += "%s/textsOnlyTopWords/" % topWordsName
    topWordDict = {}
    for i, w in enumerate(topWords):
        topWordDict[w] = i

    for author in authors:
        tokens = author.allTokens
        onlyTopTokens = []
        for token in tokens:
            if token in topWordDict:
                onlyTopTokens.append(topWordDict[token])

        utils.safeWrite(saveDir + "lists/authors/" + author.getSaveName() +
                        ".json",
                        onlyTopTokens,
                        dumpJSON=True)

    # save the sequence of top-word indices for each book
    for book in books:
        tokens = book.tokens
        onlyTopTokens = []
        for token in tokens:
            if token in topWordDict:
                onlyTopTokens.append(topWordDict[token])

        utils.safeWrite(saveDir + "lists/books/" + book.getSaveName() +
                        ".json",
                        onlyTopTokens,
                        dumpJSON=True)
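utils.safeWrite appears throughout these examples but is not shown. A minimal sketch of the assumed behavior, inferred from the call sites (create missing parent directories, then write, optionally serializing to JSON when dumpJSON is True):

import json
import os

def safeWrite(path, content, dumpJSON=False):
    # Assumed behavior, inferred from the call sites above: ensure the
    # parent directory exists, then write either raw text or JSON.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        if dumpJSON:
            json.dump(content, f)
        else:
            f.write(content)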
Example #2
def getSkippedWordInfo(baseFolder):
    output = []
    splitter = "\n------\n"

    output.append("Greek:\n")
    output.append(
        utils.getContent("output/greek/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append("\nPoetry:")
    output.append(
        utils.getContent(
            "output/greek/no_split/top250+p/chosenWordInfoPoetry.txt", False))
    output.append(splitter)
    output.append("English:\n")
    output.append(
        utils.getContent("output/english/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append("\nPoetry:")
    output.append(
        utils.getContent(
            "output/english/no_split/top250+p/chosenWordInfoPoetry.txt",
            False))
    output.append(splitter)
    output.append("Icelandic:\n")
    output.append(
        utils.getContent("output/icelandic/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append(splitter)

    utils.safeWrite("%s/skippedWords.txt" % baseFolder, "\n".join(output))
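utils.getContent is the reading counterpart; judging by the call sites, the second argument appears to toggle JSON parsing (False returns the raw text, True returns the parsed object). A sketch under that assumption:

import json

def getContent(path, parseJSON):
    # Assumed behavior: read a file, optionally parsing it as JSON.
    with open(path, "r") as f:
        raw = f.read()
    return json.loads(raw) if parseJSON else raw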
Example #3
def getAuthorBookCounts(baseFolder):
    ab_counts_output = []
    splitter = "\n------\n"

    ab_counts_output.append("Greek:\n")
    ab_counts_output.append(
        utils.getContent("output/greek/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/greek/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)
    ab_counts_output.append("English:\n")
    ab_counts_output.append(
        utils.getContent("output/english/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/english/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)
    ab_counts_output.append("Icelandic:\n")
    ab_counts_output.append(
        utils.getContent("output/icelandic/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/icelandic/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)

    utils.safeWrite("%s/AuthorBookNumbers.txt" % baseFolder,
                    "\n".join(ab_counts_output))
Example #4
def makeTopAuthorTable(topStr, baseFolder):
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" % (
        topStr)
    allAuthorSims = utils.getContent(fname, False).split("\n")

    topAuthorPairs = []

    topAuthorPairs.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
  \\begin{tabular}{| r | l | l | l | l |} \\hline
  & \\textbf{Author 1} & \\textbf{Author 2} & \\textbf{Score} & \\textbf{Notes}  \\\\\\hline
""")

    for i, pair in enumerate(allAuthorSims[:10]):
        splt1 = pair.split(" - ")
        sim = splt1[0]
        auths = splt1[1].split(" (")[0].split(", ")
        topAuthorPairs.append("  %.2d & %s & %s & %s & TODO \\\\\\hline" %
                              (i + 1, auths[0], auths[1], sim))

    topAuthorPairs.append("""
  \\end{tabular}
  \\caption{Top author pairs by similarity score according to Jensen-Shannon Similarity.}
  \\label{table:top_author_pairs}
\\end{table}
    """)

    utils.safeWrite("%smetric/topAuthorPairs.tex" % baseFolder,
                    "\n".join(topAuthorPairs))
Example #5
def downloadText(textName, textSource, startBook, endBook, increments):
    books = []
    if (increments is not None):
        # if there are multiple cards, use them all; otherwise it is a single book
        if (len(increments) >= 1):
            for i in range(len(increments)):
                index = increments[i]
                url = textSource + str(index)
                bookResult = utils.parse_TEI(utils.get_TEI_XML(url), textName,
                                             1, True, index)
                books.extend(bookResult)
        else:
            url = textSource
            bookResult = utils.parse_TEI(utils.get_TEI_XML(url), textName, 1,
                                         False, 0)
            books.extend(bookResult)
    else:
        books = []
        for i in range(startBook - 1, endBook):
            index = i + 1
            url = textSource + str(index)
            bookResult = utils.parse_TEI(utils.get_TEI_XML(url), textName,
                                         index, False, 0)
            books.extend(bookResult)

    print "Lines: " + str(len(books))
    outFileName = utils.getTextFn(textName)
    utils.safeWrite(outFileName, json.dumps(books))
Example #6
def printAuthorWords(authors, targets, saveDir):
    for author in authors:
        aname = author.authorName
        for targetTokenName in targets[aname]:
            targetToken = targetTokenName.split("_")[-1]
            matches = []
            for i, token in enumerate(author.allTokens):
                if token == targetToken:
                    matches.append(getTokenContext(i, author.allTokens))

            orders = [
                ("", None),
                ("_after", lambda x: x[2]),  # after context
                ("_before", lambda x: x[3]),  # reverse of before context
            ]
            for order in orders:
                oname, keyFunc = order
                if keyFunc is None:
                    myMatches = matches
                else:
                    myMatches = sorted(matches, key=keyFunc)
                output = []
                for match in myMatches:
                    output.append(" ~~ ".join(match[:3]))
                fname = saveDir + ("wordOccurrences%s/%s_%s.txt" %
                                   (oname, targetTokenName, aname))
                utils.safeWrite(fname, "\n".join(output))
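getTokenContext is defined elsewhere; the sort keys above (index 2 for the following context, index 3 for the reversed preceding context) suggest a tuple layout like the following sketch, a plausible reconstruction rather than the actual helper:

def getTokenContext(i, tokens, width=5):
    # Hypothetical reconstruction: printable before/token/after strings, plus
    # the before-context in reversed word order so that sorting on it groups
    # matches by the word immediately preceding the target.
    before = tokens[max(0, i - width):i]
    after = tokens[i + 1:i + 1 + width]
    return (" ".join(before), tokens[i], " ".join(after),
            " ".join(reversed(before)))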
Example #7
def runMLAlgorithms(X, tokens, y, groups, type_name, category_name, saveDir):
    X = np.array(X)
    y = np.array(y)

    # get word counts for naive bayes
    counts = []
    for i in range(len(X)):
        counts.append(X[i]*tokens[i])
    counts = np.array(counts)

    # 9 splits yields ~10 test authors per fold
    kf = GroupKFold(n_splits=9)

    # Ensure no work has books (segments) in both the training and test set.
    splits = list(kf.split(X, None, groups))

    output = []
    output.append("Average results for %s (%s) across %d folds:" % (category_name, type_name, len(splits)))

    # Run Majority Class
    output.append(majorityClass(X, y, splits, saveDir))

    # Run KNN
    output.append(knn(X, y, splits, saveDir))

    # Run Naive Bayes
    try:
        output.append(naiveBayes(counts, y, splits, saveDir))
    except ValueError:
        output.append("  Failed    - Naive Bayes")



    fname = saveDir + "res_%s_%s.txt" % (category_name, type_name)
    utils.safeWrite(fname, "\n".join(output))
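GroupKFold provides exactly the guarantee the comment above relies on: rows that share a group id never end up on both sides of a split. A minimal self-contained check with synthetic data (not the project's):

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 1, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])  # e.g. one id per work

for trainIdx, testIdx in GroupKFold(n_splits=3).split(X, y, groups):
    # No group id appears in both the training and test indices.
    assert not set(groups[trainIdx]) & set(groups[testIdx])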
Example #8
def convertBook(loc):
    filename = loc.replace(RAW_FOLDER, "")
    newLoc = PARSED_FOLDER + filename
    t = utils.XMLText(loc)
    res = t.convertFromXML()
    utils.safeWrite(newLoc, res, True)
    return newLoc, res["booksRaw"]
Example #9
def combineTexts(textName, sourceTexts):

    allLines = []
    for source in sourceTexts:
        inFileName = utils.getTextFn(source)
        lines = utils.getContent(inFileName, True)
        allLines.extend(lines)

    jsonDump = json.dumps(allLines)
    outFileName = utils.getTextFn(textName)
    utils.safeWrite(outFileName, jsonDump)
Example #10
def makeMLTable(source, norm, filename):
    output = []

    output.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
""")

    # Skip the Naive Bayes column when the data is normalized, since normalized values can be negative
    if norm:
        output.append("  \\begin{tabular}{| r | l | l |} \\hline")
        output.append(
            "  \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN}  \\\\\\hline"
        )
    else:
        output.append("  \\begin{tabular}{| r | l | l | l |} \\hline")
        output.append(
            "  \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN} & \\textbf{Naive Bayes}  \\\\\\hline"
        )

    for t in ["Authors", "Books", "Books_2"]:
        cats = ["genre", "dialect", "timeframe"]
        if (t == "Books"):
            cats.append("author")
        if (t == "Books_2"):
            cats = ["work", "genre", "dialect", "timeframe", "author"]

        for cat in cats:
            fname = source + "res_%s_%s.txt" % (cat, t)
            lines = utils.getContent(fname, False).split("\n")
            maj_class = lines[1].split(" - ")[0].strip()
            knn = lines[2].split(" - ")[0].strip()
            naive_bayes = lines[3].split(" - ")[0].strip()

            t_name = t
            if t_name == "Books":
                t_name = "Segments"
            if t_name == "Books_2":
                t_name = "Segments*"
            if norm:
                output.append(" %s of %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn))
            else:
                output.append(" %s of %s & %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn, naive_bayes))

    output.append("""
  \\end{tabular}
  \\caption{Results of running simple machine learning on the frequency data.}
  \\label{table:ml+p}
\\end{table}
    """)

    utils.safeWrite(filename, "\n".join(output))
Example #11
def calcTopWordOverlapOverTime(language):
    saveDirBase = mp.languageInfo[language]["saveDir"]

    print("  Loading data...", end=" ", flush=True)
    authors, books, topWords = loadWCData(saveDirBase, -1, "commonWords", "")
    print("done")

    centWordCounts = {}
    for author in authors:
        cent = toCent(author.authorName)
        if not (cent in centWordCounts):
            centWordCounts[cent] = newEmptyWordCounts(topWords)

        for i, w in enumerate(topWords):
            count = author.counts[i]
            if (count > 0):
                centWordCounts[cent][w] += count

    centTopWords = {}
    for cent in centWordCounts:
        wc = centWordCounts[cent]
        wordList = []
        for w in wc:
            # word, count
            wordList.append([w, wc[w]])

        sortedWordList = sorted(wordList, key=lambda x: x[1], reverse=True)
        topWordList = list(map(lambda x: x[0], sortedWordList))
        centTopWords[cent] = set(topWordList[:NUM_TOP_WORDS])

    centuries = []
    for cent in centTopWords:
        centuries.append(cent)

    centuries = sorted(centuries)

    cells = []
    for i, c1 in enumerate(centuries):
        row = []
        for j, c2 in enumerate(centuries):
            overlap = 0
            wordSet1 = centTopWords[c1]
            wordSet2 = centTopWords[c2]
            for w in wordSet1:
                if (w in wordSet2):
                    overlap += 1
            row.append("%d" % overlap)
        cells.append(row)

    output = asciiTable(centuries, centuries, cells)

    utils.safeWrite(saveDirBase + "topWordOverlapOverTime.txt", output)
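asciiTable is defined elsewhere; given the call above (row labels, column labels, and a matrix of cell strings), a minimal plain-text sketch might look like:

def asciiTable(rowLabels, colLabels, cells):
    # Hypothetical reconstruction: fixed-width columns, labels down the side.
    width = max(len(str(x)) for x in list(colLabels) + list(rowLabels)) + 2
    lines = ["".ljust(width) + "".join(str(c).ljust(width) for c in colLabels)]
    for label, row in zip(rowLabels, cells):
        lines.append(str(label).ljust(width) +
                     "".join(str(c).ljust(width) for c in row))
    return "\n".join(lines)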
Example #12
def getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  simMetrics, baseFolder):
    # Copy full eval files for jensen-shannon
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_tops.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_+p.txt"
        % (topStr, baseFolder),
        shell=True)

    # Grab median distance
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt" % (
        topStr)
    metricEvalInfo = utils.getContent(
        fname, False).split("=========")[-2].split("\n")[2:-1]
    sameAuthorRanks = []
    for i, line in enumerate(metricEvalInfo):
        sameAuthorRank = line.split("with same author: ")[1].split(".")[0]
        sameAuthorRanks.append(int(sameAuthorRank))

    median = np.median(sameAuthorRanks)

    utils.safeWrite(
        "%smetric/extraInfo/medianForDifferentAuthor.txt" % (baseFolder),
        "Median distance for closest author: %f" % median)

    # get info on the Indica
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/sims/Arrian.Indica.1.txt %smetric/extraInfo/arrianIndica.txt"
        % (topStr, baseFolder),
        shell=True)

    # Info on book distance
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/sims.txt" % (
        topStr)
    allBookSims = utils.getContent(fname, False).split("\n")

    utils.safeWrite("%smetric/lowestSimilarity.txt" % (baseFolder),
                    "Lowest similarity between segments: %s" % allBookSims[-1])

    # Info on top similar authors
    makeTopAuthorTable(topStr, baseFolder)

    # ===============================

    makeMetricEvalTables("", topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder)
Example #13
def getAllTokenCounts(authors, saveDir):
    allTokenCounts = {}
    poetryTokenCounts = {}
    totalTokens = 0

    # for each author, keep track of counts; also keep track of prose/poetry
    for i in range(len(authors)):
        author = authors[i]
        totalTokens += len(author.allTokens)
        # print("%s: %d" %(author.authorName, len(author.allTokens)))
        allTokens = author.allTokens

        totalTokenCount = 0
        tokenCounts = {}
        for token in allTokens:

            totalTokenCount += 1
            if (token in tokenCounts):
                tokenCounts[token] = tokenCounts[token] + 1
            else:
                tokenCounts[token] = 1

            if (token in allTokenCounts):
                allTokenCounts[token] = allTokenCounts[token] + 1
            else:
                allTokenCounts[token] = 1

            if (toGenre(author.authorName) == 1):
                if (token in poetryTokenCounts):
                    poetryTokenCounts[token] = poetryTokenCounts[token] + 1
                else:
                    poetryTokenCounts[token] = 1

        author.tokenCounts = tokenCounts

        author.totalTokenCount = totalTokenCount

        # print("Results for %s:" % author.authorName)
        # print(len(allTokens))
        # print("---")
    typeTokenInfo = []
    typeTokenInfo.append("Total tokens: %d" % totalTokens)
    typeTokenInfo.append("Total types: %d" % len(allTokenCounts))

    typeTokenInfoStr = "\n".join(typeTokenInfo)
    print(typeTokenInfoStr)

    utils.safeWrite(saveDir + "numberOfTypes_Tokens.txt", typeTokenInfoStr)

    return allTokenCounts, poetryTokenCounts
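The increment-or-initialize pattern above is what collections.Counter does in one step; a condensed equivalent of the per-author counting, shown on a stand-in token list:

from collections import Counter

tokens = ["the", "of", "the", "and", "the"]  # stand-in for author.allTokens
tokenCounts = Counter(tokens)                # Counter({"the": 3, "of": 1, "and": 1})
totalTokenCount = sum(tokenCounts.values())  # 5 tokens
numTypes = len(tokenCounts)                  # 3 types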
Example #14
def getTextCounts(textLocation, saveDir):
    subprocess.run("cp %savailable.json %savailable.json" %
                   (textLocation, saveDir),
                   shell=True)

    available = utils.getContent(textLocation + "available.json", True)
    # For each available text
    for i, o in enumerate(available):
        if (i % 20 == 0):
            print(i, end=" ", flush=True)

        workLocs = o["works"]

        # Process each work
        for w in workLocs:
            t = utils.getContent(w["location"], True)

            booksRaw = t["booksRaw"]
            booksCounts = []
            for b in booksRaw:
                rawTokens = re.sub(r'[.,;:᾽῾\']', "", b["bookText"]).split(" ")
                tokenCounts = {}
                for token in rawTokens:
                    if (token == ""):
                        continue

                    if not (token in tokenCounts):
                        tokenCounts[token] = 1
                    else:
                        tokenCounts[token] += 1

                bookWithCounts = {}
                bookWithCounts["bookNumber"] = b["bookNumber"]
                bookWithCounts["bookTokenCounts"] = tokenCounts
                bookWithCounts["bookText"] = ""

                booksCounts.append(bookWithCounts)

            t["booksRaw"] = booksCounts

            # Remove "texts/" from start
            filename = "textCounts/" + w["location"][6:]
            utils.safeWrite(filename, t, True)
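Note the character class in the re.sub call above (the bracketed form); without brackets the pattern would only match the whole punctuation sequence appearing consecutively, not each mark on its own. A quick check:

import re

text = "word, another; end."
print(re.sub(r'[.,;:᾽῾\']', "", text))  # "word another end" - each mark stripped
print(re.sub(r'\.,;:᾽῾\'', "", text))   # unchanged - the full sequence never occurs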
Example #15
def getOverlapInfo(baseFolder):
    output = []
    splitter = "\n------\n"

    output.append("Greek:\n")
    output.append(
        utils.getContent("output/greek/topWordOverlapOverTime.txt", False))
    output.append(splitter)
    output.append("English:\n")
    output.append(
        utils.getContent("output/english/topWordOverlapOverTime.txt", False))
    output.append(splitter)
    output.append("Icelandic:\n")
    output.append(
        utils.getContent("output/icelandic/topWordOverlapOverTime.txt", False))
    output.append(splitter)

    utils.safeWrite("%s/topWordOverlapOverTime.txt" % baseFolder,
                    "\n".join(output))
Example #16
def printKeyWords(dataSplit, top, subsetSize, language, saveDirBase):
    topName, _, _ = top
    # calculate save directory based on input parameters
    saveDir = saveDirBase + "%s" % (topName)

    keyWordsDir = saveDir + "/wordImportance/keyWords/"

    # find all the relevant json files
    files = os.listdir(keyWordsDir)
    for f in files:
        if f[-5:] == ".json":
            nameCore = f.split(".json")[0]

            # get the word info for this author pair
            words = utils.getContent(keyWordsDir + f, True)

            # get the authors
            authors = nameCore.split("_")
            a1 = authors[0]
            a2 = authors[1]
            print(a1, a2)

            # save dir for new files
            wordsDir = keyWordsDir + nameCore + "/"

            # gather the list of words and print them out along with percentiles
            wordList = []
            out = ["index, percentile, token"]
            for word in words:
                wordList.append("%03d_%s" % (words[word][0] + 1, word))
                out.append("%d, %.2f, %s" %
                           (words[word][0], words[word][1], word))

            utils.safeWrite(wordsDir + "words.txt", "\n".join(out))

            # get the info for each occurrence of the given words
            # associated with these authors
            target = {
                a1: wordList,
                a2: wordList,
            }
            printOccs(wordsDir, target, language)
Example #17
def getTopWords(N, tokenInfo, name, saveDir):
    if (N == 0):
        return []

    sortedTokenInfo = sorted(tokenInfo, key=lambda x: x[1], reverse=True)

    chosenCutoff = 0.5
    nextCutoff = 0.6
    chosen, skipped = getTokensForCutoff(sortedTokenInfo, N, chosenCutoff, 0)
    _, nextSkipped = getTokensForCutoff(sortedTokenInfo, N, nextCutoff,
                                        chosenCutoff)

    info = []
    info.append("Skipped (from cutoff of %f):" % chosenCutoff)
    for tokenInfo in skipped:
        word = tokenInfo[0]
        authorFrequency = tokenInfo[2]
        info.append("  %s (appears in %d%% of authors)" %
                    (word, 100 * authorFrequency))

    info.append("")

    info.append("Would skip (from cutoff of %f):" % nextCutoff)
    for tokenInfo in nextSkipped:
        word = tokenInfo[0]
        authorFrequency = tokenInfo[2]
        info.append("  %s (appears in %d%% of authors)" %
                    (word, 100 * authorFrequency))

    fname = "%schosenWordInfo%s.txt" % (saveDir, name)
    utils.safeWrite(fname, "\n".join(info))

    tops = chosen  # sortedTokenInfo[0:N]

    topWords = list(map(lambda x: x[0], tops))

    return topWords
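getTokensForCutoff is defined elsewhere. From the two call sites (it returns the chosen tokens plus those skipped, and the fourth argument bounds the skip window from below), a plausible reconstruction:

def getTokensForCutoff(sortedTokenInfo, N, cutoff, lowerBound):
    # Hypothetical reconstruction: walk tokens in frequency order, keeping the
    # first N that appear in at least `cutoff` of authors; tokens passed over
    # whose author frequency falls in [lowerBound, cutoff) are reported as skipped.
    chosen, skipped = [], []
    for info in sortedTokenInfo:
        if len(chosen) == N:
            break
        authorFrequency = info[2]
        if authorFrequency >= cutoff:
            chosen.append(info)
        elif authorFrequency >= lowerBound:
            skipped.append(info)
    return chosen, skipped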
Example #18
def storeFreqResults(authors, books, saveDir, topWords):
    # store frequencies
    authorOutput = []
    for author in authors:
        authorOutput.append(author.authorName + "," +
                            ",".join(map(str, author.featureData)))

    utils.safeWrite(saveDir + "authorFreqs.txt", "\n".join(authorOutput))

    bookOutput = []
    for book in books:
        if (book.numTokens >= mp.MIN_TOKENS_NECESSARY):
            bookOutput.append(book.getShortName() + "," +
                              ",".join(map(str, book.featureData)))

    utils.safeWrite(saveDir + "bookFreqs.txt", "\n".join(bookOutput))

    utils.safeWrite(saveDir + "topWords.txt", ",".join(topWords))
Example #19
            sheet[spec] = cell

    wb.remove(wb["Sheet"])
    wb.save(filename)


# Check for db
if (not (utils.fileExists(DB_LOCATION))):
    raise Exception("Please add a database at '" + DB_LOCATION + "'")

# Create input folder and subfolders
utils.check_and_create_path("input/articles")
utils.check_and_create_path("input/illustrations")

utils.safeWrite(
    "input/articles/README.md",
    "Place articles here. Articles should be included in a text file and the filename should be [lemma].txt, where [lemma] is the lemma the article is about. For example, the article for ἄβουλος should be in ἄβουλος.txt"
)
utils.safeWrite(
    "input/illustrations/README.md",
    "Place illustrations here. Illustrations should be a .jpg, .gif, or .png with the name of the lemma they are an illustraiton for. For example, the image for ἄβουλος should be in ἄβουλος.png (or ἄβουλος.gif or ἄβουλος.jpg)"
)

# Create lemmata xlsx
lemma_info = []
lemma_info.append([
    "Matched", "Lemma", "Short Definition", "Compounds", "Roots", "Sphere",
    "Part of Communication", "Frequency", "Illustration Caption",
    "Bibliography", "Notes"
])

print("Getting Tokens...")
Example #20
def parseEnglish():
    # If True, we split each author into author_prose and author_poetry based on text genre.
    # If False, we determine the author's majority genre and remove texts not of that genre.
    splitGenre = False


    RAW_FOLDER = "../rawTexts/english/"
    PARSED_FOLDER = "english/"

    authors = {}
    allBooks = []
    numTexts = 0
    numTexts2 = 0
    available = []

    # ==============================================================================
    # ==============================================================================
    # get all Gutenberg texts
    gutenbergFolder = RAW_FOLDER + "Gutenberg/txt/"
    textList = os.listdir(gutenbergFolder)

    i = 0
    for author in authorWorks:
        baseAuthorName = author["authorName"]

        # determine whether this author has texts of both genre
        numProse = 0
        numPoetry = 0
        for text in author["works"]:
            workName = text["textName"]
            if (text["genre"] == 0):
                numProse += 1
            elif (text["genre"] == 1):
                numPoetry += 1

        specifyGenre = numProse > 0 and numPoetry > 0
        moreProse = numProse > numPoetry


        for text in author["works"]:
            authorName = baseAuthorName

            # if we are splitting authors by genre, append genre to the "author"
            # of this book as necessary
            if (splitGenre):
                if (specifyGenre):
                    if (text["genre"] == 0):
                        authorName += "_Prose"
                    else:
                        authorName +=  "_Poetry"

            else:  # ignore texts of minority genre for this author
                if (not(moreProse) and text["genre"] == 0) or (moreProse and text["genre"] == 1):
                    continue


            # Create a work with each of the given books
            workName = text["textName"]
            newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
            workObject = {"name": workName, "location": "texts/" + newLocation}
            numTexts += 1

            books = []
            bookIndex = 1
            for b in text["books"]:
                books.append((gutenbergFolder + b, bookIndex))
                bookIndex += 1

            if authorName in authors:
                authors[authorName]["works"].append(workObject)
            else:
                authors[authorName] = {"author": authorName, "works": [workObject]}



            print(i, end=" ", flush=True)
            # Create a new gutenberg text for this text.
            try:
                t = utils.GutenbergText(authorName, workName, books)
                numTexts2 += len(books)
                res = t.convert()
                allBooks.extend(res["booksRaw"])
                utils.safeWrite(newLocation, res, True)
            except Exception as e:
                print(newLocation)
                print(e)

            i += 1

    # ==============================================================================
    # ==============================================================================
    # get Shakespeare
    for playType in ["comedies", "historical", "tragedies"]:
        shakeFolder = RAW_FOLDER + "ShakespearePlaysPlus/%s/" % playType
        textList = os.listdir(shakeFolder)

        for textName in textList:
            if (textName[-4:] == ".txt"):
                numTexts += 1
                authorName = "Shakespeare"
                workName = textName.replace(" ", "_")
                print(workName)
                newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
                workObject = {"name": workName, "location": "texts/" + newLocation}
                if authorName in authors:
                    authors[authorName]["works"].append(workObject)
                else:
                    authors[authorName] = {"author": authorName, "works": [workObject]}

                t = utils.ShakespeareText(authorName, workName, shakeFolder + textName)
                res = t.convert()
                allBooks.extend(res["booksRaw"])
                utils.safeWrite(newLocation, res, True)

    # ==============================================================================
    # ==============================================================================
    # get Middle English-y texts
    middleFolder = RAW_FOLDER + "ME/"
    textList = os.listdir(middleFolder)

    for textName in textList:
        split = textName[:-4].split("___")
        if (textName[-4:] == ".txt"):
            numTexts += 1
            authorName = split[0].replace(" ", "_")
            workName = split[1].replace(" ", "_")
            print(workName)
            newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
            workObject = {"name": workName, "location": "texts/" + newLocation}
            if authorName in authors:
                authors[authorName]["works"].append(workObject)
            else:
                authors[authorName] = {"author": authorName, "works": [workObject]}

            t = utils.METext(authorName, workName, middleFolder + textName)
            res = t.convert()
            allBooks.extend(res["booksRaw"])
            utils.safeWrite(newLocation, res, True)



    # ==============================================================================
    # Old English and 21st century corpus did not end up being included.

    # get 21st century texts
    # tfFolder = RAW_FOLDER + "21st/"
    # textList = os.listdir(tfFolder)
    #
    # for textName in textList:
    #     split = textName[:-4].split("___")
    #     if (textName[-4:] == ".txt"):
    #         numTexts += 1
    #         authorName = split[0].replace(" ", "_")
    #         workName = split[1].replace(" ", "_")
    #         print(workName)
    #         newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
    #         workObject = {"name": workName, "location": "texts/" + newLocation}
    #         if authorName in authors:
    #             authors[authorName]["works"].append(workObject)
    #         else:
    #             authors[authorName] = {"author": authorName, "works": [workObject]}
    #
    #         t = utils.TFText(authorName, workName, tfFolder + textName)
    #         res = t.convert()
    #         allBooks.extend(res["booksRaw"])
    #         utils.safeWrite(newLocation, res, True)


    # Old English
    # oeFolder = RAW_FOLDER + "OE/"
    # textList = os.listdir(tfFolder)
    #
    # for textName in textList:
    #     split = textName[:-4].split("___")
    #     if (textName[-4:] == ".txt"):
    #         numTexts += 1
    #         authorName = split[0].replace(" ", "_")
    #         workName = split[1].replace(" ", "_")
    #         print(workName)
    #         newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
    #         workObject = {"name": workName, "location": "texts/" + newLocation}
    #         if authorName in authors:
    #             authors[authorName]["works"].append(workObject)
    #         else:
    #             authors[authorName] = {"author": authorName, "works": [workObject]}
    #
    #         t = utils.OEText(authorName, workName, oeFolder + textName)
    #         res = t.convert()
    #         allBooks.extend(res["booksRaw"])
    #         utils.safeWrite(newLocation, res, True)
    #

    for author in authors:
        available.append(authors[author])

    utils.safeWrite(PARSED_FOLDER + "available.json", available, True)
    print("Done.")

    # Optionally count the characters in the corpus. This is done to find weird
    # Unicode artifacts to make sure it gets removed in the cleaning step.
    countChars = False  # set to True to count characters
    if countChars:
        print("Counting Chars")
        chars = {}
        for b in allBooks:
            bookText = b["bookText"]
            for char in bookText:
                chars[char] = True

        sortedChars = sorted(list(chars.keys()))


        for c in sortedChars:
            utils.printUnicodeChar(c)
        print("======")

        # If true, show the set of unique characters when things are decomposed
        if False:
            decomposedChars = {}
            for c in sortedChars:
                res = utils.fullyDecomposeUnicodeChar(c)
                for newC in res:
                    decomposedChars[newC] = True

            sortedDecompChars = sorted(list(decomposedChars.keys()))

            for c in sortedDecompChars:
                utils.printUnicodeChar(c)
Example #21
def parseIcelandic():
    RAW_FOLDER = "../rawTexts/icelandic/"
    PARSED_FOLDER = "icelandic/"

    authors = {}
    allBooks = []
    numTexts = 0
    numTexts2 = 0
    available = []

    allNames = []

    # ==============================================================================
    # ==============================================================================
    # get texts from icepahc
    icepahcFolder = RAW_FOLDER + "icepahc-v0.9/txt/"

    for text in icepahcList:
        numTexts += 1
        authorName = text["author"]
        workName = text["title"]
        id = text["id"]

        textName = id + ".txt"
        allNames.append(workName)
        newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
        workObject = {"name": workName, "location": "texts/" + newLocation}
        if authorName in authors:
            authors[authorName]["works"].append(workObject)
        else:
            authors[authorName] = {"author": authorName, "works": [workObject]}

        t = utils.IcepahcText(authorName, workName, icepahcFolder + textName)
        res = t.convert()
        allBooks.extend(res["booksRaw"])
        utils.safeWrite(newLocation, res, True)


    # ==============================================================================
    # ==============================================================================
    # get texts from sagas
    sagasFolder = RAW_FOLDER + "textar/fornritin/xml/"

    for text in sagasList:
        numTexts += 1
        if (text["id"] == "F1E"):
            authorName == "Snorri_Sturluson"
        else:
            authorName = "Anon_" + text["id"]
        workName = text["title"].strip().replace(" ", "_")
        allNames.append(workName + "#" + text["id"])
        newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
        workObject = {"name": workName, "location": "texts/" + newLocation}
        if authorName in authors:
            authors[authorName]["works"].append(workObject)
        else:
            authors[authorName] = {"author": authorName, "works": [workObject]}

        t = utils.SagasText(authorName, workName, sagasFolder + text["id"] + ".xml")
        res = t.convert()
        allBooks.extend(res["booksRaw"])
        utils.safeWrite(newLocation, res, True)

    # ==============================================================================
    # ==============================================================================
    # get books from MIM corpus
    modernBooksFolder = RAW_FOLDER + "MIM/baekur/"

    # For each text in the list, download it
    for text in modernBookList:
        numTexts += 1
        authorName = text["author"].strip().replace(" ", "_")
        workName = text["title"].strip().replace(" ", "_")

        allNames.append(workName + "#" + text["id"])
        newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
        workObject = {"name": workName, "location": "texts/" + newLocation}
        if authorName in authors:
            authors[authorName]["works"].append(workObject)
        else:
            authors[authorName] = {"author": authorName, "works": [workObject]}

        # MIM texts are in the same format as Saga texts so this works fine.
        t = utils.SagasText(authorName, workName, modernBooksFolder + text["id"] + ".xml")
        res = t.convert()
        allBooks.extend(res["booksRaw"])
        utils.safeWrite(newLocation, res, True)

    for author in authors:
        available.append(authors[author])

    utils.safeWrite(PARSED_FOLDER + "available.json", available, True)
    print("Done.")


    # Optionally count the characters in the corpus. This is done to find weird
    # Unicode artifacts to make sure it gets removed in the cleaning step.
    countChars = False  # set to True to count characters
    if countChars:
        print("Counting Chars")
        chars = {}
        for b in allBooks:
            bookText = b["bookText"]
            for char in bookText:
                chars[char] = True

        sortedChars = sorted(list(chars.keys()))


        for c in sortedChars:
            utils.printUnicodeChar(c)
        print("======")

        # If true, show the set of unique characters when things are decomposed
        if False:
            decomposedChars = {}
            for c in sortedChars:
                res = utils.fullyDecomposeUnicodeChar(c)
                for newC in res:
                    decomposedChars[newC] = True

            sortedDecompChars = sorted(list(decomposedChars.keys()))

            for c in sortedDecompChars:
                utils.printUnicodeChar(c)
Example #22
def cleanAndCombineFeatures(texts, approach):

    matrix = []

    textNames = []

    featureNames = []

    numTexts = len(texts)
    # for all the texts
    for i in range(numTexts):
        text = texts[i]
        textName = text["textName"]
        divideByBook = text["divideByBook"]
        toBeCombined = text["toBeCombined"]

        if (toBeCombined or textName == "Iliad1" or textName == "Odyssey1"):
            continue

        ofn = generalUtils.getTextFeatureDataOdikonFn(textName, approach)
        tfn = generalUtils.getTextFeatureDataTamnonFn(textName)

        odikonFeaturesRaw = generalUtils.getContent(ofn, True)
        tamnonFeaturesRaw = generalUtils.getContent(tfn, True)

        if (len(odikonFeaturesRaw) != len(tamnonFeaturesRaw)):
            raise Exception("Number of subtexts for " + textName + " do not match")

        # for each set of features (the books plus the overall text)
        for j in range(len(odikonFeaturesRaw)):
            # get the raw features for this subtext
            ro = odikonFeaturesRaw[j]
            rt = tamnonFeaturesRaw[j]

            # determine the names for these two texts and make sure they match
            roString = ro["TextName"] + ": " + ro["SubName"]
            rtString = rt["TextName"] + ": " + rt["SubName"]
            if (roString != rtString):
                raise Exception("Book mismatch! " + roString + " and " + rtString)

            # add the cleaned features to the row
            row = []
            row.extend(cleanRawOdikon(ro, False))
            row.extend(cleanRawTamnon(rt, False))
            matrix.append(row)
            textNames.append(roString)

            # and one time, get the list of feature names.
            if (i == 0 and j == 0):
                featureNames.extend(cleanRawOdikon(ro, True))
                featureNames.extend(cleanRawTamnon(rt, True))

    # output the information.
    print "Number of Features: %d." % len(matrix[0])
    output = {
    "rowNames": textNames,
    "matrix": matrix,
    "featureNames": featureNames
    }
    fName = generalUtils.getFeatureMatrixFn()
    generalUtils.safeWrite(fName, json.dumps(output))
Example #23
def getCenturyInfo(topStr, baseFolder):
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_no_labels.pdf %scentury/centuriesGreek.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_labels.pdf %scentury/extraInfo/Greek_CenturyOverall_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Greek_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)

    # -------------------------
    # Century similarity data
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek+p_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek+p_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/centuriesGreek2.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels_violin.pdf %scentury/centuriesGreekViolin.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/English_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesEnglish.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesEnglishViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/English_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Icelandic_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesIcelandic.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesIcelandicViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Icelandic_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get pvalue + other regression information for charts
    greekPval = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)
    icelandicPval = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)
    pvalOutput.append("Icelandic:")
    pvalOutput.append(icelandicPval)

    utils.safeWrite("%scentury/century_pvals.txt" % baseFolder,
                    "\n".join(pvalOutput))
Example #24
def keyAuthorComparisonWithImportance(authors, books, baseSaveDir, splitParam,
                                      topWords):
    makeWordImportanceGraphs = False
    keyAuthData = getKeyAuthorData(authors, books)
    saveDir = baseSaveDir + "wordImportance/"
    allDiffLineData = {}
    allCumulDiffLineData = {}
    allRCumulDiffLineData = {}
    allPercentageLineData = {}

    # load diffs for plotting internal similarities
    allDiffsFilename = baseSaveDir + "dists/diffLists.json"
    allDiffs = utils.getContent(allDiffsFilename, True)

    # For each set of key authors, make necessary visualizations
    for dat in keyAuthData:
        data, _, dataLabels, chartFileName = dat

        print("    %s..." % chartFileName)
        numWords = len(topWords)
        numTexts = len(dataLabels)
        tickLabels = topWords
        distsFilename = baseSaveDir + "dists/" + chartFileName + ".json"
        dists = utils.getContent(distsFilename, True)
        # dists = [
        #     {"name": "D1", "vals": (np.random.random((numWords))*1.5 - 0.5)},
        #     {"name": "D2", "vals": (np.random.random((numWords))*1.5 - 0.5)}
        # ]
        for d in dists:
            d["vals"] = np.array(d["vals"])

        if (makeWordImportanceGraphs):
            graphUtils.wordImportanceComparison(data, dataLabels, tickLabels,
                                                dists, saveDir + "unsorted/",
                                                chartFileName, True)

        # display versions sorted by each metric
        for d in dists:
            sortedSaveDir = saveDir + d["name"] + "-sorted/"
            fname = chartFileName
            sortedInds = np.array(
                list(
                    map(
                        lambda x: x[0],
                        sorted(enumerate(d["vals"]),
                               key=lambda x: x[1][0],
                               reverse=True))))

            data1 = copy.deepcopy(data)
            tickLabels1 = copy.deepcopy(tickLabels)
            wordsUsed = len(topWords)
            # If the similarity metric includes remainder, we have to add it
            if (len(dists[0]["vals"]) == len(data[0]) + 1):
                newData = []
                for row in data1:
                    r = np.append(row, 1 - np.sum(row))
                    newData.append(r)
                data1 = newData

                tickLabels1.append("Remainder")
                wordsUsed += 1

            data2 = list(map(lambda x: np.array(x)[sortedInds], data1))
            tickLabels2 = np.array(tickLabels1)[sortedInds]
            dists2 = copy.deepcopy(dists)
            percentiles = []
            for d2 in dists2:
                d2["vals"] = np.copy(d2["vals"])[sortedInds]

            if (makeWordImportanceGraphs):
                graphUtils.wordImportanceComparison(data2, dataLabels,
                                                    tickLabels2, dists2,
                                                    sortedSaveDir, fname, True)

            # save all words
            if d["name"] == "Jensen-shannon":
                fname = saveDir + "keyWords/" + chartFileName + ".json"
                SimDiff = {}
                for i, val in enumerate(d["vals"][sortedInds]):
                    SimDiff[tickLabels2[i]] = [i, val[1]]
                utils.safeWrite(fname, SimDiff, True)

            # Diff data
            trueDiffs = np.array(
                list(map(lambda x: x[0], d["vals"][sortedInds])))
            y = (chartFileName, trueDiffs)
            y_cumul = (chartFileName, np.cumsum(trueDiffs))
            linesToGraphDiff = [y]
            linesToGraphDiffCumul = [y_cumul]

            # store info for the chart with all authors
            if d["name"] in allDiffLineData:
                allDiffLineData[d["name"]].extend([y])
            else:
                allDiffLineData[d["name"]] = [y]
            if d["name"] in allCumulDiffLineData:
                allCumulDiffLineData[d["name"]].extend([y_cumul])
            else:
                allCumulDiffLineData[d["name"]] = [y_cumul]

            # dif percentile data
            percentiles = list(map(lambda x: x[1], d["vals"][sortedInds]))
            y = (chartFileName, percentiles)
            linesToGraphPct = [y]

            # store info for the chart with all authors
            if d["name"] in allPercentageLineData:
                allPercentageLineData[d["name"]].append(y)
            else:
                allPercentageLineData[d["name"]] = [y]

            if splitParam == -1:
                # get percentiles for internal consistency of second author
                author1 = dataLabels[0]
                author2 = dataLabels[1]

                authorInternalConsistencies = [
                    # ["split5", author1, "-split5"],
                    # ["split-2", author1, "-splitHalf"],

                    # ["split5", author2, "-split5"],
                    # ["split-2", author2, "-splitHalf"]
                ]

                # Gen information comparing consistencies within given authors.
                for aic in authorInternalConsistencies:
                    a2DiffsFilename = baseSaveDir.replace(
                        "no_split",
                        aic[0]) + "dists/%s_%s_2.json" % (aic[1], aic[1])
                    if (utils.fileExists(a2DiffsFilename)):
                        a2Diffs = utils.getContent(a2DiffsFilename, True)
                        diffNums = None
                        for ad in allDiffs:
                            if ad["name"] == d["name"]:
                                diffNums = ad["allDiffs"]

                        a2RawDiffs = None
                        for ad in a2Diffs:
                            if ad["name"] == d["name"]:
                                a2RawDiffs = ad["vals"]

                        if (diffNums is not None and a2RawDiffs is not None):
                            # Add difference data
                            aicName = aic[1] + aic[2]
                            a2SortedInds = np.array(
                                list(
                                    map(
                                        lambda x: int(x[0]),
                                        sorted(enumerate(a2RawDiffs),
                                               key=lambda x: x[1][0],
                                               reverse=True))))
                            trueDiffs = np.array(
                                list(
                                    map(lambda x: x[0],
                                        np.array(a2RawDiffs)[a2SortedInds])))
                            y_diff = (aicName, trueDiffs)
                            y_diff_cumul = (aicName, np.cumsum(trueDiffs))
                            linesToGraphDiff.append(y_diff)
                            linesToGraphDiffCumul.append(y_diff_cumul)

                            # Add Percentile data
                            a2Percentiles = []
                            for rd in a2RawDiffs:
                                index = bisect.bisect_left(diffNums, rd[0])
                                a2Percentiles.append(
                                    (100.0 * index) / len(diffNums))

                            a2Percentiles = sorted(a2Percentiles, reverse=True)
                            y2 = (aicName, a2Percentiles)
                            linesToGraphPct.append(y2)
                    else:
                        print("File does not exist: \"%s\"" % a2DiffsFilename)

            # Create charts showing differences for various authors
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiff,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-chart",
                                 yLim=None)  #[-0.002, 0]
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  #[-0.002, 0]
            #graphUtils.lineChart(range(wordsUsed), linesToGraphPct, True, sortedSaveDir, chartFileName+"_pct-chart")

            linesToGraphDiffRCumul = []
            for name, c in linesToGraphDiffCumul:
                name = name.replace("-split5", " Local Split")
                name = name.replace("-splitHalf", " Global Split")
                linesToGraphDiffRCumul.append((name, c[-1] - np.array(c)))

            if d["name"] in allRCumulDiffLineData:
                allRCumulDiffLineData[d["name"]].extend(
                    [linesToGraphDiffRCumul])
            else:
                allRCumulDiffLineData[d["name"]] = [linesToGraphDiffRCumul]
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffRCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-r-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  #[-0.002, 0]

    for d in dists:
        # 4-Up Chart for these authors
        sortedSaveDir = saveDir + d["name"] + "-sorted/"
        graphUtils.lineChart4Up(range(wordsUsed),
                                allRCumulDiffLineData[d["name"]],
                                True,
                                sortedSaveDir,
                                "4up-r-cumul",
                                yLim=None,
                                yAdjust=1)

    # Create graph charts for all data in a cloud
    graphTypes = [
        ("all-diffs", allDiffLineData, None, 0),
        ("all-diffs-cumul", allCumulDiffLineData, None, 1),
        #("all-pcts", allPercentageLineData, [0, 100], 0)
    ]
    alls = {}
    for graphType, lineList, yLim, adjust in graphTypes:
        medFilename = baseSaveDir + "dists/median-%s.json" % graphType
        med = utils.getContent(medFilename, True)

        alls[graphType] = {}
        for d in med:
            lineList[d["name"]].append(["Median", d["line"]])
            alls[graphType][d["name"]] = d["all"]

        for name in allPercentageLineData:
            sortedSaveDir = baseSaveDir + "wordImportance/" + name + "-sorted/"
            for log in [False]:  #, True]:
                print("  %s..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType,
                                     yLim=yLim,
                                     log=log,
                                     yAdjust=adjust)
                print("  %s cloud..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType + "-cloud",
                                     yLim=yLim,
                                     allLines=alls[graphType][name],
                                     log=log,
                                     yAdjust=adjust)

    # Create chart showing ignored top words
    n = "Jensen-shannon"
    sortedSaveDir = baseSaveDir + "wordImportance/" + n + "-sorted/"

    # Cumulative
    data = allCumulDiffLineData[n]

    # Add lines
    res = []
    targetSim = -1
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        # "Aristotle_Pindar" in name or

        #"AeliusAristides_Demosthenes", "DioChrysostom_Plato"
        if ("ApolloniusRhodius_QuintusSmyrnaeus" in name
                or "DioChrysostom_Xenophon" == name):
            res.append((name, "-", 1 + c[-1] - np.array(c)))

        # Lowest of our top authors
        if ("DioChrysostom_Xenophon" == name):
            targetSim = c[-1]

    # add median
    # for item in allCumulDiffLineData[n]:
    #     name, c = item
    #     if ("Median" in name):
    #         res.append((name, "-", 1 + c[-1] - np.array(c)))

    # Add line cloud
    resAll = []
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        if not ("Hymns_Dionysus" in name or "Euclid" in name):
            n1, n2 = name.replace("Hymns_", "Hymns").split("_")
            n1 = n1.replace("Hymns", "Hymns_")
            n2 = n2.replace("Hymns", "Hymns_")
            centuryDiff = centDiff(genre.toCent(n1), genre.toCent(n2))
            #print("%s, %s: %d" % (n1, n2, centuryDiff))
            if (centuryDiff >= 4):
                # color top sims differently
                color = "k-"

                resAll.append((name, color, 1 + c[-1] - np.array(c)))

    # for name, c in data:
    #     y = c[-1] - np.array(c)
    #     res.append((name, y))

    #resAll = map(lambda n, c: (n, c[-1] - np.array(c)))
    graphUtils.compareWordUsageChart(res,
                                     True,
                                     sortedSaveDir,
                                     "ignoreBestWords",
                                     yLim=None,
                                     allLines=resAll)
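The percentile computation buried in the middle of this function (bisect.bisect_left over diffNums) depends on diffNums being sorted; the insertion index is the count of smaller values, which converts directly to a percentile. An isolated illustration with synthetic numbers:

import bisect

diffNums = [0.1, 0.2, 0.4, 0.7, 0.9]          # sorted list of all diffs (synthetic)
index = bisect.bisect_left(diffNums, 0.4)     # 2 values are strictly smaller
percentile = (100.0 * index) / len(diffNums)  # 40.0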
Example #25
0
def gatherFilesFull(topStr, topNum, comparableTopStr, comparableNum,
                    poetryNum):
    baseFolder = "output/full/"

    folders = [
        "",
        "data",
        "genre",
        "metric",
        "metric/extraInfo",
        "century",
        "century/extraInfo",
        "wordUse",
        "wordUse/extraInfo",
        "wordUse/grouping",
    ]
    createFolders(folders, baseFolder)

    # Get info for the data section
    getDataInfo(topStr, baseFolder)

    # Get info for approach section
    getWordUseInfo(topStr, baseFolder)

    # Get genre info
    getGenreInfo(topStr, baseFolder)
    # Gather 4up tsne charts for standard data and data normalized by genre
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Authors/tSNE/info_no_labels_4Up.pdf %sgenre/groupings.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/outliers4up.pdf %sgenre/bookOutliers.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get book tsne charts
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_no_labels.pdf %sgenre/books_tSNE_no_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_labels.pdf %sgenre/books_tSNE_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    # To get a look at these, run python3 visualizeBooks

    # Get info for standard and normalized by poetry
    makeMLTable("output/greek/no_split/%s/dataPreds/" % (topStr), False,
                "%sgenre/ml_table.tex" % baseFolder)
    # makeMLTable("output/greek/no_split/%s+p/dataPreds/" % (topStr), False, "%sgenre/ml_table+p.tex" % baseFolder)

    # =========================

    # Get info for results section

    # -----------
    # Metric
    getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  SIM_METRICS, baseFolder)

    makeMetricInternalTables("", topStr, SIM_METRICS, baseFolder)
    makeMetricInternalTables("", topStr + "+p", SIM_METRICS, baseFolder)

    # -----------
    # Century
    # Get information on century comparison
    getCenturyInfo(topStr, baseFolder)
    # Get pvalue + other regression information for charts that are + p
    greekPval = utils.getContent(
        "output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)

    utils.safeWrite("%scentury/century_pvals+p.txt" % baseFolder,
                    "\n".join(pvalOutput))

    # -------------------------
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt %swordUse/authorSims.txt"
        % (topStr, baseFolder),
        shell=True)

    fourCenturiesTables(topStr, SIM_METRICS, baseFolder)

    # get word usage charts and info
    getWordUsageInfo(topStr, baseFolder)
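
The cp commands above tie gatherFilesFull to a POSIX shell. If portability mattered, shutil could perform the same copy; a sketch of the equivalent call for the first chart (the variable values here are illustrative):

import shutil

topStr = "top250"          # illustrative value
baseFolder = "output/full/"

src = "output/greek/no_split/%s/Authors/tSNE/info_no_labels_4Up.pdf" % topStr
dst = "%sgenre/groupings.pdf" % baseFolder
shutil.copyfile(src, dst)  # replaces subprocess.run("cp ...", shell=True)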
Example #26
i = 1
allBooks = []
for o in available:
    workLocs = o["works"]
    for w in workLocs:
        if (i % 20 == 0):
            print("%d out of %d (%.2f%%)" % (i, numTexts,
                                             (100 * i / numTexts)))
        loc = w["location"]
        if (True or i == TARGET_BOOK):  # always true; limit to TARGET_BOOK when debugging
            newLoc, books = convertBook(loc)
            allBooks.extend(books)
            w["location"] = newLoc
        i += 1

utils.safeWrite(PARSED_FOLDER + "available.json", available, True)
print("Done.")

# If desired, analyze the unicode characters in the processed texts.
countChars = False  # set to True to enable
if countChars:
    print("Counting Chars")
    chars = {}
    for b in allBooks:
        bookText = b["bookText"]
        for char in bookText:
            chars[char] = True

    sortedChars = sorted(list(chars.keys()))

    for c in sortedChars:
        # body truncated in the source; printing each character with
        # its code point is a plausible completion
        print("%s (U+%04X)" % (c, ord(c)))
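
utils.getContent, used throughout these examples as the read-side counterpart of safeWrite, is also not shown on this page. Judging from the call sites (second argument True for JSON files, False for plain text), a minimal sketch could be:

import json

def getContent(filename, loadJSON):
    # Sketch based on the call sites above, not the project's source:
    # return parsed JSON when loadJSON is True, raw text otherwise.
    with open(filename, "r") as f:
        raw = f.read()
    return json.loads(raw) if loadJSON else raw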
Example #27
def getWordUseInfo(topStr, baseFolder):
    # total +p words
    tops = utils.getContent(
        "output/greek/no_split/%s/wordInfo_%s.txt" % (topStr, topStr),
        False).split("\n")[1:]
    poetrys = utils.getContent(
        "output/greek/no_split/top_p/wordInfo_top_p.txt",
        False).split("\n")[1:]
    # Top plus poetry
    totals = utils.getContent(
        "output/greek/no_split/%s+p/wordInfo_%s+p.txt" % (topStr, topStr),
        False).split("\n")[1:]

    numWordsOutput = []
    numWordsOutput.append("Number of Top Words: %d" % len(tops))
    numWordsOutput.append("Number of Poetry Words: %d" % len(poetrys))
    numWordsOutput.append("Total Number of Words: %d" % len(totals))
    utils.safeWrite("%swordUse/totalWords.txt" % baseFolder,
                    "\n".join(numWordsOutput))

    # Create Table of words
    topRanks = {}
    poetryRanks = {}

    for i, line in enumerate(tops):
        w = line.split(":")[0]
        topRanks[w] = i + 1

    for i, line in enumerate(poetrys):
        w = line.split(":")[0]
        poetryRanks[w] = i + 1

    rankInfo = []
    for line in totals:
        w = line.split(":")[0]
        topRank = ""
        if w in topRanks:
            topRank = "%d" % topRanks[w]
        poetryRank = ""
        if w in poetryRanks:
            poetryRank = "%d" % poetryRanks[w]

        rankInfo.append((w, topRank, poetryRank))

    rankTableOutput = []
    rankTableOutput.append("""
    \\begin{table}[!hbt]
      \\centering
      \\def\\arraystretch{1}
      \\begin{tabular}{| l | l | l ||| l | l | l ||| l | l | l ||| l | l | l |}
    \\hline

    \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P}\\\\\\hline
    """)

    columnHeight = 43
    for i in range(columnHeight):
        cells = []
        for j in range(4):
            index = i + j * columnHeight
            cell = ""
            if (index < len(rankInfo)):
                cell = "%s & %s & %s" % rankInfo[index]

            cells.append(cell)
        rankTableOutput.append("%s \\\\\\hline" % (" & ".join(cells)))
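    # Worked example of the column-major fill above: with
    # columnHeight = 3 and ten entries, row i pulls items i, i + 3,
    # i + 6, i + 9, giving rows [0, 3, 6, 9], [1, 4, 7], [2, 5, 8] --
    # consecutive ranks run down each column rather than across rows.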

    rankTableOutput.append("""
      \\end{tabular}
      \\caption{List of tokens used, along with their rank in the top 150 tokens found in all texts (\\textbf{A}) and rank in the top 100 tokens found in poetry texts (\\textbf{P}).}
      \\label{table:top_words}
    \\end{table}
    """)

    utils.safeWrite("%swordUse/topWordsTable.tex" % baseFolder,
                    "\n".join(rankTableOutput))
Example #28
def makeMetricInternalTables(suffix, topStr, simMetrics, baseFolder):
    metricInternalTables = []
    for simMetric in simMetrics:
        dir, metricName = simMetric

        # skip Jensen-Shannon
        if metricName == "Jensen-Shannon":
            continue

        tableOutput = []
        temp = """
\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |}
\\hline
        """
        tableOutput.append(temp)

        temp = "\\textbf{Metric Options} & \\textbf{Author} & \\textbf{Work} & \\textbf{Total} \\\\\\hline"
        tableOutput.append(temp)

        workSigReport = []
        authorSigReport = []
        totalSigReport = []

        # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

        metricOptions = [("Baseline", "-remainder-smoothed"),
                         ("+1 Smoothing", "-remainder+smoothed"),
                         ("Remainder", "+remainder-smoothed"),
                         ("Both", "+remainder+smoothed")]

        # Get the list of authors and works the metric got correct
        scoreLists = {}
        for _, opt in metricOptions:
            fname = "output/greek/no_split/%s/%s/metric%s/Books/scores.json" % (
                topStr, dir, opt)
            scores = utils.getContent(fname, True)
            scoreLists[opt] = scores
            scoreLists[opt]["name"] = opt

        baseScore = scoreLists["-remainder-smoothed"]

        # Create a table of the information using the provided scores
        for optName, opt in metricOptions:
            cell = "\\textbf{%s}" % (optName)

            currentScores = scoreLists[opt]
            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]
            sameWork = "%.2f%%, (%d/%d)" % (
                100 * np.mean(workScores), np.sum(workScores), len(workScores))
            sameAuth = "%.2f%%, (%d/%d)" % (100 * np.mean(authorScores),
                                            np.sum(authorScores),
                                            len(authorScores))
            allScores = np.concatenate((workScores, authorScores))
            total = "%.2f%%, (%d/%d)" % (100 * np.mean(allScores),
                                         np.sum(allScores), len(allScores))

            wrk = " & %s" % (sameWork)
            auth = " & %s" % (sameAuth)
            tot = " & %s" % (total)

            # Calculate significance
            a = baseScore["work"]
            b = currentScores["work"]
            work_t, work_p = stats.ttest_rel(a, b)
            workSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            workSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, work_t, work_p)
            workSigReport.append(workSig)

            a = baseScore["author"]
            b = currentScores["author"]
            author_t, author_p = stats.ttest_rel(a, b)
            authorSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            authorSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, author_t, author_p)
            authorSigReport.append(authorSig)

            a = np.concatenate((baseScore["work"], baseScore["author"]))
            b = np.concatenate(
                (currentScores["work"], currentScores["author"]))
            all_t, all_p = stats.ttest_rel(a, b)
            totalSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            totalSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, all_t, all_p)
            totalSigReport.append(totalSig)

            # if (name == bestMetricName or name == baseScore["name"]):
            #     bestMetricSigWork.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigWork.append(workSig)
            #
            #     bestMetricSigAuthor.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigAuthor.append(authorSig)

            #print("  Author: t-statistic = %6.3f pvalue = %f" %  stats.ttest_rel(a, b))

            # Significance notes
            if (work_p < 0.01):
                wrk += "\\textbf{†}"
            elif (work_p < 0.05):
                wrk += "\\textbf{*}"
            if (author_p < 0.01):
                auth += "\\textbf{†}"
            elif (author_p < 0.05):
                auth += "\\textbf{*}"
            if (all_p < 0.01):
                tot += "\\textbf{†}"
            elif (all_p < 0.05):
                tot += "\\textbf{*}"
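            # Marker convention, matching the cross-metric tables in
            # the next example: dagger for p < 0.01 vs. the baseline,
            # asterisk for p < 0.05.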

            # wrk += " %.4f" % work_p
            # auth += " %.4f" % author_p
            # tot += " %.4f" % all_p

            cell += "%s%s%s" % (wrk, auth, tot)

            cell = cell.replace("%", "\\%")
            tableOutput.append("%s\\\\\\hline" % cell)

        tableOutput.append("\\end{tabular}")
        tableOutput.append("\\caption{")
        tableOutput.append(
            "How well %s performs with the remainder words and smoothing included. "
            % metricName)
        tableOutput.append(
            "†: Results very significant (p < 0.01) when compared to baseline. "
        )
        tableOutput.append(
            "*: Results significant (p < 0.05) when compared to baseline. ")
        tableOutput.append("}")
        tableOutput.append("\\label{table:metric_options_eval_%s}" % dir)
        tableOutput.append("\\end{table}")

        tableOutput.append("")
        tableOutput.append("")
        metricInternalTables.append("\n".join(tableOutput))
        utils.safeWrite(
            "%smetric/%s_optionsEvalTable%s.tex" %
            (baseFolder, metricName, suffix), "\n".join(tableOutput))

        # sigReport = "Work:\n" + ("\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + ("\n".join(bestMetricSigAuthor))
        # utils.safeWrite("%smetric/bestMetricSignificance%s_2.txt" % (baseFolder, suffix), sigReport)

        # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportWork%s_2.txt" % (baseFolder, suffix), "\n".join(workSigReport))
        # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportAuthor%s_2.txt" % (baseFolder, suffix), "\n".join(authorSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/optionsEvalTables%s.tex" % (baseFolder, suffix),
        "\n".join(metricInternalTables))
Example #29
def makeMetricEvalTables(suffix, topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder):
    baseScoreInfo = [
        ("Cosine", 0),
        ("Burrows' Delta", 0),
    ]

    bestMetricName = "Jensen-Shannon (250)"  # alternatively "Jensen-Shannon+p"
    bestMetricSigWork = []
    bestMetricSigAuthor = []

    evalTableOutput = []
    evalTableOutput.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | r | r |}
\\hline
 & \\multicolumn{2}{c|}{\\textbf{Percentage of segments most similar to a segment...}} \\\\

\\textbf{Metric}& \\textbf{from the same work} & \\textbf{by the same author} \\\\\\hline
""")

    sameWorkTableOutput = []
    sameAuthorTableOutput = []
    temp = """\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |}
\\hline
    """
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    temp = "& & \\textbf{Top %d +} & \\\\" % (topNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    temp = "\\textbf{Metric}& \\textbf{Top %d} & \\textbf{Top %d in Poetry} & \\textbf{Top %d} \\\\\\hline" % (
        topNum, poetryNum, comparableNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    workSigReport = []
    authorSigReport = []

    # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

    # Get the list of authors and works the metric got correct
    scoreLists = {}
    for simMetric in simMetrics:
        dir, metricName = simMetric
        scoreLists[metricName] = {}
        for i, params in enumerate([
            (False, False),
            (True, False),
            (False, True),
        ]):
            name = metricName
            addP, comparable = params
            metricTopStr = topStr
            if addP:
                metricTopStr += "+p"
                name += "+p"
            # look at comparable number of non-poetry words
            elif comparable:
                metricTopStr = comparableTopStr
                name += " (%d)" % comparableNum
            else:
                name += " (%d)" % topNum

            fname = "output/greek/no_split/%s/%s/metric/Books/scores.json" % (
                metricTopStr, dir)
            scores = utils.getContent(fname, True)
            scoreLists[metricName][i] = scores
            scoreLists[metricName][i]["name"] = name

    baseScores = []
    for bsi in baseScoreInfo:
        baseScoreMetric, baseScoreIndex = bsi
        baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

    # Create a table of the information using the provided scores
    for metricName in scoreLists:
        cell2 = "\\textbf{%s}" % (metricName)
        cell3 = "\\textbf{%s}" % (metricName)
        for i in scoreLists[metricName]:
            currentScores = scoreLists[metricName][i]
            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]
            sameWork = "%.2f%%" % (100 * np.mean(workScores))
            sameAuth = "%.2f%%" % (100 * np.mean(authorScores))
            # sameWork = "%.2f%%, (%d/%d)" % (100*np.mean(workScores), np.sum(workScores), len(workScores))
            # sameAuth = "%.2f%%, (%d/%d)" % (100*np.mean(authorScores), np.sum(authorScores), len(authorScores))

            # cell = "%s & %s & %s & %s & %s & %s" % (name, sameAuth, sameWork, otherWork, diffAuthClosest, median)
            cell = "%s & %s & %s" % (name, sameWork, sameAuth)
            cell = cell.replace("%", "\\%")
            evalTableOutput.append("%s\\\\\\hline" % cell)

            cell2 += " & %s" % (sameWork)  # work_p
            cell3 += " & %s" % (sameAuth)  # , author_p)

            for j, baseScore in enumerate(baseScores):
                a = baseScore["work"]
                b = currentScores["work"]
                work_t, work_p = stats.ttest_rel(a, b)
                workSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                workSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, work_t, work_p)
                workSigReport.append(workSig)

                a = baseScore["author"]
                b = currentScores["author"]
                author_t, author_p = stats.ttest_rel(a, b)
                authorSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                authorSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, author_t, author_p)
                authorSigReport.append(authorSig)

                if (name == bestMetricName or name == baseScore["name"]):
                    bestMetricSigWork.append("%s vs %s" %
                                             (name, baseScore["name"]))
                    bestMetricSigWork.append(workSig)

                    bestMetricSigAuthor.append("%s vs %s" %
                                               (name, baseScore["name"]))
                    bestMetricSigAuthor.append(authorSig)

                #print("  Author: t-statistic = %6.3f pvalue = %f" %  stats.ttest_rel(a, b))

                # Significance notes
                if (j == 0):
                    if (work_p < 0.01):
                        cell2 += "\\textbf{†}"
                    elif (work_p < 0.05):
                        cell2 += "\\textbf{*}"
                    if (author_p < 0.01):
                        cell3 += "\\textbf{†}"
                    elif (author_p < 0.05):
                        cell3 += "\\textbf{*}"
                else:
                    if (work_p < 0.01):
                        cell2 += "\\textbf{‡}"
                    if (author_p < 0.01):
                        cell3 += "\\textbf{‡}"

        cell2 = cell2.replace("%", "\\%")
        sameWorkTableOutput.append("%s\\\\\\hline" % cell2)

        cell3 = cell3.replace("%", "\\%")
        sameAuthorTableOutput.append("%s\\\\\\hline" % cell3)

    evalTableOutput.append("""
      \\end{tabular}
      \\caption{How well similarity metrics identify whether two segments come from the same work or the same author.}
      \\label{table:metric_eval}
    \\end{table}
    """)

    utils.safeWrite(
        "%smetric/extraInfo/metricEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(evalTableOutput))

    sameWorkTableOutput.append("\\end{tabular}")
    sameWorkTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same work.]{"
    )
    sameWorkTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same work. \\newline"
    )
    sameWorkTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameWorkTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameWorkTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameWorkTableOutput.append("}")
    sameWorkTableOutput.append("\\label{table:metric_eval_work}")
    sameWorkTableOutput.append("\\end{table}")

    utils.safeWrite("%smetric/sameWorkEvalTable%s.tex" % (baseFolder, suffix),
                    "\n".join(sameWorkTableOutput))

    sameAuthorTableOutput.append("\\end{tabular}")
    sameAuthorTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same author.]{"
    )
    sameAuthorTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same author. \\newline"
    )
    sameAuthorTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameAuthorTableOutput.append("}")
    sameAuthorTableOutput.append("\\label{table:metric_eval_author}")
    sameAuthorTableOutput.append("\\end{table}")

    utils.safeWrite(
        "%smetric/sameAuthorEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(sameAuthorTableOutput))

    sigReport = "Work:\n" + (
        "\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + (
            "\n".join(bestMetricSigAuthor))
    utils.safeWrite(
        "%smetric/bestMetricSignificance%s.txt" % (baseFolder, suffix),
        sigReport)
    # utils.safeWrite("%smetric/bestMetricSignificanceWork%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigWork))
    # utils.safeWrite("%smetric/bestMetricSignificanceAuthor%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigAuthor))

    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportWork%s.txt" %
        (baseFolder, suffix), "\n".join(workSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportAuthor%s.txt" %
        (baseFolder, suffix), "\n".join(authorSigReport))
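
The dagger and asterisk annotations in these tables come from paired t-tests between the per-segment 0/1 scores of each variant and a baseline, via scipy.stats.ttest_rel. A minimal, self-contained illustration of the computation and the marker rule (the score arrays here are made up):

import numpy as np
from scipy import stats

# Made-up per-segment scores: 1 = segment matched correctly.
baseline = np.array([1, 0, 1, 1, 0, 1, 0, 1])
variant = np.array([1, 1, 1, 1, 0, 1, 1, 1])

t, p = stats.ttest_rel(baseline, variant)
df = len(variant) - 1  # degrees of freedom for a paired test
marker = "†" if p < 0.01 else ("*" if p < 0.05 else "")
print("(M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e %s" %
      (np.mean(variant), np.std(variant), df, t, p, marker))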
Example #30
def fourCenturiesTables(topStr, simMetrics, baseFolder):
    comparisonOutput = []
    topSimsToExamine = 100

    # Grab this from the best metric
    authorSims = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    topDistantSims = []
    topDistantAuthors = {}
    for i, sim in enumerate(authorSims):
        # each line ends with "(N centuries ...)"; extract N
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4 and i < topSimsToExamine):
            topDistantSims.append(sim)
            # key on the author-pair text, skipping the fixed-width
            # similarity prefix at the start of each line
            topDistantAuthors[sim[11:]] = {}

        # extract the author pair, dropping the trailing century note
        authors = " (".join(sim.split(" - ")[1].split(" (")[:-1])
        if authors in ("Isocrates, Lysias", "Plato, Xenophon",
                       "AratusSolensis, Callimachus",
                       "Herodotus, Thucydides"):
            comparisonOutput.append("Rank %d: %s" % (i + 1, sim))

    fourCenturiesApartOutput = []
    fourCenturiesApartOutput.append(
        "%d of the top %d are at least 4 centuries apart." %
        (len(topDistantSims), topSimsToExamine))
    fourCenturiesApartOutput.append("---")
    fourCenturiesApartOutput.extend(topDistantSims)

    utils.safeWrite("%swordUse/fourCenturiesApart.txt" % baseFolder,
                    "\n".join(fourCenturiesApartOutput))

    # Comparison to English and Icelandic
    numGreek = len(authorSims)
    fracGreek = topSimsToExamine / numGreek
    numDistantGreek = len(topDistantSims)

    englishSims = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numEnglish = len(englishSims)
    topSimsEnglish = int(np.ceil(numEnglish * fracGreek))
    fracEnglish = topSimsEnglish / numEnglish
    numDistantEnglish = 0
    num2English = 0
    for sim in englishSims[:topSimsEnglish]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 2):
            num2English += 1
        if (centuries_apart >= 4):
            numDistantEnglish += 1

    iceSims = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numIcelandic = len(iceSims)
    topSimsIcelandic = int(np.ceil(numIcelandic * fracGreek))
    fracIcelandic = topSimsIcelandic / numIcelandic
    numDistantIcelandic = 0
    for sim in iceSims[:topSimsIcelandic]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4):
            numDistantIcelandic += 1

    comparisonOutput.append("\n=========\n")
    comparisonOutput.append("Top similar pairs")
    comparisonOutput.append("Greek:")
    comparisonOutput.append("  examining top %d of %d pairs (%.2f%%)" %
                            (topSimsToExamine, numGreek, 100 * fracGreek))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantGreek, 100 * numDistantGreek / topSimsToExamine))
    comparisonOutput.append("English:")
    comparisonOutput.append("  examining top %d of %d pairs (%.2f%%)" %
                            (topSimsEnglish, numEnglish, 100 * fracEnglish))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantEnglish, 100 * numDistantEnglish / topSimsEnglish))
    comparisonOutput.append("  %d (%.2f%%) are at least 2 centuries apart" %
                            (num2English, 100 * num2English / topSimsEnglish))
    comparisonOutput.append("Icelandic:")
    comparisonOutput.append(
        "  examining top %d of %d pairs (%.2f%%)" %
        (topSimsIcelandic, numIcelandic, 100 * fracIcelandic))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantIcelandic, 100 * numDistantIcelandic / topSimsIcelandic))

    utils.safeWrite("%swordUse/fourApartComparisonInfo.txt" % baseFolder,
                    "\n".join(comparisonOutput))

    # Table
    for simMetric in simMetrics:
        dir, name = simMetric
        # "" or "+p" depending on which is better
        metricSims = utils.getContent(
            "output/greek/no_split/%s/%s/metric/Authors/sims.txt" %
            (topStr, dir), False).split("\n")
        for i, sim in enumerate(metricSims):
            pairName = sim[11:]
            if pairName in topDistantAuthors:
                topDistantAuthors[pairName][dir] = i + 1

    # prepare values for coloring table cells
    maxVal = 0
    minVal = 1000000

    for authorPair in topDistantAuthors:
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]
            minVal = min(minVal, val)
            maxVal = max(maxVal, val)
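
    # colorConvert is defined elsewhere in the project; from the
    # cellcolor call sites below it appears to map val in [lo, hi]
    # onto an (r, g, b) triple interpolated between the two given
    # colors, so ranks below the cutoff shade orange-to-gray and
    # ranks above it gray-to-blue.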

    pairRankOutput = []
    pairRankOutputSimple = []
    pairRankOutput.append("""
    \\begin{table}[!bt]
      \\centering
      \\def\\arraystretch{1}
      \\begin{tabular}{| l | c | c | c | c | c | c |}
    \\hline
    & \\multicolumn{5}{c|}{\\textbf{Rank according to}} \\\\
    & \\textbf{Jensen-} & \\textbf{Burrows'} & & & & \\\\
    \\textbf{Authors} & \\textbf{Shannon} & \\textbf{Delta} & \\textbf{Min-Max} & \\textbf{Manhattan} & \\textbf{Canberra} & \\textbf{Cosine} \\\\\\hline
    """)
    pairRankOutputSimple.append("%s,%s,%s,%s,%s,%s,%s" %
                                ("Authors", "Jensen-Shannon", "Burrows' Delta",
                                 "Min-Max", "Manhattan", "Canberra", "Cosine"))
    authorConvert = {
        "ApolloniusRhodius": "Apollonius",
        "DionysiusOfHalicarnassus": "Dionysius",
        "EusebiusOfCaesarea": "Eusebius",
        "ClementOfAlexandria": "Clement",
        "BasilBishopOfCaesarea": "Basil",
        "Anonymous(Hymns_Aphrodite)": "Hymns Aphrodite",
        "Anonymous(Hymns_Apollo)": "Hymns Apollo",
        "Anonymous(Hymns_Demeter)": "Hymns Demeter",
        "Anonymous(Hymns_Hermes)": "Hymns Hermes",
        "Anonymous(Hymns_Rest)": "Hymns Rest",
    }
    for authorPair in topDistantAuthors:
        pair = "(".join(authorPair.split(" (")[:-1])
        pairSplit = pair.split(", ")
        author1 = pairSplit[0]
        author2 = pairSplit[1]

        if author1 in authorConvert:
            author1 = authorConvert[author1]
        if author2 in authorConvert:
            author2 = authorConvert[author2]

        pairName = author1 + ", " + author2
        cell = "%s &" % pairName
        cellSimple = "%s," % pairName.replace(", ", "/")
        firstVal = None
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]

            cutoff = 100
            if (val < cutoff):
                r, g, b = colorConvert(minVal, cutoff, val, COLOR_ORANGE,
                                       COLOR_GRAY)
            else:
                r, g, b = colorConvert(cutoff, maxVal, val, COLOR_GRAY,
                                       COLOR_BLUE)
            cell += "\\cellcolor[rgb]{%.3f,%.3f,%.3f} " % (r, g, b)

            if firstVal is None:
                firstVal = val
                cell += "%d & " % (val)
                cellSimple += "%d," % (val)
            else:
                cell += "%d (%+d) & " % (val, firstVal - val)
                rel = "(%d)" % (firstVal - val)
                cellSimple += "%d %s," % (val, rel)
        cell = cell[:-2]
        pairRankOutput.append("%s\\\\\\hline" % cell)
        pairRankOutputSimple.append(cellSimple)
    pairRankOutput.append("""
      \\end{tabular}
      \\caption{Rank of these pairs' similarity according to different metrics.}
      \\label{table:pair_rank}
    \\end{table}
    """)

    utils.safeWrite("%swordUse/pairRankTable.tex" % baseFolder,
                    "\n".join(pairRankOutput))
    utils.safeWrite("%swordUse/pairRankTableSimple.csv" % baseFolder,
                    "\n".join(pairRankOutputSimple))
Example #31
	def commit(self, jobNum, jobObj):
		fp = open(os.path.join(self._dbPath, 'job_%d.txt' % jobNum), 'w')
		utils.safeWrite(fp, utils.DictFormat(escapeString = True).format(jobObj.getAll()))
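
Unlike the earlier examples, this last snippet comes from a codebase where utils.safeWrite receives an already-open file object rather than a path. Its implementation is not shown; a sketch of the durable-write behavior such a helper typically provides (an assumption, not the actual source):

import os

def safeWrite(fp, content):
    # Assumed behavior: push the serialized job state to disk and
    # close the handle so a crash cannot leave a half-written file.
    fp.write(content)
    fp.flush()
    os.fsync(fp.fileno())
    fp.close()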