def extractTopWordsOnly(authors, books, topWords, topWordsName, saveDir):
    """Convert each author's and book's token stream into a list of
    top-word indices and save each list as JSON.

    Tokens not present in topWords are dropped; the rest are replaced by
    their index within topWords.

    authors: objects exposing .allTokens and .getSaveName()
    books: objects exposing .tokens and .getSaveName()
    topWords: ordered list of the selected top words
    topWordsName: name of the top-word set, used in the output path
    saveDir: base output directory
    """
    saveDir += "%s/textsOnlyTopWords/" % topWordsName

    # map each top word to its index for O(1) membership + lookup
    topWordDict = {w: i for i, w in enumerate(topWords)}

    def _toIndices(tokens):
        # keep only top words, replacing each with its index
        # (previously duplicated inline for authors and for books)
        return [topWordDict[t] for t in tokens if t in topWordDict]

    # top-word index list for each author
    for author in authors:
        utils.safeWrite(
            saveDir + "lists/authors/" + author.getSaveName() + ".json",
            _toIndices(author.allTokens), dumpJSON=True)

    # get the top token frequency features for each book
    for book in books:
        utils.safeWrite(
            saveDir + "lists/books/" + book.getSaveName() + ".json",
            _toIndices(book.tokens), dumpJSON=True)
def getSkippedWordInfo(baseFolder):
    """Collect the chosen-word skip reports for every language (plus the
    poetry-specific reports where they exist) into one file,
    baseFolder/skippedWords.txt."""
    divider = "\n------\n"
    pieces = []

    # (header, prose report path, optional poetry report path)
    sections = [
        ("Greek:\n",
         "output/greek/no_split/top250/chosenWordInfo.txt",
         "output/greek/no_split/top250+p/chosenWordInfoPoetry.txt"),
        ("English:\n",
         "output/english/no_split/top250/chosenWordInfo.txt",
         "output/english/no_split/top250+p/chosenWordInfoPoetry.txt"),
        ("Icelandic:\n",
         "output/icelandic/no_split/top250/chosenWordInfo.txt",
         None),
    ]
    for header, prosePath, poetryPath in sections:
        pieces.append(header)
        pieces.append(utils.getContent(prosePath, False))
        if poetryPath is not None:
            pieces.append("\nPoetry:")
            pieces.append(utils.getContent(poetryPath, False))
        pieces.append(divider)

    utils.safeWrite("%s/skippedWords.txt" % baseFolder, "\n".join(pieces))
def getAuthorBookCounts(baseFolder):
    """Concatenate the author/book and type/token count reports for each
    language into baseFolder/AuthorBookNumbers.txt."""
    divider = "\n------\n"
    report = []

    for header, lang in [("Greek:\n", "greek"),
                         ("English:\n", "english"),
                         ("Icelandic:\n", "icelandic")]:
        report.append(header)
        report.append(
            utils.getContent("output/%s/numberOfAuthors_Books.txt" % lang,
                             False))
        report.append(
            utils.getContent("output/%s/numberOfTypes_Tokens.txt" % lang,
                             False))
        report.append(divider)

    utils.safeWrite("%s/AuthorBookNumbers.txt" % baseFolder,
                    "\n".join(report))
def makeTopAuthorTable(topStr, baseFolder):
    """Write a LaTeX table (metric/topAuthorPairs.tex) of the ten author
    pairs with the highest Jensen-Shannon similarity scores."""
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" % (
        topStr)
    simLines = utils.getContent(fname, False).split("\n")

    rows = []
    rows.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
  \\begin{tabular}{| r | l | l | l | l |} \\hline
     & \\textbf{Author 1} & \\textbf{Author 2} & \\textbf{Score} & \\textbf{Notes} \\\\\\hline
""")
    # each line looks like "<score> - <author1>, <author2> (...)"
    for rank, line in enumerate(simLines[:10]):
        pieces = line.split(" - ")
        score = pieces[0]
        pairAuthors = pieces[1].split(" (")[0].split(", ")
        rows.append(" %.2d & %s & %s & %s & TODO \\\\\\hline" %
                    (rank + 1, pairAuthors[0], pairAuthors[1], score))
    rows.append("""  \\end{tabular}
  \\caption{Top author pairs by similarity score according to Jensen-Shannon Similarity.}
  \\label{table:top_author_pairs}
\\end{table}
""")
    utils.safeWrite("%smetric/topAuthorPairs.tex" % baseFolder,
                    "\n".join(rows))
def downloadText(textName, textSource, startBook, endBook, increments):
    """Download a text, parse its TEI XML, and save the combined lines
    as JSON under the text's standard filename.

    textName: name used to build the output filename
    textSource: base URL; a card/book index is appended when needed
    startBook, endBook: 1-indexed inclusive book range, used only when
        increments is None
    increments: list of card indices for card-based texts; an empty list
        means the whole text is served at textSource in one request;
        None means fetch books startBook..endBook by appending the book
        number to the URL
    """
    books = []
    # FIX: was `if (not (increments) == None)` — use identity comparison
    if increments is not None:
        # if there are multiple cards, use them all; otherwise it is a
        # single-request text
        if len(increments) >= 1:
            for cardIndex in increments:
                url = textSource + str(cardIndex)
                books.extend(
                    utils.parse_TEI(utils.get_TEI_XML(url), textName, 1,
                                    True, cardIndex))
        else:
            books.extend(
                utils.parse_TEI(utils.get_TEI_XML(textSource), textName, 1,
                                False, 0))
    else:
        # one request per book in the requested range
        for bookNum in range(startBook, endBook + 1):
            url = textSource + str(bookNum)
            books.extend(
                utils.parse_TEI(utils.get_TEI_XML(url), textName, bookNum,
                                False, 0))

    # FIX: was a Python-2 `print` statement; parenthesized form works on
    # both Python 2 and 3, matching the rest of the file.
    print("Lines: " + str(len(books)))

    outFileName = utils.getTextFn(textName)
    utils.safeWrite(outFileName, json.dumps(books))
def printAuthorWords(authors, targets, saveDir):
    """For each author and each of their target tokens, dump every
    occurrence (with context) to text files: once in original order,
    once sorted by after-context, once by (reversed) before-context."""
    for author in authors:
        authorName = author.authorName
        tokens = author.allTokens
        for fullTargetName in targets[authorName]:
            # the token itself is the final underscore-separated piece
            wanted = fullTargetName.split("_")[-1]

            occurrences = [getTokenContext(pos, tokens)
                           for pos, tok in enumerate(tokens)
                           if tok == wanted]

            # one output file per ordering
            orderings = [
                ("", None),
                ("_after", lambda x: x[2]),   # after context
                ("_before", lambda x: x[3]),  # reverse of before context
            ]
            for suffix, sortKey in orderings:
                if sortKey is None:
                    ordered = occurrences
                else:
                    ordered = sorted(occurrences, key=sortKey)

                lines = [" ~~ ".join(occ[:3]) for occ in ordered]
                outPath = saveDir + ("wordOccurrences%s/%s_%s.txt" %
                                     (suffix, fullTargetName, authorName))
                utils.safeWrite(outPath, "\n".join(lines))
def runMLAlgorithms(X, tokens, y, groups, type_name, category_name, saveDir):
    """Run the baseline classifiers (majority class, KNN, naive Bayes)
    with group-aware cross-validation and save the averaged results."""
    X = np.array(X)
    y = np.array(y)

    # get word counts for naive bayes (frequencies scaled by token totals)
    counts = np.array([X[i] * tokens[i] for i in range(len(X))])

    # 9 splits yields ~10 test authors per fold; GroupKFold ensures no
    # work has books (segments) in both the training and test set.
    folds = list(GroupKFold(n_splits=9).split(X, None, groups))

    resultLines = ["Average results for %s (%s) across %d folds:" %
                   (category_name, type_name, len(folds))]

    # Run Majority Class
    resultLines.append(majorityClass(X, y, folds, saveDir))
    # Run KNN
    resultLines.append(knn(X, y, folds, saveDir))
    # Run Naive Bayes; it can reject some inputs with a ValueError
    try:
        resultLines.append(naiveBayes(counts, y, folds, saveDir))
    except ValueError:
        resultLines.append(" Failed - Naive Bayes")

    utils.safeWrite(saveDir + "res_%s_%s.txt" % (category_name, type_name),
                    "\n".join(resultLines))
def convertBook(location):
    """Convert one raw XML text to the parsed JSON format.

    location: path to the raw file inside RAW_FOLDER.
    Returns (newLocation, booksRaw): the parsed file's path and the list
    of raw book entries from the conversion result.
    """
    # BUG FIX: the body previously referenced an undefined name `loc`
    # instead of the `location` parameter, raising NameError when called.
    filename = location.replace(RAW_FOLDER, "")
    newLoc = PARSED_FOLDER + filename
    t = utils.XMLText(location)
    res = t.convertFromXML()
    utils.safeWrite(newLoc, res, True)
    return newLoc, res["booksRaw"]
def combineTexts(textName, sourceTexts):
    """Concatenate the stored lines of several source texts into one
    combined text saved under textName."""
    combined = []
    for sourceName in sourceTexts:
        # each source is stored as JSON at its own text filename
        combined.extend(utils.getContent(utils.getTextFn(sourceName), True))

    utils.safeWrite(utils.getTextFn(textName), json.dumps(combined))
def makeMLTable(source, norm, filename):
    """Build a LaTeX table summarizing the ML baseline results stored
    under `source` and write it to `filename`.

    source: folder containing the res_<cat>_<type>.txt result files
    norm: True when the data was normalized; naive Bayes is omitted in
        that case because it cannot handle negative data
    filename: output path for the .tex table
    """
    output = []
    output.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
""")
    # No naive bayes if normed due to negative data
    if norm:
        output.append(" \\begin{tabular}{| r | l | l |} \\hline")
        output.append(
            " \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN} \\\\\\hline"
        )
    else:
        output.append(" \\begin{tabular}{| r | l | l | l |} \\hline")
        output.append(
            " \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN} & \\textbf{Naive Bayes} \\\\\\hline"
        )

    for t in ["Authors", "Books", "Books_2"]:
        # prediction categories for this grouping
        cats = ["genre", "dialect", "timeframe"]
        if t == "Books":
            cats.append("author")
        elif t == "Books_2":  # FIX: was an independent `if`; t is unique
            cats = ["work", "genre", "dialect", "timeframe", "author"]

        for cat in cats:
            fname = source + "res_%s_%s.txt" % (cat, t)
            lines = utils.getContent(fname, False).split("\n")
            maj_class = lines[1].split(" - ")[0].strip()
            knn = lines[2].split(" - ")[0].strip()

            t_name = t
            if t_name == "Books":
                t_name = "Segments"
            if t_name == "Books_2":
                t_name = "Segments*"

            if norm:
                output.append(" %s of %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn))
            else:
                # FIX: only read the naive Bayes line when it is used;
                # normed runs may not report a fourth line at all, and
                # reading it unconditionally risked an IndexError.
                naive_bayes = lines[3].split(" - ")[0].strip()
                output.append(" %s of %s & %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn, naive_bayes))

    output.append("""  \\end{tabular}
  \\caption{Results of running simple machine learning on the frequency data.}
  \\label{table:ml+p}
\\end{table}
""")
    utils.safeWrite(filename, "\n".join(output))
def calcTopWordOverlapOverTime(language):
    """For each century, find that century's most frequent top words and
    write an ASCII table of the pairwise overlap between centuries'
    word sets."""
    saveDirBase = mp.languageInfo[language]["saveDir"]

    print(" Loading data...", end=" ", flush=True)
    authors, books, topWords = loadWCData(saveDirBase, -1, "commonWords", "")
    print("done")

    # accumulate per-century counts for every top word
    countsByCentury = {}
    for author in authors:
        century = toCent(author.authorName)
        if century not in countsByCentury:
            countsByCentury[century] = newEmptyWordCounts(topWords)
        wc = countsByCentury[century]
        for idx, word in enumerate(topWords):
            c = author.counts[idx]
            if c > 0:
                wc[word] += c

    # reduce each century to its NUM_TOP_WORDS most frequent words
    topSetByCentury = {}
    for century, wc in countsByCentury.items():
        ranked = sorted(wc.items(), key=lambda kv: kv[1], reverse=True)
        topSetByCentury[century] = set(w for w, _ in ranked[:NUM_TOP_WORDS])

    centuries = sorted(topSetByCentury)

    # pairwise overlap counts (set intersection sizes)
    cells = []
    for c1 in centuries:
        firstSet = topSetByCentury[c1]
        cells.append(["%d" % len(firstSet & topSetByCentury[c2])
                      for c2 in centuries])

    output = asciiTable(centuries, centuries, cells)
    utils.safeWrite(saveDirBase + "topWordOverlapOverTime.txt", output)
def getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  simMetrics, baseFolder):
    """Gather metric-evaluation artifacts into the paper folder.

    Copies the full Jensen-Shannon evaluation files, extracts the median
    same-author rank, copies the Arrian Indica similarity info, records
    the lowest segment similarity, builds the top-author-pairs table,
    and finally delegates to makeMetricEvalTables.

    topStr: top-word-set identifier used in the source paths
    baseFolder: destination folder (expects metric/ and
        metric/extraInfo/ subfolders to exist)
    comparableTopStr, topNum, poetryNum, comparableNum, simMetrics:
        passed straight through to makeMetricEvalTables
    """
    # Copy full eval files for jensen-shannon
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_tops.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_+p.txt"
        % (topStr, baseFolder),
        shell=True)

    # Grab median distance
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt" % (
        topStr)
    # NOTE(review): assumes the second-to-last "=========" section holds
    # one "... with same author: N." line per book — confirm against the
    # comparisonInfo.txt format.
    metricEvalInfo = utils.getContent(
        fname, False).split("=========")[-2].split("\n")[2:-1]
    sameAuthorRanks = []
    for i, line in enumerate(metricEvalInfo):
        # rank appears between "with same author: " and the next "."
        sameAuthorRank = line.split("with same author: ")[1].split(".")[0]
        sameAuthorRanks.append(int(sameAuthorRank))
    median = np.median(sameAuthorRanks)
    utils.safeWrite(
        "%smetric/extraInfo/medianForDifferentAuthor.txt" % (baseFolder),
        "Median distance for closest author: %f" % median)

    # get info on the indica
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/sims/Arrian.Indica.1.txt %smetric/extraInfo/arrianIndica.txt"
        % (topStr, baseFolder),
        shell=True)

    # Info on book distance
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/sims.txt" % (
        topStr)
    allBookSims = utils.getContent(fname, False).split("\n")
    # last line of sims.txt is the least similar pair of segments
    utils.safeWrite("%smetric/lowestSimilarity.txt" % (baseFolder),
                    "Lowest similarity between segments: %s" % allBookSims[-1])

    # Info on top similar authors
    makeTopAuthorTable(topStr, baseFolder)

    # ===============================
    makeMetricEvalTables("", topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder)
def getAllTokenCounts(authors, saveDir):
    """Count token occurrences per author and corpus-wide.

    Side effects: sets author.tokenCounts and author.totalTokenCount on
    every author, prints the type/token totals, and writes them to
    numberOfTypes_Tokens.txt under saveDir.

    Returns (allTokenCounts, poetryTokenCounts): occurrence counts over
    all authors, and over poetry authors only.
    """
    allTokenCounts = {}
    poetryTokenCounts = {}
    totalTokens = 0

    # for each author, keep track of counts; track poetry separately
    for author in authors:
        tokens = author.allTokens
        totalTokens += len(tokens)
        isPoetry = toGenre(author.authorName) == 1

        perAuthorCounts = {}
        for token in tokens:
            perAuthorCounts[token] = perAuthorCounts.get(token, 0) + 1
            allTokenCounts[token] = allTokenCounts.get(token, 0) + 1
            if isPoetry:
                poetryTokenCounts[token] = poetryTokenCounts.get(token, 0) + 1

        author.tokenCounts = perAuthorCounts
        author.totalTokenCount = len(tokens)

    typeTokenInfoStr = "\n".join([
        "Total tokens: %d" % totalTokens,
        "Total types: %d" % len(allTokenCounts),
    ])
    print(typeTokenInfoStr)
    utils.safeWrite(saveDir + "numberOfTypes_Tokens.txt", typeTokenInfoStr)
    return allTokenCounts, poetryTokenCounts
def getTextCounts(textLocation, saveDir):
    """Convert each available work's raw book text into per-book token
    counts and save the results under textCounts/.

    textLocation: folder containing available.json and the works
    saveDir: folder that receives a copy of available.json
    """
    subprocess.run("cp %savailable.json %savailable.json" %
                   (textLocation, saveDir), shell=True)
    available = utils.getContent(textLocation + "available.json", True)

    # For each available text
    for i, o in enumerate(available):
        if (i % 20 == 0):
            print(i, end=" ", flush=True)

        workLocs = o["works"]
        # Process each work
        for w in workLocs:
            t = utils.getContent(w["location"], True)
            booksRaw = t["booksRaw"]
            booksCounts = []
            for b in booksRaw:
                # BUG FIX: the pattern previously lacked brackets, so it
                # matched the literal sequence ".,;:᾽῾'" (which almost
                # never occurs) instead of stripping each punctuation
                # character individually.
                rawTokens = re.sub(r'[.,;:᾽῾\']', "",
                                   b["bookText"]).split(" ")
                tokenCounts = {}
                for token in rawTokens:
                    if (token == ""):
                        continue
                    tokenCounts[token] = tokenCounts.get(token, 0) + 1

                # keep only the counts; drop the raw text to save space
                booksCounts.append({
                    "bookNumber": b["bookNumber"],
                    "bookTokenCounts": tokenCounts,
                    "bookText": "",
                })
            t["booksRaw"] = booksCounts

            # Remove "texts/" from start of the stored location
            filename = "textCounts/" + w["location"][6:]
            utils.safeWrite(filename, t, True)
def getOverlapInfo(baseFolder):
    """Concatenate the per-language top-word overlap tables into a
    single report at baseFolder/topWordOverlapOverTime.txt."""
    divider = "\n------\n"
    chunks = []

    for header, lang in [("Greek:\n", "greek"),
                         ("English:\n", "english"),
                         ("Icelandic:\n", "icelandic")]:
        chunks.append(header)
        chunks.append(
            utils.getContent("output/%s/topWordOverlapOverTime.txt" % lang,
                             False))
        chunks.append(divider)

    utils.safeWrite("%s/topWordOverlapOverTime.txt" % baseFolder,
                    "\n".join(chunks))
def printKeyWords(dataSplit, top, subsetSize, language, saveDirBase):
    """For every stored key-word JSON file (one per author pair), write
    a words.txt summary with importance percentiles and dump each
    word's occurrences for both authors."""
    topName = top[0]
    # calculate save directory based on input parameters
    keyWordsDir = (saveDirBase + "%s" % (topName)) + "/wordImportance/keyWords/"

    # find all the relevant json files
    for f in os.listdir(keyWordsDir):
        if f[-5:] != ".json":
            continue
        nameCore = f.split(".json")[0]

        # get the word info for this author pair
        words = utils.getContent(keyWordsDir + f, True)

        # the filename encodes the two author names
        pairNames = nameCore.split("_")
        a1, a2 = pairNames[0], pairNames[1]
        print(a1, a2)

        # save dir for new files
        wordsDir = keyWordsDir + nameCore + "/"

        # gather the list of words and a printable index/percentile table
        wordList = []
        tableLines = ["index, percentile, token"]
        for word in words:
            rank, percentile = words[word][0], words[word][1]
            wordList.append("%03d_%s" % (rank + 1, word))
            tableLines.append("%d, %.2f, %s" % (rank, percentile, word))

        utils.safeWrite(wordsDir + "words.txt", "\n".join(tableLines))

        # info for each occurrence of these words for both authors
        printOccs(wordsDir, {a1: wordList, a2: wordList}, language)
def getTopWords(N, tokenInfo, name, saveDir):
    """Pick the top N tokens by frequency, subject to an author-coverage
    cutoff, and write a report about which words were skipped.

    N: number of top words to select (0 returns [])
    tokenInfo: iterable of (word, frequency, authorFrequency) entries
    name: suffix for the chosenWordInfo report filename
    saveDir: directory for the report
    Returns the list of chosen words.
    """
    if N == 0:
        return []

    sortedTokenInfo = sorted(tokenInfo, key=lambda x: x[1], reverse=True)

    chosenCutoff = 0.5
    nextCutoff = 0.6
    chosen, skipped = getTokensForCutoff(sortedTokenInfo, N, chosenCutoff, 0)
    # what a stricter cutoff would additionally have skipped
    _, nextSkipped = getTokensForCutoff(sortedTokenInfo, N, nextCutoff,
                                        chosenCutoff)

    def _skipLines(entries):
        # one report line per entry: (word, frequency, authorFrequency).
        # FIX: the two original report loops reused the name `tokenInfo`,
        # shadowing the parameter; factored out with a fresh name.
        return [" %s (appears in %d%% of authors)" % (e[0], 100 * e[2])
                for e in entries]

    info = []
    info.append("Skipped (from cutoff of %f):" % chosenCutoff)
    info.extend(_skipLines(skipped))
    info.append("")
    info.append("Would skip (from cutoff of %f):" % nextCutoff)
    info.extend(_skipLines(nextSkipped))

    fname = "%schosenWordInfo%s.txt" % (saveDir, name)
    utils.safeWrite(fname, "\n".join(info))

    return list(map(lambda x: x[0], chosen))
def storeFreqResults(authors, books, saveDir, topWords):
    """Write the frequency feature vectors for authors and for books
    that meet the minimum token count, plus the top-word list, as
    comma-separated text files."""
    # store frequencies
    authorLines = [
        author.authorName + "," + ",".join(map(str, author.featureData))
        for author in authors
    ]
    utils.safeWrite(saveDir + "authorFreqs.txt", "\n".join(authorLines))

    # books below the token threshold are excluded
    bookLines = [
        book.getShortName() + "," + ",".join(map(str, book.featureData))
        for book in books
        if book.numTokens >= mp.MIN_TOKENS_NECESSARY
    ]
    utils.safeWrite(saveDir + "bookFreqs.txt", "\n".join(bookLines))

    utils.safeWrite(saveDir + "topWords.txt", ",".join(topWords))
sheet[spec] = cell wb.remove(wb["Sheet"]) wb.save(filename) # Check for db if (not (utils.fileExists(DB_LOCATION))): raise Exception("Please add a database at '" + DB_LOCATION + "'") # Create input folder and subfolders utils.check_and_create_path("input/articles") utils.check_and_create_path("input/illustrations") utils.safeWrite( "input/articles/README.md", "Place articles here. Articles should be included in a text file and the filename should be [lemma].txt, where [lemma] is the lemma the article is about. For example, the article for ἄβουλος should be in ἄβουλος.txt" ) utils.safeWrite( "input/illustrations/README.md", "Place illustrations here. Illustrations should be a .jpg, .gif, or .png with the name of the lemma they are an illustraiton for. For example, the image for ἄβουλος should be in ἄβουλος.png (or ἄβουλος.gif or ἄβουλος.jpg)" ) # Create lemmata xlsx lemma_info = [] lemma_info.append([ "Matched", "Lemma", "Short Definition", "Compounds", "Roots", "Sphere", "Part of Communication", "Frequency", "Illustration Caption", "Bibliography", "Notes" ]) print("Getting Tokens...")
def parseEnglish():
    """Parse the English corpora (Project Gutenberg works, Shakespeare
    plays, and Middle English texts) into the shared JSON format and
    write an available.json index under english/.

    NOTE(review): relies on a module-level `authorWorks` list (the
    Gutenberg catalog) and the utils.*Text converter classes — confirm
    they are defined/imported at file level.
    """
    # If True we split each author into author_prose and author_poetry based on text genre
    # If False, we determine majority class and remove texts not of that work
    splitGenre = False

    RAW_FOLDER = "../rawTexts/english/"
    PARSED_FOLDER = "english/"

    authors = {}    # author name -> {"author": name, "works": [...]}
    allBooks = []   # raw book dicts accumulated across all parsed texts
    numTexts = 0
    numTexts2 = 0
    available = []

    # ==========================================================================
    # ==========================================================================
    # get all Gutenberg texts
    gutenbergFolder = RAW_FOLDER + "Gutenberg/txt/"
    textList = os.listdir(gutenbergFolder)

    i = 0;
    for author in authorWorks:
        baseAuthorName = author["authorName"]

        # determine whether this author has texts of both genre
        numProse = 0
        numPoetry = 0
        for text in author["works"]:
            workName = text["textName"]
            if (text["genre"] == 0):
                numProse += 1
            elif (text["genre"] == 1):
                numPoetry += 1
        specifyGenre = numProse > 0 and numPoetry > 0
        moreProse = numProse > numPoetry

        for text in author["works"]:
            authorName = baseAuthorName
            # if we are splitting authors by genre, append genre to the "author"
            # of this book as necessary
            if (splitGenre):
                if (specifyGenre):
                    if (text["genre"] == 0):
                        authorName += "_Prose"
                    else:
                        authorName += "_Poetry"
            else:# ignore texts of minority genre for this author
                if (not(moreProse) and text["genre"] == 0) or (moreProse and text["genre"] == 1):
                    continue

            # Ceate a work with each of the given books
            workName = text["textName"]
            newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
            workObject = {"name": workName, "location": "texts/" + newLocation}
            numTexts += 1

            # (book file path, 1-indexed book number) pairs for the converter
            books = []
            bookIndex = 1
            for b in text["books"]:
                books.append((gutenbergFolder + b, bookIndex))
                bookIndex += 1

            # register this work under its author
            if authorName in authors:
                authors[authorName]["works"].append(workObject)
            else:
                authors[authorName] = {"author": authorName, "works": [workObject]}

            print(i, end=" ", flush=True)
            # Create a new gutenberg text for this text.
            try:
                t = utils.GutenbergText(authorName, workName, books)
                numTexts2 += len(books)
                res = t.convert()
                allBooks.extend(res["booksRaw"])
                utils.safeWrite(newLocation, res, True)
            except Exception as e:
                # best-effort: report the failing text and keep going
                print(newLocation)
                print(e)
            i += 1

    # ==========================================================================
    # ==========================================================================
    # get Shakespeare
    for playType in ["comedies", "historical", "tragedies"]:
        shakeFolder = RAW_FOLDER + "ShakespearePlaysPlus/%s/" % playType
        textList = os.listdir(shakeFolder)

        for textName in textList:
            if (textName[-4:] == ".txt"):
                numTexts += 1
                authorName = "Shakespeare"
                workName = textName.replace(" ", "_")
                print(workName)
                newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
                workObject = {"name": workName, "location": "texts/" + newLocation}

                if authorName in authors:
                    authors[authorName]["works"].append(workObject)
                else:
                    authors[authorName] = {"author": authorName, "works": [workObject]}

                t = utils.ShakespeareText(authorName, workName, shakeFolder + textName)
                res = t.convert()
                allBooks.extend(res["booksRaw"])
                utils.safeWrite(newLocation, res, True)

    # ==========================================================================
    # ==========================================================================
    # get Middle English-y texts
    middleFolder = RAW_FOLDER + "ME/"
    textList = os.listdir(middleFolder)

    for textName in textList:
        # filenames look like "Author___Title.txt"
        split = textName[:-4].split("___")
        if (textName[-4:] == ".txt"):
            numTexts += 1
            authorName = split[0].replace(" ", "_")
            workName = split[1].replace(" ", "_")
            print(workName)
            newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
            workObject = {"name": workName, "location": "texts/" + newLocation}

            if authorName in authors:
                authors[authorName]["works"].append(workObject)
            else:
                authors[authorName] = {"author": authorName, "works": [workObject]}

            t = utils.METext(authorName, workName, middleFolder + textName)
            res = t.convert()
            allBooks.extend(res["booksRaw"])
            utils.safeWrite(newLocation, res, True)

    # ==========================================================================
    # Old English and 21st century corpus did not end up being included.

    # get 21st century texts
    # tfFolder = RAW_FOLDER + "21st/"
    # textList = os.listdir(tfFolder)
    #
    # for textName in textList:
    #     split = textName[:-4].split("___")
    #     if (textName[-4:] == ".txt"):
    #         numTexts += 1
    #         authorName = split[0].replace(" ", "_")
    #         workName = split[1].replace(" ", "_")
    #         print(workName)
    #         newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
    #         workObject = {"name": workName, "location": "texts/" + newLocation}
    #         if authorName in authors:
    #             authors[authorName]["works"].append(workObject)
    #         else:
    #             authors[authorName] = {"author": authorName, "works": [workObject]}
    #
    #         t = utils.TFText(authorName, workName, tfFolder + textName)
    #         res = t.convert()
    #         allBooks.extend(res["booksRaw"])
    #         utils.safeWrite(newLocation, res, True)

    # Old English
    # oeFolder = RAW_FOLDER + "OE/"
    # textList = os.listdir(tfFolder)
    #
    # for textName in textList:
    #     split = textName[:-4].split("___")
    #     if (textName[-4:] == ".txt"):
    #         numTexts += 1
    #         authorName = split[0].replace(" ", "_")
    #         workName = split[1].replace(" ", "_")
    #         print(workName)
    #         newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
    #         workObject = {"name": workName, "location": "texts/" + newLocation}
    #         if authorName in authors:
    #             authors[authorName]["works"].append(workObject)
    #         else:
    #             authors[authorName] = {"author": authorName, "works": [workObject]}
    #
    #         t = utils.OEText(authorName, workName, oeFolder + textName)
    #         res = t.convert()
    #         allBooks.extend(res["booksRaw"])
    #         utils.safeWrite(newLocation, res, True)

    # write the author -> works index
    for author in authors:
        available.append(authors[author])
    utils.safeWrite(PARSED_FOLDER + "available.json", available, True)
    print("Done.")

    # Optionally count the characters in the corpus. This is done to find weird
    # Unicode artifacts to make sure it gets removed in the cleaning step.
    countChars = False#True#
    if countChars:
        print("Counting Chars")
        chars = {}
        for b in allBooks:
            bookText = b["bookText"]
            for char in bookText:
                chars[char] = True
        sortedChars = sorted(list(chars.keys()))
        for c in sortedChars:
            utils.printUnicodeChar(c)
        print("======")

        # If true, show the set of unique characters when things are decomposed
        if False:
            decomposedChars = {}
            for c in sortedChars:
                res = utils.fullyDecomposeUnicodeChar(c)
                for newC in res:
                    decomposedChars[newC] = True
            sortedDecompChars = sorted(list(decomposedChars.keys()))
            for c in sortedDecompChars:
                utils.printUnicodeChar(c)
def parseIcelandic():
    """Parse the Icelandic corpora (IcePaHC, the sagas, and MIM books)
    into the shared JSON format and write an available.json index under
    icelandic/.

    NOTE(review): relies on module-level icepahcList, sagasList, and
    modernBookList catalogs plus the utils.*Text converter classes —
    confirm they are defined/imported at file level.
    """
    RAW_FOLDER = "../rawTexts/icelandic/"
    PARSED_FOLDER = "icelandic/"

    authors = {}    # author name -> {"author": name, "works": [...]}
    allBooks = []   # raw book dicts accumulated across all parsed texts
    numTexts = 0
    numTexts2 = 0
    available = []
    allNames = []

    def registerWork(authorName, workName):
        # Build the parsed-file location for this work and record it
        # under its author (creating the author entry on first use).
        # Factored out of the three corpus loops below.
        newLocation = "%s%s-%s.json" % (PARSED_FOLDER, authorName, workName)
        workObject = {"name": workName, "location": "texts/" + newLocation}
        if authorName in authors:
            authors[authorName]["works"].append(workObject)
        else:
            authors[authorName] = {"author": authorName,
                                   "works": [workObject]}
        return newLocation

    # ==========================================================================
    # ==========================================================================
    # get texts from icepahc
    icepahcFolder = RAW_FOLDER + "icepahc-v0.9/txt/"
    for text in icepahcList:
        numTexts += 1
        authorName = text["author"]
        workName = text["title"]
        textName = text["id"] + ".txt"
        allNames.append(workName)

        newLocation = registerWork(authorName, workName)

        t = utils.IcepahcText(authorName, workName, icepahcFolder + textName)
        res = t.convert()
        allBooks.extend(res["booksRaw"])
        utils.safeWrite(newLocation, res, True)

    # ==========================================================================
    # ==========================================================================
    # get texts from sagas
    sagasFolder = RAW_FOLDER + "textar/fornritin/xml/"
    for text in sagasList:
        numTexts += 1
        if (text["id"] == "F1E"):
            # BUG FIX: this previously read `authorName == "Snorri_Sturluson"`
            # — a no-op comparison, so text F1E kept the author name left
            # over from the previous iteration (or raised NameError if it
            # came first) instead of being credited to Snorri.
            authorName = "Snorri_Sturluson"
        else:
            authorName = "Anon_" + text["id"]
        workName = text["title"].strip().replace(" ", "_")
        allNames.append(workName + "#" + text["id"])

        newLocation = registerWork(authorName, workName)

        t = utils.SagasText(authorName, workName,
                            sagasFolder + text["id"] + ".xml")
        res = t.convert()
        allBooks.extend(res["booksRaw"])
        utils.safeWrite(newLocation, res, True)

    # ==========================================================================
    # ==========================================================================
    # get books from MIM corpus
    modernBooksFolder = RAW_FOLDER + "MIM/baekur/"
    for text in modernBookList:
        numTexts += 1
        authorName = text["author"].strip().replace(" ", "_")
        workName = text["title"].strip().replace(" ", "_")
        allNames.append(workName + "#" + text["id"])

        newLocation = registerWork(authorName, workName)

        # MIM texts are in the same format as Saga texts so this works fine.
        t = utils.SagasText(authorName, workName,
                            modernBooksFolder + text["id"] + ".xml")
        res = t.convert()
        allBooks.extend(res["booksRaw"])
        utils.safeWrite(newLocation, res, True)

    # write the author -> works index
    for author in authors:
        available.append(authors[author])
    utils.safeWrite(PARSED_FOLDER + "available.json", available, True)
    print("Done.")

    # Optionally count the characters in the corpus. This is done to find weird
    # Unicode artifacts to make sure it gets removed in the cleaning step.
    countChars = False#True#
    if countChars:
        print("Counting Chars")
        chars = {}
        for b in allBooks:
            bookText = b["bookText"]
            for char in bookText:
                chars[char] = True
        sortedChars = sorted(list(chars.keys()))
        for c in sortedChars:
            utils.printUnicodeChar(c)
        print("======")

        # If true, show the set of unique characters when things are decomposed
        if False:
            decomposedChars = {}
            for c in sortedChars:
                res = utils.fullyDecomposeUnicodeChar(c)
                for newC in res:
                    decomposedChars[newC] = True
            sortedDecompChars = sorted(list(decomposedChars.keys()))
            for c in sortedDecompChars:
                utils.printUnicodeChar(c)
def cleanAndCombineFeatures(texts, approach):
    """Combine the odikon and tamnon features for every text into one
    feature matrix and save it as JSON.

    texts: list of dicts with textName/divideByBook/toBeCombined keys
    approach: odikon feature-extraction approach identifier
    Raises Exception when a text's odikon and tamnon subtexts disagree
    in count or naming.
    """
    matrix = []
    textNames = []
    featureNames = []

    numTexts = len(texts)
    # for all the texts
    for i in range(numTexts):
        text = texts[i]
        textName = text["textName"]
        divideByBook = text["divideByBook"]
        toBeCombined = text["toBeCombined"]

        # skip partial texts that are merged into a combined text elsewhere
        if (toBeCombined or textName == "Iliad1" or textName == "Odyssey1"):
            continue

        ofn = generalUtils.getTextFeatureDataOdikonFn(textName, approach)
        tfn = generalUtils.getTextFeatureDataTamnonFn(textName)
        odikonFeaturesRaw = generalUtils.getContent(ofn, True)
        tamnonFeaturesRaw = generalUtils.getContent(tfn, True)
        if (len(odikonFeaturesRaw) != len(tamnonFeaturesRaw)):
            raise Exception("Number of subtexts for " + textName +
                            " do not match")

        # for each set of features (the books plus the overall text)
        for j in range(len(odikonFeaturesRaw)):
            # get the raw features for this subtext
            ro = odikonFeaturesRaw[j]
            rt = tamnonFeaturesRaw[j]

            # determine the names for these two texts and make sure they match
            roString = ro["TextName"] + ": " + ro["SubName"]
            rtString = rt["TextName"] + ": " + rt["SubName"]
            if (roString != rtString):
                raise Exception("Book mismatch! " + roString + " and " +
                                rtString)

            # add the cleaned features to the row
            row = []
            row.extend(cleanRawOdikon(ro, False))
            row.extend(cleanRawTamnon(rt, False))
            matrix.append(row)
            textNames.append(roString)

            # the first processed subtext defines the feature names.
            # BUG FIX: this previously fired only at i == 0 and j == 0,
            # so the names were never collected when the first text was
            # skipped by the `continue` above.
            if (len(featureNames) == 0):
                featureNames.extend(cleanRawOdikon(ro, True))
                featureNames.extend(cleanRawTamnon(rt, True))

    # output the information.
    # FIX: was a Python-2 `print` statement; parenthesized form works on
    # both Python 2 and 3, matching the rest of the file.
    print("Number of Features: %d." % len(matrix[0]))

    output = {
        "rowNames": textNames,
        "matrix": matrix,
        "featureNames": featureNames
    }
    fName = generalUtils.getFeatureMatrixFn()
    generalUtils.safeWrite(fName, json.dumps(output))
def getCenturyInfo(topStr, baseFolder):
    """Copy the century-similarity charts and statistics for Greek,
    English, and Icelandic into the paper folder, then gather the
    regression p-value/slope info into century_pvals.txt.

    topStr: top-word-set identifier used in the source paths
    baseFolder: destination folder (expects century/ and
        century/extraInfo/ subfolders to exist)
    """
    # Greek: overall century chart, labeled variant, and sim range
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_no_labels.pdf %scentury/centuriesGreek.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_labels.pdf %scentury/extraInfo/Greek_CenturyOverall_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Greek_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)

    # -------------------------
    # Century similarity data
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    # "+p" variants include the poetry-specific top words
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek+p_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek+p_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    # "under_9" variants restrict to centuries below the cutoff
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/centuriesGreek2.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels_violin.pdf %scentury/centuriesGreekViolin.pdf"
        % (topStr, baseFolder),
        shell=True)

    # English charts and sim range
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/English_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesEnglish.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesEnglishViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/English_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Icelandic charts and sim range
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Icelandic_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesIcelandic.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesIcelandicViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Icelandic_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get pvalue + other regression information for charts
    greekPval = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)
    icelandicPval = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)
    pvalOutput.append("Icelandic:")
    pvalOutput.append(icelandicPval)
    utils.safeWrite("%scentury/century_pvals.txt" % baseFolder,
                    "\n".join(pvalOutput))
def keyAuthorComparisonWithImportance(authors, books, baseSaveDir, splitParam, topWords):
    """Build per-word-importance charts and diff/percentile line charts for the
    key author pairs produced by getKeyAuthorData.

    authors, books: corpus objects, passed through to getKeyAuthorData.
    baseSaveDir: output root; must already contain a "dists/" directory with
        diffLists.json, <pair>.json and median-*.json files — presumably written
        by an earlier pipeline stage (TODO confirm producer).
    splitParam: -1 selects "no split" mode; only then are author
        internal-consistency overlays considered.
    topWords: list of top tokens; provides tick labels and the x-axis length.

    Side effects only: writes charts and keyword JSON under
    baseSaveDir + "wordImportance/".
    """
    makeWordImportanceGraphs = False  # expensive grids; disabled by default
    keyAuthData = getKeyAuthorData(authors, books)
    saveDir = baseSaveDir + "wordImportance/"

    # Accumulators keyed by metric name, gathering one line per author pair.
    allDiffLineData = {}
    allCumulDiffLineData = {}
    allRCumulDiffLineData = {}
    allPercentageLineData = {}

    # load diffs for plotting internal similarities
    allDiffsFilename = baseSaveDir + "dists/diffLists.json"
    allDiffs = utils.getContent(allDiffsFilename, True)

    # For each set of key authors, make necessary visaulizations
    for dat in keyAuthData:
        data, _, dataLabels, chartFileName = dat
        print(" %s..." % chartFileName)

        numWords = len(topWords)
        numTexts = len(dataLabels)  # NOTE(review): unused
        tickLabels = topWords

        # Per-pair distances: list of {"name": metric, "vals": ...}; the
        # indexing below assumes each entry of "vals" is (diff, percentile) —
        # TODO confirm against the file producer.
        distsFilename = baseSaveDir + "dists/" + chartFileName + ".json"
        dists = utils.getContent(distsFilename, True)
        # dists = [
        #   {"name": "D1", "vals": (np.random.random((numWords))*1.5 - 0.5)},
        #   {"name": "D2", "vals": (np.random.random((numWords))*1.5 - 0.5)}
        # ]
        for d in dists:
            d["vals"] = np.array(d["vals"])

        if (makeWordImportanceGraphs):
            graphUtils.wordImportanceComparison(data, dataLabels, tickLabels,
                                                dists, saveDir + "unsorted/",
                                                chartFileName, True)

        # display versions sorted by each metric
        for d in dists:
            sortedSaveDir = saveDir + d["name"] + "-sorted/"
            fname = chartFileName

            # Indices that sort words by descending diff (x[1][0] is the diff).
            sortedInds = np.array(
                list(
                    map(
                        lambda x: x[0],
                        sorted(enumerate(d["vals"]),
                               key=lambda x: x[1][0],
                               reverse=True))))

            data1 = copy.deepcopy(data)
            tickLabels1 = copy.deepcopy(tickLabels)
            wordsUsed = len(topWords)
            # If the similarity metric includes remainder, we have to add it
            if (len(dists[0]["vals"]) == len(data[0]) + 1):
                newData = []
                for row in data1:
                    # remainder = probability mass not covered by the top words
                    r = np.append(row, 1 - np.sum(row))
                    newData.append(r)
                data1 = newData
                tickLabels1.append("Remainder")
                wordsUsed += 1

            # Reorder frequencies, labels and all metric values consistently.
            data2 = list(map(lambda x: np.array(x)[sortedInds], data1))
            tickLabels2 = np.array(tickLabels1)[sortedInds]
            dists2 = copy.deepcopy(dists)
            percentiles = []
            for d2 in dists2:
                d2["vals"] = np.copy(d2["vals"])[sortedInds]

            if (makeWordImportanceGraphs):
                graphUtils.wordImportanceComparison(data2, dataLabels,
                                                    tickLabels2, dists2,
                                                    sortedSaveDir, fname, True)

            # save all words
            if d["name"] == "Jensen-shannon":
                fname = saveDir + "keyWords/" + chartFileName + ".json"
                SimDiff = {}
                for i, val in enumerate(d["vals"][sortedInds]):
                    if (True):  # NOTE(review): vestigial filter, always true
                        SimDiff[tickLabels2[i]] = [i, val[1]]
                utils.safeWrite(fname, SimDiff, True)

            # Diff data
            trueDiffs = np.array(
                list(map(lambda x: x[0], d["vals"][sortedInds])))
            y = (chartFileName, trueDiffs)
            y_cumul = (chartFileName, np.cumsum(trueDiffs))
            linesToGraphDiff = [y]
            linesToGraphDiffCumul = [y_cumul]

            # store info for the chart with all authors
            if d["name"] in allDiffLineData:
                allDiffLineData[d["name"]].extend([y])
            else:
                allDiffLineData[d["name"]] = [y]
            if d["name"] in allCumulDiffLineData:
                allCumulDiffLineData[d["name"]].extend([y_cumul])
            else:
                allCumulDiffLineData[d["name"]] = [y_cumul]

            # dif percentile data
            percentiles = list(map(lambda x: x[1], d["vals"][sortedInds]))
            y = (chartFileName, percentiles)
            linesToGraphPct = [y]

            # store info for the chart with all authors
            if d["name"] in allPercentageLineData:
                allPercentageLineData[d["name"]].append(y)
            else:
                allPercentageLineData[d["name"]] = [y]

            if splitParam == -1:
                # get percentiles for internal consistency of second author
                author1 = dataLabels[0]
                author2 = dataLabels[1]
                authorInternalConsistencies = [
                    # ["split5", author1, "-split5"],
                    # ["split-2", author1, "-splitHalf"],
                    # ["split5", author2, "-split5"],
                    # ["split-2", author2, "-splitHalf"]
                ]
                # Gen information comparing consistencies within given authors.
                # (List above is currently empty, so this loop is a no-op.)
                for aic in authorInternalConsistencies:
                    a2DiffsFilename = baseSaveDir.replace(
                        "no_split", aic[0]) + "dists/%s_%s_2.json" % (aic[1],
                                                                      aic[1])
                    if (utils.fileExists(a2DiffsFilename)):
                        a2Diffs = utils.getContent(a2DiffsFilename, True)

                        diffNums = None
                        for ad in allDiffs:
                            if ad["name"] == d["name"]:
                                diffNums = ad["allDiffs"]

                        a2RawDiffs = None
                        for ad in a2Diffs:
                            if ad["name"] == d["name"]:
                                a2RawDiffs = ad["vals"]

                        if (diffNums != None and a2RawDiffs != None):
                            # Add difference data
                            aicName = aic[1] + aic[2]
                            a2SortedInds = np.array(
                                list(
                                    map(
                                        lambda x: int(x[0]),
                                        sorted(enumerate(a2RawDiffs),
                                               key=lambda x: x[1][0],
                                               reverse=True))))
                            trueDiffs = np.array(
                                list(
                                    map(lambda x: x[0],
                                        np.array(a2RawDiffs)[a2SortedInds])))
                            y_diff = (aicName, trueDiffs)
                            y_diff_cumul = (aicName, np.cumsum(trueDiffs))
                            linesToGraphDiff.append(y_diff)
                            linesToGraphDiffCumul.append(y_diff_cumul)

                            # Add Percentile data
                            a2Percentiles = []
                            for rd in a2RawDiffs:
                                # percentile of this diff within all diffs
                                index = bisect.bisect_left(diffNums, rd[0])
                                a2Percentiles.append(
                                    (100.0 * index) / len(diffNums))
                            a2Percentiles = sorted(a2Percentiles, reverse=True)
                            y2 = (aicName, a2Percentiles)
                            linesToGraphPct.append(y2)
                    else:
                        print("File does not exist: \"%s\"" % a2DiffsFilename)

            # Create charts showing differences for various authors
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiff,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-chart",
                                 yLim=None)  #[-0.002, 0]
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  #[-0.002, 0]
            #graphUtils.lineChart(range(wordsUsed), linesToGraphPct, True, sortedSaveDir, chartFileName+"_pct-chart")

            # Reverse-cumulative view: how much diff remains after each word.
            linesToGraphDiffRCumul = []
            for name, c in linesToGraphDiffCumul:
                name = name.replace("-split5", " Local Split")
                name = name.replace("-splitHalf", " Global Split")
                linesToGraphDiffRCumul.append((name, c[-1] - np.array(c)))

            if d["name"] in allRCumulDiffLineData:
                allRCumulDiffLineData[d["name"]].extend(
                    [linesToGraphDiffRCumul])
            else:
                allRCumulDiffLineData[d["name"]] = [linesToGraphDiffRCumul]

            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffRCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-r-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  #[-0.002, 0]

    # NOTE(review): "dists" and "wordsUsed" below are the values leaked from
    # the last iteration of the loops above — the metric set is assumed to be
    # the same for every pair, so this is equivalent but fragile.
    for d in dists:
        # 4-Up Chart for these authors
        sortedSaveDir = saveDir + d["name"] + "-sorted/"
        graphUtils.lineChart4Up(range(wordsUsed),
                                allRCumulDiffLineData[d["name"]],
                                True,
                                sortedSaveDir,
                                "4up-r-cumul",
                                yLim=None,
                                yAdjust=1)

    # Create graph charts for all data in a cloud
    graphTypes = [
        ("all-diffs", allDiffLineData, None, 0),
        ("all-diffs-cumul", allCumulDiffLineData, None, 1),
        #("all-pcts", allPercentageLineData, [0, 100], 0)
    ]
    alls = {}
    for graphType, lineList, yLim, adjust in graphTypes:
        # Precomputed median line plus the full line cloud for this chart type.
        medFilename = baseSaveDir + "dists/median-%s.json" % graphType
        med = utils.getContent(medFilename, True)
        alls[graphType] = {}
        for d in med:
            lineList[d["name"]].append(["Median", d["line"]])
            alls[graphType][d["name"]] = d["all"]
        for name in allPercentageLineData:
            sortedSaveDir = baseSaveDir + "wordImportance/" + name + "-sorted/"
            for log in [False]:  #, True]:
                print(" %s..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType,
                                     yLim=yLim,
                                     log=log,
                                     yAdjust=adjust)
                print(" %s cloud..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType + "-cloud",
                                     yLim=yLim,
                                     allLines=alls[graphType][name],
                                     log=log,
                                     yAdjust=adjust)

    # Create chart showing ignored top words
    n = "Jensen-shannon"
    sortedSaveDir = baseSaveDir + "wordImportance/" + n + "-sorted/"

    # Cumulative
    data = allCumulDiffLineData[n]

    # Add lines
    res = []
    targetSim = -1  # NOTE(review): set below but never read
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        # "Aristotle_Pindar" in name or
        #"AeliusAristides_Demosthenes", "DioChrysostom_Plato"
        if ("ApolloniusRhodius_QuintusSmyrnaeus" in name
                or "DioChrysostom_Xenophon" == name):
            res.append((name, "-", 1 + c[-1] - np.array(c)))
        # Lowest of our top authors
        if ("DioChrysostom_Xenophon" == name):
            targetSim = c[-1]

    # add median
    # for item in allCumulDiffLineData[n]:
    #     name, c = item
    #     if ("Median" in name):
    #         res.append((name, "-", 1 + c[-1] - np.array(c)))

    # Add line cloud
    resAll = []
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        if not ("Hymns_Dionysus" in name or "Euclid" in name):
            # Temporarily collapse "Hymns_" so the pair split on "_" works.
            n1, n2 = name.replace("Hymns_", "Hymns").split("_")
            n1 = n1.replace("Hymns", "Hymns_")
            n2 = n2.replace("Hymns", "Hymns_")
            centuryDiff = centDiff(genre.toCent(n1), genre.toCent(n2))
            #print("%s, %s: %d" % (n1, n2, centuryDiff))
            if (centuryDiff >= 4):
                # color top sims differently
                color = "k-"
                resAll.append((name, color, 1 + c[-1] - np.array(c)))

    # for name, c in data:
    #     y = c[-1] - np.array(c)
    #     res.append((name, y))
    #resAll = map(lambda n, c: (n, c[-1] - np.array(c)))
    graphUtils.compareWordUsageChart(res,
                                     True,
                                     sortedSaveDir,
                                     "ignoreBestWords",
                                     yLim=None,
                                     allLines=resAll)
def gatherFilesFull(topStr, topNum, comparableTopStr, comparableNum, poetryNum):
    """Collect every figure, table and stat needed for the full paper into
    output/full/, copying chart PDFs from per-language result trees and
    generating the LaTeX tables.

    topStr: directory token for the main word set (e.g. "top250" — inferred
        from the paths used here; confirm against callers).
    topNum / poetryNum / comparableNum: word counts shown in table headers.
    comparableTopStr: directory token for the comparable-size non-poetry set.

    Side effects only: creates folders, shells out to `cp`, writes files.
    NOTE(review): the `cp ... shell=True` calls assume a POSIX shell and that
    all source charts were already produced by earlier pipeline stages.
    """
    baseFolder = "output/full/"
    folders = [
        "",
        "data",
        "genre",
        "metric",
        "metric/extraInfo",
        "century",
        "century/extraInfo",
        "wordUse",
        "wordUse/extraInfo",
        "wordUse/grouping",
    ]
    createFolders(folders, baseFolder)

    # Get info for the data section
    getDataInfo(topStr, baseFolder)

    # Get info for approach section
    getWordUseInfo(topStr, baseFolder)

    # Get genre info
    getGenreInfo(topStr, baseFolder)

    # Gather 4up tsne charts for standard data and data normalized by genre
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Authors/tSNE/info_no_labels_4Up.pdf %sgenre/groupings.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/outliers4up.pdf %sgenre/bookOutliers.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get book tsne charts
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_no_labels.pdf %sgenre/books_tSNE_no_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_labels.pdf %sgenre/books_tSNE_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    # To get a look at these, run python3 visualizeBooks

    # Get info for standard and normalized by poetry
    makeMLTable("output/greek/no_split/%s/dataPreds/" % (topStr), False,
                "%sgenre/ml_table.tex" % baseFolder)
    # makeMLTable("output/greek/no_split/%s+p/dataPreds/" % (topStr), False, "%sgenre/ml_table+p.tex" % baseFolder)

    # =========================
    # Get info for results section

    # -----------
    # Metric
    getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  SIM_METRICS, baseFolder)
    makeMetricInternalTables("", topStr, SIM_METRICS, baseFolder)
    makeMetricInternalTables("", topStr + "+p", SIM_METRICS, baseFolder)

    # -----------
    # Century
    # Get information on century comparison
    getCenturyInfo(topStr, baseFolder)

    # Get pvalue + other regression information for charts that are + p
    greekPval = utils.getContent(
        "output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)
    utils.safeWrite("%scentury/century_pvals+p.txt" % baseFolder,
                    "\n".join(pvalOutput))

    # -------------------------
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt %swordUse/authorSims.txt"
        % (topStr, baseFolder),
        shell=True)

    fourCenturiesTables(topStr, SIM_METRICS, baseFolder)

    # get word usage charts and info
    getWordUsageInfo(topStr, baseFolder)
i = 1 allBooks = [] for o in available: workLocs = o["works"] for w in workLocs: if (i % 20 == 0): print("%d out of %d (%.2f%%)" % (i, numTexts, (100 * i / numTexts))) loc = w["location"] if (True or i == TARGET_BOOK): newLoc, books = convertBook(loc) allBooks.extend(books) w["location"] = newLoc i += 1 utils.safeWrite(PARSED_FOLDER + "available.json", available, True) print("Done.") # If desired, analyze the unicode characters in the processed texts. countChars = False #True# if countChars: print("Counting Chars") chars = {} for b in allBooks: bookText = b["bookText"] for char in bookText: chars[char] = True sortedChars = sorted(list(chars.keys())) for c in sortedChars:
def getWordUseInfo(topStr, baseFolder):
    """Write word-use summaries for the approach section: a count summary
    (totalWords.txt) and a 4-column LaTeX table of every token used with its
    rank in the overall top list ("A") and in the poetry top list ("P").

    topStr: directory token for the main word set (e.g. "top250").
    baseFolder: output root; expected to end with "/" (callers pass
        "output/full/"), consistent with the other writes in this function.

    Side effects only: writes two files under <baseFolder>wordUse/.
    """
    # total +p words. Each wordInfo file has a header line (skipped) followed
    # by one "token: ..." line per word.
    tops = utils.getContent(
        "output/greek/no_split/%s/wordInfo_%s.txt" % (topStr, topStr),
        False).split("\n")[1:]
    poetrys = utils.getContent(
        "output/greek/no_split/top_p/wordInfo_top_p.txt",
        False).split("\n")[1:]
    # Top plus poetry
    totals = utils.getContent(
        "output/greek/no_split/%s+p/wordInfo_%s+p.txt" % (topStr, topStr),
        False).split("\n")[1:]

    numWordsOutput = [
        "Number of Top Words: %d" % len(tops),
        "Number of Poetry Words: %d" % len(poetrys),
        "Total Number of Words: %d" % len(totals),
    ]
    # Fixed: was "%s/wordUse/..." which produced a double slash, inconsistent
    # with the "%swordUse/..." pattern used below (baseFolder ends with "/").
    utils.safeWrite("%swordUse/totalWords.txt" % baseFolder,
                    "\n".join(numWordsOutput))

    # Create Table of words: map token -> 1-based rank in each list.
    topRanks = {line.split(":")[0]: i + 1 for i, line in enumerate(tops)}
    poetryRanks = {line.split(":")[0]: i + 1 for i, line in enumerate(poetrys)}

    rankInfo = []
    for line in totals:
        w = line.split(":")[0]
        topRank = "%d" % topRanks[w] if w in topRanks else ""
        poetryRank = "%d" % poetryRanks[w] if w in poetryRanks else ""
        rankInfo.append((w, topRank, poetryRank))

    rankTableOutput = []
    rankTableOutput.append("""
\\begin{table}[!hbt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | l | l ||| l | l | l ||| l | l | l ||| l | l | l |}
    \\hline
    \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P}\\\\\\hline
""")
    # Lay the words out in 4 columns, filled top-to-bottom then left-to-right.
    columnHeight = 43
    for i in range(columnHeight):
        cells = []
        for j in range(4):
            index = i + j * columnHeight
            cell = ""
            if index < len(rankInfo):
                cell = "%s & %s & %s" % rankInfo[index]
            cells.append(cell)
        rankTableOutput.append("%s \\\\\\hline" % (" & ".join(cells)))
    rankTableOutput.append("""
  \\end{tabular}
  \\caption{List of tokens used, along with their rank in the top 150 tokens found in all texts (\\textbf{A}) and rank in the top 100 tokens found in poetry texts (\\textbf{P}).}
  \\label{table:top_words}
\\end{table}
""")
    utils.safeWrite("%swordUse/topWordsTable.tex" % baseFolder,
                    "\n".join(rankTableOutput))
def makeMetricInternalTables(suffix, topStr, simMetrics, baseFolder):
    """For every metric except Jensen-Shannon, build a LaTeX table comparing
    the metric's four option combinations (remainder on/off x +1 smoothing
    on/off) on author/work identification accuracy, with paired t-test
    significance marks against the -remainder-smoothed baseline.

    suffix: appended to output filenames (e.g. "" or a variant tag).
    topStr: directory token for the word set used.
    simMetrics: iterable of (dirName, displayName) pairs.
    baseFolder: output root, expected to end with "/".

    Writes one table per metric plus a combined optionsEvalTables file.
    """
    metricInternalTables = []
    for simMetric in simMetrics:
        dir, metricName = simMetric

        # skip Jensen-Shannon
        if metricName == "Jensen-Shannon":
            continue

        tableOutput = []
        temp = """
\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |}
    \\hline
"""
        tableOutput.append(temp)
        temp = "\\textbf{Metric Options} & \\textbf{Author} & \\textbf{Work} & \\textbf{Total} \\\\\\hline"
        tableOutput.append(temp)

        workSigReport = []
        authorSigReport = []
        totalSigReport = []

        # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

        # (label, directory-option token) pairs; token selects the variant dir.
        metricOptions = [("Baseline", "-remainder-smoothed"),
                         ("+1 Smoothing", "-remainder+smoothed"),
                         ("Remainder", "+remainder-smoothed"),
                         ("Both", "+remainder+smoothed")]

        # Get the list of authors and works the metric got correct.
        # scores.json is assumed to hold {"author": [...0/1...], "work": [...]}
        # — inferred from the indexing below; confirm against the producer.
        scoreLists = {}
        for _, opt in metricOptions:
            scoreLists[opt] = {}
            name = opt
            # Use Poetry Words
            metricTopStr = topStr
            fname = "output/greek/no_split/%s/%s/metric%s/Books/scores.json" % (
                metricTopStr, dir, opt)
            scores = utils.getContent(fname, True)
            scoreLists[opt] = scores
            scoreLists[opt]["name"] = name

        # Significance is always computed against this variant.
        baseScore = scoreLists["-remainder-smoothed"]

        # baseScores = []
        # for bsi in baseScoreInfo:
        #     baseScoreMetric, baseScoreIndex = bsi
        #     baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

        # Create a table of the information using the provided scores
        for optName, opt in metricOptions:
            cell = "\\textbf{%s}" % (optName)
            currentScores = scoreLists[opt]

            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]

            sameWork = "%.2f%%, (%d/%d)" % (
                100 * np.mean(workScores), np.sum(workScores), len(workScores))
            sameAuth = "%.2f%%, (%d/%d)" % (100 * np.mean(authorScores),
                                            np.sum(authorScores),
                                            len(authorScores))
            all = np.concatenate((workScores, authorScores))
            total = "%.2f%%, (%d/%d)" % (100 * np.mean(all), np.sum(all),
                                         len(all))

            wrk = " & %s" % (sameWork)
            auth = " & %s" % (sameAuth)
            tot = " & %s" % (total)

            # Calculate significance (paired t-test vs baseline on the same items)
            a = baseScore["work"]
            b = currentScores["work"]
            work_t, work_p = stats.ttest_rel(a, b)
            workSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            workSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, work_t, work_p)
            workSigReport.append(workSig)

            a = baseScore["author"]
            b = currentScores["author"]
            author_t, author_p = stats.ttest_rel(a, b)
            authorSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            authorSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, author_t, author_p)
            authorSigReport.append(authorSig)

            a = np.concatenate((baseScore["work"], baseScore["author"]))
            b = np.concatenate(
                (currentScores["work"], currentScores["author"]))
            all_t, all_p = stats.ttest_rel(a, b)
            totalSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            totalSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, all_t, all_p)
            totalSigReport.append(totalSig)

            # if (name == bestMetricName or name == baseScore["name"]):
            #     bestMetricSigWork.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigWork.append(workSig)
            #
            #     bestMetricSigAuthor.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigAuthor.append(authorSig)

            #print("    Author: t-statistic = %6.3f pvalue = %f" % stats.ttest_rel(a, b))

            # Significance notes
            if (work_p < 0.01):
                wrk += "\\textbf{†}"
            elif (work_p < 0.05):
                wrk += "\\textbf{*}"

            if (author_p < 0.01):
                auth += "\\textbf{†}"
            elif (author_p < 0.05):
                auth += "\\textbf{*}"

            if (all_p < 0.01):
                tot += "\\textbf{†}"
            elif (all_p < 0.05):
                tot += "\\textbf{*}"

            # wrk += " %.4f" % work_p
            # auth += " %.4f" % author_p
            # tot += " %.4f" % all_p

            cell += "%s%s%s" % (wrk, auth, tot)
            # escape bare % for LaTeX
            cell = cell.replace("%", "\\%")
            tableOutput.append("%s\\\\\\hline" % cell)

        tableOutput.append("\\end{tabular}")
        tableOutput.append("\\caption{")
        tableOutput.append(
            "How well %s performs with the remainder words and smoothing included. "
            % metricName)
        tableOutput.append(
            "†: Results very significant (p < 0.01) when compared to baseline. "
        )
        tableOutput.append(
            "*: Results significant (p < 0.05) when compared to baseline. ")
        tableOutput.append("}")
        tableOutput.append("\\label{table:metric_options_eval_%s}" % dir)
        tableOutput.append("\\end{table}")
        tableOutput.append("")
        tableOutput.append("")

        metricInternalTables.append("\n".join(tableOutput))
        utils.safeWrite(
            "%smetric/%s_optionsEvalTable%s.tex" %
            (baseFolder, metricName, suffix), "\n".join(tableOutput))

    # sigReport = "Work:\n" + ("\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + ("\n".join(bestMetricSigAuthor))
    # utils.safeWrite("%smetric/bestMetricSignificance%s_2.txt" % (baseFolder, suffix), sigReport)
    # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportWork%s_2.txt" % (baseFolder, suffix), "\n".join(workSigReport))
    # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportAuthor%s_2.txt" % (baseFolder, suffix), "\n".join(authorSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/optionsEvalTables%s.tex" % (baseFolder, suffix),
        "\n".join(metricInternalTables))
def makeMetricEvalTables(suffix, topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder):
    """Build LaTeX tables evaluating every similarity metric under three word
    sets (top N, top N + poetry, comparable-size top list) on same-work and
    same-author identification, with paired t-test significance marks against
    the Cosine and Burrows' Delta baselines.

    suffix: appended to output filenames.
    topStr / comparableTopStr: directory tokens for the word sets.
    topNum / poetryNum / comparableNum: counts shown in table headers.
    simMetrics: iterable of (dirName, displayName) pairs.
    baseFolder: output root, expected to end with "/".

    Writes the combined eval table, per-task tables, and significance reports.
    """
    # (metric display name, variant index) pairs used as baselines; index 0 is
    # the plain "Top N" variant of that metric.
    baseScoreInfo = [
        ("Cosine", 0),
        ("Burrows' Delta", 0),
    ]
    bestMetricName = "Jensen-Shannon (250)"  #Jensen-Shannon+p
    bestMetricSigWork = []
    bestMetricSigAuthor = []
    evalTableOutput = []
    evalTableOutput.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | r | r |}
    \\hline
    & \\multicolumn{2}{c|}{\\textbf{Percentage of segments most similar to a segment...}} \\\\
    \\textbf{Metric}& \\textbf{from the same work} & \\textbf{by the same author} \\\\\\hline
""")

    sameWorkTableOutput = []
    sameAuthorTableOutput = []
    temp = """\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |}
    \\hline
"""
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)
    temp = "& & \\textbf{Top %d +} & \\\\" % (topNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)
    temp = "\\textbf{Metric}& \\textbf{Top %d} & \\textbf{Top %d in Poetry} & \\textbf{Top %d} \\\\\\hline" % (
        topNum, poetryNum, comparableNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    workSigReport = []
    authorSigReport = []

    # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

    # Get the list of authors and works the metric got correct.
    # scores.json is assumed to hold {"author": [...0/1...], "work": [...]} —
    # inferred from the indexing below; confirm against the producer.
    scoreLists = {}
    for simMetric in simMetrics:
        dir, metricName = simMetric
        scoreLists[metricName] = {}
        # Variants: (addPoetry, useComparableList)
        for i, params in enumerate([
            (False, False),
            (True, False),
            (False, True),
        ]):
            name = metricName
            addP, comparable = params
            metricTopStr = topStr
            if addP:
                metricTopStr += "+p"
                name += "+p"
            # look at comparable number of non-poetry words
            elif comparable:
                metricTopStr = comparableTopStr
                name += " (%d)" % comparableNum
            else:
                name += " (%d)" % topNum
            fname = "output/greek/no_split/%s/%s/metric/Books/scores.json" % (
                metricTopStr, dir)
            scores = utils.getContent(fname, True)
            scoreLists[metricName][i] = scores
            scoreLists[metricName][i]["name"] = name

    baseScores = []
    for bsi in baseScoreInfo:
        baseScoreMetric, baseScoreIndex = bsi
        baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

    # Create a table of the information using the provided scores
    for metricName in scoreLists:
        cell2 = "\\textbf{%s}" % (metricName)
        cell3 = "\\textbf{%s}" % (metricName)
        for i in scoreLists[metricName]:
            currentScores = scoreLists[metricName][i]

            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]

            sameWork = "%.2f%%" % (100 * np.mean(workScores))
            sameAuth = "%.2f%%" % (100 * np.mean(authorScores))
            # sameWork = "%.2f%%, (%d/%d)" % (100*np.mean(workScores), np.sum(workScores), len(workScores))
            # sameAuth = "%.2f%%, (%d/%d)" % (100*np.mean(authorScores), np.sum(authorScores), len(authorScores))

            # cell = "%s & %s & %s & %s & %s & %s" % (name, sameAuth, sameWork, otherWork, diffAuthClosest, median)
            cell = "%s & %s & %s" % (name, sameWork, sameAuth)
            cell = cell.replace("%", "\\%")
            evalTableOutput.append("%s\\\\\\hline" % cell)

            cell2 += " & %s" % (sameWork)  # work_p
            cell3 += " & %s" % (sameAuth)  # , author_p)

            # Compare this variant against each baseline metric.
            for j, baseScore in enumerate(baseScores):
                a = baseScore["work"]
                b = currentScores["work"]
                work_t, work_p = stats.ttest_rel(a, b)
                workSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                workSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, work_t, work_p)
                workSigReport.append(workSig)

                a = baseScore["author"]
                b = currentScores["author"]
                author_t, author_p = stats.ttest_rel(a, b)
                authorSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                authorSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, author_t, author_p)
                authorSigReport.append(authorSig)

                if (name == bestMetricName or name == baseScore["name"]):
                    bestMetricSigWork.append("%s vs %s" %
                                             (name, baseScore["name"]))
                    bestMetricSigWork.append(workSig)

                    bestMetricSigAuthor.append("%s vs %s" %
                                               (name, baseScore["name"]))
                    bestMetricSigAuthor.append(authorSig)

                #print("    Author: t-statistic = %6.3f pvalue = %f" % stats.ttest_rel(a, b))

                # Significance notes: †/* mark significance vs baseline 0,
                # ‡ marks very-significant vs baseline 1.
                if (j == 0):
                    if (work_p < 0.01):
                        cell2 += "\\textbf{†}"
                    elif (work_p < 0.05):
                        cell2 += "\\textbf{*}"

                    if (author_p < 0.01):
                        cell3 += "\\textbf{†}"
                    elif (author_p < 0.05):
                        cell3 += "\\textbf{*}"
                else:
                    if (work_p < 0.01):
                        cell2 += "\\textbf{‡}"
                    if (author_p < 0.01):
                        cell3 += "\\textbf{‡}"

        cell2 = cell2.replace("%", "\\%")
        sameWorkTableOutput.append("%s\\\\\\hline" % cell2)
        cell3 = cell3.replace("%", "\\%")
        sameAuthorTableOutput.append("%s\\\\\\hline" % cell3)

    evalTableOutput.append("""
  \\end{tabular}
  \\caption{How well similarity metrics identify whether two segments come from the same work or the same author.}
  \\label{table:metric_eval}
\\end{table}
""")
    utils.safeWrite(
        "%smetric/extraInfo/metricEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(evalTableOutput))

    sameWorkTableOutput.append("\\end{tabular}")
    sameWorkTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same work.]{"
    )
    sameWorkTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same work. \\newline"
    )
    sameWorkTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameWorkTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameWorkTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameWorkTableOutput.append("}")
    sameWorkTableOutput.append("\\label{table:metric_eval_work}")
    sameWorkTableOutput.append("\\end{table}")
    utils.safeWrite("%smetric/sameWorkEvalTable%s.tex" % (baseFolder, suffix),
                    "\n".join(sameWorkTableOutput))

    sameAuthorTableOutput.append("\\end{tabular}")
    sameAuthorTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same author.]{"
    )
    sameAuthorTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same author. \\newline"
    )
    sameAuthorTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameAuthorTableOutput.append("}")
    sameAuthorTableOutput.append("\\label{table:metric_eval_author}")
    sameAuthorTableOutput.append("\\end{table}")
    utils.safeWrite(
        "%smetric/sameAuthorEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(sameAuthorTableOutput))

    sigReport = "Work:\n" + (
        "\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + (
            "\n".join(bestMetricSigAuthor))
    utils.safeWrite(
        "%smetric/bestMetricSignificance%s.txt" % (baseFolder, suffix),
        sigReport)
    # utils.safeWrite("%smetric/bestMetricSignificanceWork%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigWork))
    # utils.safeWrite("%smetric/bestMetricSignificanceAuthor%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigAuthor))
    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportWork%s.txt" %
        (baseFolder, suffix), "\n".join(workSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportAuthor%s.txt" %
        (baseFolder, suffix), "\n".join(authorSigReport))
def fourCenturiesTables(topStr, simMetrics, baseFolder):
    """Report author pairs that rank among the most similar yet are at least
    four centuries apart, compare that rate against English and Icelandic
    corpora, and build a LaTeX/CSV table of those pairs' ranks per metric.

    topStr: directory token for the word set used.
    simMetrics: iterable of (dirName, displayName) pairs; column order of the
        pair-rank table is assumed to match this iterable.
    baseFolder: output root, expected to end with "/".

    Writes fourCenturiesApart.txt, fourApartComparisonInfo.txt,
    pairRankTable.tex and pairRankTableSimple.csv under <baseFolder>wordUse/.
    Each line of sims.txt is assumed to look like
    "Rank NNNN - Author1, Author2 (K centuries apart)" — the "(K " suffix and
    the [11:] slice below depend on that exact format; confirm with producer.
    """
    comparisonOutput = []
    topSimsToExamine = 100

    # Grab this from the best metric
    authorSims = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")

    topDistantSims = []
    topDistantAuthors = {}
    for i, sim in enumerate(authorSims):
        # centuries apart is the number right after the final "("
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4 and i < topSimsToExamine):
            topDistantSims.append(sim)
            # key is the pair description (rank prefix stripped)
            topDistantAuthors[sim[11:]] = {}
        authors = " (".join(sim.split(" - ")[1].split(" (")[:-1])
        # Record ranks of a few notable (contemporary) pairs for the report.
        if authors == "Isocrates, Lysias" or authors == "Plato, Xenophon" or authors == "AratusSolensis, Callimachus" or authors == "Herodotus, Thucydides":
            comparisonOutput.append("Rank %d: %s" % (i + 1, sim))

    fourCenturiesApartOutput = []
    fourCenturiesApartOutput.append(
        "%d of the top %d are at least 4 centuries apart." %
        (len(topDistantSims), topSimsToExamine))
    fourCenturiesApartOutput.append("---")
    fourCenturiesApartOutput.extend(topDistantSims)
    utils.safeWrite("%swordUse/fourCenturiesApart.txt" % baseFolder,
                    "\n".join(fourCenturiesApartOutput))

    # Comparison to English and Icelandic: examine the same *fraction* of top
    # pairs in each corpus so the counts are comparable.
    numGreek = len(authorSims)
    fracGreek = topSimsToExamine / numGreek
    numDistantGreek = len(topDistantSims)

    englishSims = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numEnglish = len(englishSims)
    topSimsEnglish = int(np.ceil(numEnglish * fracGreek))
    fracEnglish = topSimsEnglish / numEnglish
    numDistantEnglish = 0
    num2English = 0
    for sim in englishSims[:topSimsEnglish]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 2):
            num2English += 1
        if (centuries_apart >= 4):
            numDistantEnglish += 1

    iceSims = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numIcelandic = len(iceSims)
    topSimsIcelandic = int(np.ceil(numIcelandic * fracGreek))
    fracIcelandic = topSimsIcelandic / numIcelandic
    numDistantIcelandic = 0
    for sim in iceSims[:topSimsIcelandic]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4):
            numDistantIcelandic += 1

    comparisonOutput.append("\n=========\n")
    comparisonOutput.append("Top similar pairs")
    comparisonOutput.append("Greek:")
    comparisonOutput.append(" examining top %d of %d pairs (%.2f%%)" %
                            (topSimsToExamine, numGreek, 100 * fracGreek))
    comparisonOutput.append(
        " %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantGreek, 100 * numDistantGreek / topSimsToExamine))
    comparisonOutput.append("English:")
    comparisonOutput.append(" examining top %d of %d pairs (%.2f%%)" %
                            (topSimsEnglish, numEnglish, 100 * fracEnglish))
    comparisonOutput.append(
        " %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantEnglish, 100 * numDistantEnglish / topSimsEnglish))
    comparisonOutput.append(" %d (%.2f%%) are at least 2 centuries apart" %
                            (num2English, 100 * num2English / topSimsEnglish))
    comparisonOutput.append("Icelandic:")
    comparisonOutput.append(
        " examining top %d of %d pairs (%.2f%%)" %
        (topSimsIcelandic, numIcelandic, 100 * fracIcelandic))
    comparisonOutput.append(
        " %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantIcelandic, 100 * numDistantIcelandic / topSimsIcelandic))

    utils.safeWrite("%swordUse/fourApartComparisonInfo.txt" % baseFolder,
                    "\n".join(comparisonOutput))

    # Table: for each metric, find the rank of every distant pair.
    for simMetric in simMetrics:
        dir, name = simMetric
        # "" or "+p" depending on which is better
        metricSims = utils.getContent(
            "output/greek/no_split/%s/%s/metric/Authors/sims.txt" %
            (topStr, dir), False).split("\n")

        for i, sim in enumerate(metricSims):
            pairName = sim[11:]
            if pairName in topDistantAuthors:
                topDistantAuthors[pairName][dir] = i + 1

    # prepare values for coloring table cells
    maxVal = 0
    minVal = 1000000
    for authorPair in topDistantAuthors:
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]
            minVal = min(minVal, val)
            maxVal = max(maxVal, val)

    pairRankOutput = []
    pairRankOutputSimple = []
    pairRankOutput.append("""
\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c | c | c | c |}
    \\hline
    & \\multicolumn{5}{c|}{\\textbf{Rank according to}} \\\\
    & \\textbf{Jensen-} & \\textbf{Burrows'} & & & & \\\\
    \\textbf{Authors} & \\textbf{Shannon} & \\textbf{Delta} & \\textbf{Min-Max} & \\textbf{Manhattan} & \\textbf{Canberra} & \\textbf{Cosine} \\\\\\hline
""")
    pairRankOutputSimple.append(
        "%s,%s,%s,%s,%s,%s,%s" %
        ("Authors", "Jensen-Shannon", "Burrow's Delta", "Min-Max", "Manhattan",
         "Canberra", "Cosine"))

    # Shorten long author names so the table fits.
    authorConvert = {
        "ApolloniusRhodius": "Apollonius",
        "DionysiusOfHalicarnassus": "Dionysius",
        "EusebiusOfCaesarea": "Eusebius",
        "ClementOfAlexandria": "Clement",
        "BasilBishopOfCaesarea": "Basil",
        "Anonymous(Hymns_Aphrodite)": "Hymns Aphrodite",
        "Anonymous(Hymns_Apollo)": "Hymns Apollo",
        "Anonymous(Hymns_Demeter)": "Hymns Demeter",
        "Anonymous(Hymns_Hermes)": "Hymns Hermes",
        "Anonymous(Hymns_Rest)": "Hymns Rest",
    }

    for authorPair in topDistantAuthors:
        # strip the trailing "(K centuries apart)" part
        pair = "(".join(authorPair.split(" (")[:-1])
        pairSplit = pair.split(", ")
        author1 = pairSplit[0]
        author2 = pairSplit[1]
        if author1 in authorConvert:
            author1 = authorConvert[author1]
        if author2 in authorConvert:
            author2 = authorConvert[author2]
        pairName = author1 + ", " + author2
        cell = "%s &" % pairName
        cellSimple = "%s," % re.sub(", ", "/", pairName)
        firstVal = None
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]
            # Color: orange->gray below the cutoff rank, gray->blue above it.
            cutoff = 100
            if (val < cutoff):
                r, g, b = colorConvert(minVal, cutoff, val, COLOR_ORANGE,
                                       COLOR_GRAY)
            else:
                r, g, b = colorConvert(cutoff, maxVal, val, COLOR_GRAY,
                                       COLOR_BLUE)
            cell += "\\cellcolor[rgb]{%.3f,%.3f,%.3f} " % (r, g, b)
            if (firstVal == None):
                # first metric's rank; later columns show the delta from it
                firstVal = val
                cell += "%d & " % (val)
                cellSimple += "%d," % (val)
            else:
                cell += "%d (%+d) & " % (val, firstVal - val)
                rel = "(%d)" % (firstVal - val)
                cellSimple += "%d %s," % (val, rel)
        # drop the trailing "& "
        cell = cell[:-2]
        pairRankOutput.append("%s\\\\\\hline" % cell)
        pairRankOutputSimple.append(cellSimple)

    pairRankOutput.append("""
  \\end{tabular}
  \\caption{Rank of these pair's similarity by different metrics.}
  \\label{table:pair_rank}
\\end{table}
""")
    utils.safeWrite("%swordUse/pairRankTable.tex" % baseFolder,
                    "\n".join(pairRankOutput))
    utils.safeWrite("%swordUse/pairRankTableSimple.csv" % baseFolder,
                    "\n".join(pairRankOutputSimple))
def commit(self, jobNum, jobObj): fp = open(os.path.join(self._dbPath, 'job_%d.txt' % jobNum), 'w') utils.safeWrite(fp, utils.DictFormat(escapeString = True).format(jobObj.getAll()))