# CreateStupidPredictions.py

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

sourcedirectory = "/Users/tunder/Dropbox/pagedata/mixedtraining/genremaps/"

dirlist = os.listdir(sourcedirectory)

validnames = list()

for filename in dirlist:
    if not (filename.startswith(".") or filename.startswith("_")):
        validnames.append(filename)

for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)
    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()
    numpages = len(filelines)
    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching
    genre = "unknown"
# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
    fields = genrestring.split(';')
    if "Autobiography" in fields or "Biography" in fields:
        return False
    else:
        return True

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')
rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding='utf-8') as f:
    filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()
missing = 0

for anid in idlist:
    if anid in rows19c:
        genrestring = table19c["genres"][anid]
        rowdict = dict()
        for col in columns19c:
            rowdict[col] = table19c[col][anid]
    elif anid in rows20c:
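# The excerpt breaks off inside the 20c branch. By symmetry with the 19c
# branch above, it presumably continues along these lines (inferred from the
# parallel structure, not recovered from the original):
#
#         genrestring = table20c["genres"][anid]
#         rowdict = dict()
#         for col in columns20c:
#             rowdict[col] = table20c[col][anid]
#     else:
#         missing += 1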
def main():
    global testrun, datapath, slicepath, metadatapath, current_working, metaoutpath, errorpath, pagevocabset

    if testrun:
        filelist = os.listdir(datapath)
        HTIDs = set()
        for afilename in filelist:
            if not (afilename.startswith(".") or afilename.startswith("_")):
                HTIDs.add(afilename)
    else:
        with open(slicepath, encoding="utf-8") as file:
            HTIDlist = file.readlines()
        HTIDs = set([x.rstrip() for x in HTIDlist])
        del HTIDlist

    ## discard bad volume IDs

    with open(metadatapath + "badIDs.txt", encoding='utf-8') as file:
        filelines = file.readlines()

    for line in filelines:
        line = line.rstrip()
        line = line.split(delim)
        if line[0] in HTIDs:
            HTIDs.discard(line[0])

    if not os.path.isfile(metaoutpath):
        with open(metaoutpath, 'w', encoding='utf-8') as f:
            f.write("volID\ttotalwords\tprematched\tpreenglish\tpostmatched\tpostenglish\n")

    print(len(HTIDs))

    # Let's get some metadata to create metadata features.

    if testrun:
        rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
    else:
        rowindices, columns, metadata = utils.readtsv("/projects/ichass/usesofscale/hathimeta/ExtractedMetadata.tsv")

    metadata_clues = list()
    for aHTID in HTIDs:
        evidence = get_metadata_evidence(aHTID, rowindices, columns, metadata)
        metadata_clues.append(evidence)

    assert len(HTIDs) == len(metadata_clues)
    file_tuples = zip(HTIDs, metadata_clues)

    pool = Pool(processes=12)
    res = pool.map_async(process_a_file, file_tuples)

    # After all files are processed, write metadata, errorlog, and counts of phrases.

    res.wait()
    resultlist = res.get()

    processedmeta = list()
    errorlog = list()
    phrasecount = dict()

    for file_dict in resultlist:
        processedmeta.append(file_dict["metadata"])
        errorlog.extend(file_dict["errors"])
        htid = file_dict["htid"]

    # Metadata.

    with open(metaoutpath, mode='a', encoding='utf-8') as file:
        for metatuple in processedmeta:
            outlist = [x for x in metatuple]
            outline = delim.join(outlist) + '\n'
            file.write(outline)

    # Write the errorlog.

    if len(errorlog) > 0:
        with open(errorpath, mode='w', encoding='utf-8') as file:
            for line in errorlog:
                file.write(line + '\n')

    # Write phrase counts.
    # with open(phrasecountpath, mode="w", encoding="utf-8") as file:
    #     j = json.dumps(phrasecount)
    #     file.write(j)

    print("Done.")
    pool.close()
    pool.join()
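# process_a_file is defined elsewhere in this script. Judging from how the
# results are unpacked above, each worker call must return a dict with at
# least the keys "htid", "metadata", and "errors". This stub only documents
# that assumed shape; it is not the real worker, and the column values are
# hypothetical:
def process_a_file_stub(file_tuple):
    htid, metadata_evidence = file_tuple
    return {"htid": htid,
            "metadata": (htid, "0", "0", "0", "0", "0"),  # matches the six-column header written to metaoutpath
            "errors": []}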
print()
# print("ROUGH MICROACCURACY:")
# print(roughaccuracy)
print("SMOOTHED MICROACCURACY:")
print(smoothaccuracy)
print("COALESCED MICROACCURACY:")
print(coalaccuracy)

with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv", mode="w", encoding="utf-8") as f:
    f.write("htid\taccuracy\n")
    for key, value in accuracies.items():
        outline = key + "\t" + str(value) + "\n"
        f.write(outline)

metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)

metadatatable['maxprob'] = metadata['maxprob']
metadatatable['gap'] = metadata['gap']
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile

data = pd.DataFrame(metadatatable, dtype="float")
data['intercept'] = 1.0
train_cols = data.columns[1:]
logit = sm.Logit(data['accuracy'], data[train_cols])
result = logit.fit()
print(result.summary())

predictions = result.predict(data[train_cols])
print(pearsonr(data['accuracy'], predictions))
##

import os, sys
import SonicScrewdriver as utils
import random

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

initialsample = random.sample(rowindices, 2000)

directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")

existingfiles = list()

for filename in directorylist:
    if filename.startswith(".") or filename.startswith("_"):
        continue
    htid = utils.pairtreelabel(filename[0:-7])
    existingfiles.append(htid)

counter = 0
toremove = list()
for htid in initialsample:
    if htid in existingfiles:
        counter += 1
        toremove.append(htid)

print("Found " + str(counter) + " duplicates.")

for htid in toremove:
    initialsample.remove(htid)
    title = fields[2]
    author = fields[3] + ', ' + fields[4]
    date = fields[8]

    filename = idcode + '.txt'
    filepath = os.path.join(sourcedir, filename)

    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)

    newrow = [idcode, date, tokencount, wordcount, author, title]
    outtable.append(newrow)
    print(counter)
    counter += 1

rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)

    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
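# count_words is not defined in this excerpt. A minimal sketch of what it
# presumably does, assuming "tokencount" counts all whitespace-separated
# tokens and "wordcount" counts only purely alphabetic ones (an inference
# from the variable names, not the original implementation):
def count_words_sketch(filepath):
    tokencount = 0
    wordcount = 0
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            for token in line.split():
                tokencount += 1
                if token.isalpha():
                    wordcount += 1
    return tokencount, wordcount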
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
modelindices, modelcolumns, modeldata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")

options = ["non", "bio", "poe", "dra", "fic"]

def censor(htid, genresequence):

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.
    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0
# plotter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
from scipy.stats import pearsonr

indices, columns, agreement = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv")
indices2, columns2, confidence = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/MachineConfidence.tsv")

for idx in indices:
    if idx not in indices2:
        print(idx + " is missing.")

makeframe = dict()
makeframe["human-agreement"] = agreement["agreement"]
makeframe["machine-confidence"] = confidence["accuracy"]

df = pd.DataFrame(makeframe, dtype="float")
df = df.dropna()

print(str(pearsonr(df["human-agreement"], df["machine-confidence"])))

plt.plot(df["human-agreement"], df["machine-confidence"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine confidence")
plt.axis([0, 1, 0, 1])
plt.show()
else:
    with open(slicepath, encoding='utf-8') as f:
        filelines = f.readlines()
    idstoget = set([x.strip() for x in filelines])
    getall = False
    startdir = 0
    enddir = 100

with open(wordpath, encoding='utf-8') as f:
    filelines = f.readlines()

wordstoget = set([x.strip() for x in filelines])

metafile = '/projects/ichass/usesofscale/hathimeta/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metafile)

subdirectories = [os.path.join(d, o) for o in os.listdir(d) if os.path.isdir(os.path.join(d, o))]

wordcounts = dict()

counter = 0

if slicepath in directoryslices:
    outputpath = os.path.join(outdir, slicepath + ".tsv")
else:
    outputpath = os.path.join(outdir, "extracted_words.tsv")

# Get a dictionary so you can count dictionary words.
lexicon = set()
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

options = ["non", "bio", "poe", "dra", "fic"]

with open("/Users/tunder/Dropbox/pagedata/litlocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()

litlocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    litlocs[fields[0]] = int(round(1000 * float(fields[1])))

with open("/Users/tunder/Dropbox/pagedata/biolocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()

biolocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    biolocs[fields[0]] = int(round(1000 * float(fields[1])))

def letterpart(locnum):
    if locnum == "<blank>":
        return "<blank>"
    letterstring = ""
    for char in locnum:
        if char.isalpha():
            letterstring += char.upper()
        else:
            break
    if len(letterstring) > 2:
        letterstring = letterstring[:2]
    if len(letterstring) > 1 and letterstring[0] == "N":
        letterstring = "N"
    if len(letterstring) > 1 and letterstring[0] == "V":
        letterstring = "V"
    return letterstring
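# A quick illustration of letterpart (call numbers here are hypothetical,
# assuming Library of Congress classmarks): it keeps the leading class
# letters, truncated to two, and collapses the N and V classes to one letter.
#
#     letterpart("PR4034 .P7 1990")  ->  "PR"
#     letterpart("NC1234 .A5")       ->  "N"
#     letterpart("<blank>")          ->  "<blank>"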
reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1880-1899_200.csv'

with open(reviews) as f:
    reader = csv.reader(f)
    for fields in reader:
        htid = fields[0]
        if htid == "HTid":
            continue
        jgenre = fields[13]
        date = int(fields[1])
        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv')

bydate = dict()

for row in rows:
    if row in selected:
        continue
    date = utils.simple_date(row, table)
    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()
# sort_anovaset.py

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/19cmetadata.tsv')

with open('anovaset.txt', encoding='utf-8') as f:
    filelines = f.readlines()

wholeset = [x.rstrip() for x in filelines]

the19c = list()
the20c = list()

for anid in wholeset:
    if anid in rows:
        the19c.append(anid)
    else:
        the20c.append(anid)

with open('anova19c.txt', mode='w', encoding='utf-8') as f:
    for anid in the19c:
        f.write(anid + '\n')

with open('anova20c.txt', mode='w', encoding='utf-8') as f:
    for anid in the20c:
        f.write(anid + '\n')
# Figures out what call numbers mean for genre

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

options = ["non", "bio", "poe", "dra", "fic"]

modelindices, modelcolumns, modeldata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")

def keywithmaxval(dictionary):
    maxval = 0
    maxkey = ""
    for key, value in dictionary.items():
        if value > maxval:
            maxval = value
            maxkey = key
    return maxkey

def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to a dictionary
    of counts reflecting the number of pages assigned to each genre.
    Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
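# The excerpt cuts sequence_to_counts off mid-definition. A minimal sketch of
# how it presumably finishes, assuming the remaining genre keys follow the
# options list above and that keywithmaxval picks the largest genre (an
# inference from the docstring, not the original code):
def sequence_to_counts_sketch(genresequence):
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0, 'bio': 0}
    for genre in genresequence:
        if genre in genrecounts:
            genrecounts[genre] += 1
    maxgenre = keywithmaxval(genrecounts)
    return genrecounts, maxgenre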
# Generate Cotraining Set
#
# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.

import SonicScrewdriver as utils
from shutil import copyfile

indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

toget = indices[-200:]
toget = [utils.pairtreefile(x) for x in toget]

genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"

for htid in toget:
    featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
    featuredestination = featuredir + htid + ".pg.tsv"
    copyfile(featuresource, featuredestination)

    genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
    genredestination = genredir + htid + ".map"

    with open(genresource, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    with open(genredestination, mode="w", encoding="utf-8") as f:
        for line in filelines:
            line = line.rstrip()
        if htid in wordcountsbyfile:
            wordcountsbyfile[htid].append(count)
        else:
            wordcountsbyfile[htid] = [count]

    return wordcountsbyfile

# Begin main script.

TOL = 0.1
THRESH = 0.80

genrestocheck = ['fic', 'poe', 'dra']

metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadatapath)

firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/"
secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/"

firstmaps = os.listdir(firstsource)
secondmaps = os.listdir(secondsource)

firstwordcounts = loadwordcounts(firstsource)
secondwordcounts = loadwordcounts(secondsource)

predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/'
predicts = os.listdir(predictsource)
predicts = [x for x in predicts if not x.startswith('.')]
import pickle

import SonicScrewdriver as utils

modelfolder = "/Volumes/TARDIS/work/moneycontext/"

modelpath = modelfolder + "logisticmodel.p"
with open(modelpath, mode='rb') as f:
    logisticmodel = pickle.load(f)

standardizerpath = modelfolder + 'standardizer.p'
with open(standardizerpath, mode='rb') as f:
    standardizer = pickle.load(f)

featurepath = modelfolder + 'featurelist.p'
with open(featurepath, mode='rb') as f:
    features = pickle.load(f)

# Now load HathiTrust metadata.

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

ambiguouswords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'}

moneywords = {'dollar', 'dollars', 'dime', 'dimes', 'nickel', 'nickels', 'pound', 'pounds', 'shilling', 'shillings',
              'sovereign', 'sovereigns', 'cent', 'cents', 'centime', 'centimes', 'crown', 'crowns', 'halfcrown',
              'half-crown', 'penny', 'pennies', 'pence', 'farthing', 'farthings', 'franc', 'francs', 'guilder',
              'guilders', 'florin', 'florins', 'guinea', 'guineas', "ha'penny", 'tuppence', 'twopence', 'sixpence',
              '|arabicprice|', '|price|', 'quid'}

# Words I explicitly decided not to include: 'quarter', 'quarters', 'mark', 'marks.'
# Monetary uses seemed rare enough relative to others that they'd be more likely
# to introduce noise than to help.
#
# |arabicprice| is a code the tokenizer in modelingcounter produces whenever it
# encounters a number connected to £, $, ¢, s, or d. In the output we convert
# that to |price|, for no very good reason.

wealthwords = {'fortune', 'fortunes', 'wealth', 'rich', 'riches', 'money', 'moneys', 'fund', 'funds',
               'sum', 'sums', 'price', 'prices', 'priced'}

# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially
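# A minimal sketch of how the three pickled objects above would presumably be
# used together, assuming a scikit-learn LogisticRegression and StandardScaler
# (which the .p files suggest but do not guarantee): vectorize one context
# window over the feature list, standardize it, and ask for a probability.
# classify_context_sketch and its wordcounts argument are hypothetical names.
import numpy as np

def classify_context_sketch(wordcounts):
    # wordcounts: dict mapping word -> count for one context window
    vector = np.array([[wordcounts.get(word, 0) for word in features]])
    vector = standardizer.transform(vector)
    return logisticmodel.predict_proba(vector)[0][1]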
# make fiction subset

import SonicScrewdriver as utils

rows, columns, table = utils.readtsv("/Users/tunder/Dropbox/bookNLP/metadata/enrichedmetadataDec6.tsv")

datedict = dict()
selected = []

for row in rows:
    date = int(table["date"][row])
    if date in datedict:
        datedict[date] += 1
    else:
        datedict[date] = 1

    if datedict[date] > 3:
        continue
    else:
        selected.append(row)

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/fictionsubset.txt", mode='w', encoding='utf-8') as f:
    for line in selected:
        f.write(line + '\n')
# plotter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
from scipy.stats import pearsonr

indices, columns, agreement = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv")
indices2, columns2, confidence = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv")

for idx in indices:
    if idx not in indices2:
        print(idx + " is missing.")

makeframe = dict()
makeframe["human-agreement"] = agreement["agreement"]
makeframe["machine-accuracy"] = confidence["accuracy"]

df = pd.DataFrame(makeframe, dtype="float")
df = df.dropna()

print(str(pearsonr(df["human-agreement"], df["machine-accuracy"])))

plt.plot(df["human-agreement"], df["machine-accuracy"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine accuracy")
plt.axis([0, 1.02, 0, 1.02])
plt.show()
def add_counts(wordcounts, year, word, count):
    if year in wordcounts:
        if word in wordcounts[year]:
            wordcounts[year][word] += count
        else:
            wordcounts[year][word] = count
    else:
        wordcounts[year] = dict()
        wordcounts[year][word] = count

metafile = '/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv'
rows, columns, table = utils.readtsv(metafile)

dateindex = dict()

for volid in rows:
    startdate = table["startdate"][volid]
    enddate = table["enddate"][volid]
    textdate = table["textdate"][volid]
    intdate = infer_date(startdate, enddate, textdate)

    if intdate >= 1750 and intdate <= 1950:
        if intdate in dateindex:
            dateindex[intdate].append(volid)
        else:
            dateindex[intdate] = [volid]
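# infer_date is not defined in this excerpt. A plausible sketch, assuming
# string metadata fields that may be blank and a text date that overrides the
# catalog range when present (an assumption, not the original logic):
def infer_date_sketch(startdate, enddate, textdate):
    def to_int(datestring):
        try:
            return int(datestring)
        except ValueError:
            return 0
    text = to_int(textdate)
    if text > 0:
        return text
    start = to_int(startdate)
    end = to_int(enddate)
    if start > 0 and end > 0:
        return (start + end) // 2
    return max(start, end)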
# epistolarymetadata.py
#
# This module ingests metadata created by Clara Mount in the summer of 2014,
# and uses it to return information about genre in a group of novels. We
# are especially interested in the boundary between epistolary and
# non-epistolary fiction, which can be configured in a variety of ways.

import SonicScrewdriver as utils
import numpy as np

epindices, epcolumns, epmetadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/classify/HathiGenreInfo-Epist.txt")
nonindices, noncolumns, nonmetadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/classify/HathiGenreInfo-NonEpist2.txt")

def get_genrevector(volumeIDs, boundarydef):
    global epindices, nonindices

    n = len(volumeIDs)

    genrevector = np.zeros(n)

    if boundarydef == "nonepistolary / epistolary":
        for idx, volID in enumerate(volumeIDs):
            cleanID = utils.pairtreelabel(volID)
            if cleanID in epindices:
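# The excerpt breaks off here. A plausible continuation, assuming epistolary
# volumes are coded 1 and everything else stays at the zero the vector was
# initialized with (an assumption, not the module's confirmed behavior):
#
#                 genrevector[idx] = 1
#             elif cleanID in nonindices:
#                 genrevector[idx] = 0
#
#     return genrevector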