forked from tedunderwood/HathiGenreTrainingset
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MetadataCensor.py
143 lines (107 loc) · 4.57 KB
/
MetadataCensor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
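# Uses metadata (and model predictions) to help assess the degree to which a
# volume's page-level genre sequence is confirmed or contradicted.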
import os, sys
import SonicScrewdriver as utils
# Locations of the two tab-separated tables consulted by this module.
METADATA_PATH = "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv"
MODEL_TABLE_PATH = "/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt"

# Both tables are read once, when the module is imported.
# NOTE(review): readtsv presumably returns (row ids, column names,
# column -> {row id -> value}); confirm against SonicScrewdriver.
rowindices, columns, metadata = utils.readtsv(METADATA_PATH)
modelindices, modelcolumns, modeldata = utils.readtsv(MODEL_TABLE_PATH)

# Columns of the model table that hold genre scores; any other column is
# ignored when reading predictions.
options = ["non", "bio", "poe", "dra", "fic"]
# Flags reported for every volume, in the order they are added to the result.
_SYMPTOMS = (
    "weakconfirmation",
    "weakdenial",
    "strongconfirmation",
    "strongdenial",
    "modelagrees",
    "modeldisagrees",
)

# Page labels that are counted as nonfiction when finding the dominant genre.
_COUNTED_AS_NONFICTION = ("bio", "index", "back")

# Metadata labels that give strong evidence, mapped to the genre they imply.
_STRONG_LABELS = {
    "Fiction": "fic",
    "Novel": "fic",
    "Biography": "non",
    "Autobiography": "non",
    "Poetry": "poe",
    "Poems": "poe",
    "Drama": "dra",
    "Tragedies": "dra",
    "Comedies": "dra",
    "Catalog": "non",
    "Dictionary": "non",
    "Bibliographies": "non",
}

# Labels that always rule out fiction, whatever the page-level genre is.
_ALWAYS_NOT_FICTION = ("Biography", "Autobiography")

# Labels that rule out fiction only when the pages already look like
# nonfiction.
_REFERENCE_LABELS = ("Catalog", "Dictionary", "Bibliographies")


def _dominant_genre(genresequence):
    """Return the genre code that covers the most pages in genresequence."""
    genrecounts = {"fic": 0, "poe": 0, "dra": 0, "non": 0}
    for page in genresequence:
        # Biography, indexes and back matter are treated as nonfiction here.
        indexas = "non" if page in _COUNTED_AS_NONFICTION else page
        utils.addtodict(indexas, 1, genrecounts)

    # Sort the counts in descending order and take the genre at the top.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    return genretuples[0][1]


def _score_metadata_label(info, maxgenre, reported):
    """Set flags in reported for one metadata label.

    Returns True when the label means the volume cannot be fiction.
    """
    rules_out_fiction = info in _ALWAYS_NOT_FICTION

    if info == "biog?":
        if maxgenre == "non":
            reported["weakconfirmation"] = 1
        else:
            reported["weakdenial"] = 1
    elif info == "Not fiction":
        # Only a fiction-dominated volume contradicts "Not fiction"; poetry
        # and drama are left unscored.
        if maxgenre == "non":
            reported["weakconfirmation"] = 1
        elif maxgenre == "fic":
            reported["weakdenial"] = 1
    elif info in _STRONG_LABELS:
        if maxgenre == _STRONG_LABELS[info]:
            reported["strongconfirmation"] = 1
            if info in _REFERENCE_LABELS:
                rules_out_fiction = True
        else:
            reported["strongdenial"] = 1

    return rules_out_fiction


def censor(htid, genresequence):
    """Compare a volume's page-level genres with metadata and model evidence.

    Args:
        htid: volume identifier; it is converted with utils.pairtreelabel
            before being looked up in the metadata and model tables.
        genresequence: list of per-page genre codes. It is modified in
            place: when the metadata rules out fiction, every "fic" page is
            relabelled "non".

    Returns:
        A tuple (genresequence, reported). reported maps each of
        "weakconfirmation", "weakdenial", "strongconfirmation",
        "strongdenial", "modelagrees" and "modeldisagrees" to 0 or 1. All
        flags are 0 when the volume appears in neither table.
    """
    # Convert the htid into a dirty pairtree label for metadata matching.
    htid = utils.pairtreelabel(htid)

    # Every condition starts out negative.
    reported = {symptom: 0 for symptom in _SYMPTOMS}

    # Look the volume up once in each table; the answers are reused below.
    in_metadata = htid in rowindices
    in_model = htid in modelindices
    if not in_metadata and not in_model:
        return genresequence, reported

    maxgenre = _dominant_genre(genresequence)
    couldbefiction = True

    if in_metadata:
        # The genres field is a semicolon-delimited list of labels.
        for info in metadata["genres"][htid].split(";"):
            if _score_metadata_label(info, maxgenre, reported):
                couldbefiction = False
    else:
        print("Skipped.")

    if in_model:
        # Collect the model's score for each genre column; other columns in
        # the table are not genres and are skipped.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)

        # Take the top prediction, treating biography as nonfiction. Only
        # agreement is recorded, not the model's confidence margin.
        modelprediction = predictionlist[0][1]
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
        else:
            reported["modeldisagrees"] = 1

    if not couldbefiction:
        for i, page in enumerate(genresequence):
            if page == "fic":
                genresequence[i] = "non"

    return genresequence, reported