def sequence_to_counts(genresequence):
    '''Tally the page-level genre predictions for one volume.

    Every page label is counted under its own name. A page labelled "bio"
    is additionally counted as a vote for "non", so biography is tallied
    both on its own and as part of nonfiction.

    Returns a 2-tuple: the dictionary of counts, and the label that ranks
    first by count. "bio" is never reported as that label; "non" is
    returned in its place.
    '''
    tally = dict()

    for label in genresequence:
        utils.addtodict(label, 1, tally)
        if label == 'bio':
            # A biography page also votes for nonfiction.
            utils.addtodict('non', 1, tally)

    # Rank the labels by count, largest first. Each ranked entry is read
    # as (count, label).
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)
    toplabel = ranked[0][1]

    return tally, ('non' if toplabel == 'bio' else toplabel)
def select_common_features(trainingset, n):
    '''Return the n most frequent features found in the training set.

    The rawcounts of every volume in trainingset are merged into a single
    dictionary, and the features are then ranked by their summed count.
    If fewer than n features exist, all of them are returned and a notice
    is printed. This is deliberately a very simple selection strategy.
    '''
    totals = dict()
    for volume in trainingset:
        # add_dicts accumulates this volume's raw counts into totals.
        utils.add_dicts(volume.rawcounts, totals)

    # Ranked entries are (frequency, word) pairs, most frequent first.
    ranked = utils.sortkeysbyvalue(totals, whethertoreverse=True)

    available = len(ranked)
    if available < n:
        n = available
        print("We only have " + str(n) + " features.")

    # Keep only the word from each of the first n pairs.
    return [pair[1] for pair in ranked[:n]]
def select_common_features(trainingset, n):
    '''Return the n most frequent features found in the training set.

    The rawcounts of every volume in trainingset are merged into a single
    dictionary, and the features are then ranked by their summed count.
    If fewer than n features exist, all of them are returned and a notice
    is printed. This is deliberately a very simple selection strategy.
    '''
    totals = dict()
    for volume in trainingset:
        # add_dicts accumulates this volume's raw counts into totals.
        utils.add_dicts(volume.rawcounts, totals)

    # Ranked entries are (frequency, word) pairs, most frequent first.
    ranked = utils.sortkeysbyvalue(totals, whethertoreverse=True)

    available = len(ranked)
    if available < n:
        n = available
        print("We only have " + str(n) + " features.")

    # Keep only the word from each of the first n pairs.
    return [pair[1] for pair in ranked[:n]]
def sequence_to_counts(genresequence):
    '''Tally the page-level genre predictions for one volume.

    The counts dictionary always contains the four keys 'fic', 'poe',
    'dra' and 'non', each starting at zero. Pages labelled "bio",
    "index", "back" or "trv" are counted as "non"; any other label is
    counted under its own name.

    Returns a 2-tuple: the dictionary of counts, and the label that ranks
    first by count.
    '''
    # Labels that are folded into nonfiction for this tally.
    nonfiction_aliases = ("bio", "index", "back", "trv")

    tally = dict.fromkeys(('fic', 'poe', 'dra', 'non'), 0)

    for label in genresequence:
        bucket = "non" if label in nonfiction_aliases else label
        utils.addtodict(bucket, 1, tally)

    # Rank the labels by count, largest first. Each ranked entry is read
    # as (count, label).
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    return tally, ranked[0][1]
def sequence_to_counts(genresequence):
    '''Tally the page-level genre predictions for one volume.

    Every page label is counted under its own name. A page labelled "bio"
    is additionally counted as a vote for "non", so biography is tallied
    both on its own and as part of nonfiction.

    Returns a 2-tuple: the dictionary of counts, and the label that ranks
    first by count. "bio" is never reported as that label; "non" is
    returned in its place.
    '''
    tally = dict()

    for label in genresequence:
        utils.addtodict(label, 1, tally)
        if label == 'bio':
            # A biography page also votes for nonfiction.
            utils.addtodict('non', 1, tally)

    # Rank the labels by count, largest first. Each ranked entry is read
    # as (count, label).
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)
    toplabel = ranked[0][1]

    return tally, ('non' if toplabel == 'bio' else toplabel)
def sequence_to_counts(genresequence):
    '''Tally the page-level genre predictions for one volume.

    The counts dictionary always contains the four keys 'fic', 'poe',
    'dra' and 'non', each starting at zero. Pages labelled "bio",
    "index", "back" or "trv" are counted as "non"; any other label is
    counted under its own name.

    Returns a 2-tuple: the dictionary of counts, and the label that ranks
    first by count.
    '''
    # Labels that are folded into nonfiction for this tally.
    nonfiction_aliases = ("bio", "index", "back", "trv")

    tally = dict.fromkeys(('fic', 'poe', 'dra', 'non'), 0)

    for label in genresequence:
        bucket = "non" if label in nonfiction_aliases else label
        utils.addtodict(bucket, 1, tally)

    # Rank the labels by count, largest first. Each ranked entry is read
    # as (count, label).
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    return tally, ranked[0][1]
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a sequence of votes, breaking ties if necessary.

    votes      -- sequence of labels, one per voter. It must not be empty;
                  an empty sequence is not handled here.
    tiebreaker -- label to prefer when the two top-ranked candidates have
                  received the same number of votes.

    Returns a 3-tuple (winner, dissent, runnerup):
      winner   -- the winning label.
      dissent  -- fraction of the votes that were not cast for the
                  top-ranked candidate.
      runnerup -- the other one of the two top-ranked labels, or the
                  winner itself when only one label received any votes.

    Only the two top-ranked candidates are considered when resolving a
    tie. If the tiebreaker matches neither of them, one of the two is
    chosen at random.
    '''
    electorate = len(votes)

    results = dict()
    for vote in votes:
        utils.addtodict(vote, 1, results)

    # Ranked entries are read as (count, label), largest count first.
    candidate = utils.sortkeysbyvalue(results, whethertoreverse=True)
    first = candidate[0][1]
    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return first, dissent, first

    second = candidate[1][1]

    if candidate[0][0] > candidate[1][0]:
        # The top candidate has strictly more votes than the next one.
        return first, dissent, second

    # We have a tie between the two top-ranked candidates.
    # str() keeps the log lines from raising TypeError when the tiebreaker
    # (or a label) is not a string, e.g. None.
    if tiebreaker == first:
        print("Tiebreaker " + str(tiebreaker))
        return first, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + str(tiebreaker))
        return second, dissent, first

    print("Tie in spite of " + str(tiebreaker))
    win = random.choice([first, second])
    runnerup = second if win == first else first
    return win, dissent, runnerup
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a sequence of votes, breaking ties if necessary.

    votes      -- sequence of labels, one per voter. It must not be empty;
                  an empty sequence is not handled here.
    tiebreaker -- label to prefer when the two top-ranked candidates have
                  received the same number of votes.

    Returns a 3-tuple (winner, dissent, runnerup):
      winner   -- the winning label.
      dissent  -- fraction of the votes that were not cast for the
                  top-ranked candidate.
      runnerup -- the other one of the two top-ranked labels, or the
                  winner itself when only one label received any votes.

    Only the two top-ranked candidates are considered when resolving a
    tie. If the tiebreaker matches neither of them, one of the two is
    chosen at random.
    '''
    electorate = len(votes)

    results = dict()
    for vote in votes:
        utils.addtodict(vote, 1, results)

    # Ranked entries are read as (count, label), largest count first.
    candidate = utils.sortkeysbyvalue(results, whethertoreverse=True)
    first = candidate[0][1]
    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return first, dissent, first

    second = candidate[1][1]

    if candidate[0][0] > candidate[1][0]:
        # The top candidate has strictly more votes than the next one.
        return first, dissent, second

    # We have a tie between the two top-ranked candidates.
    # str() keeps the log lines from raising TypeError when the tiebreaker
    # (or a label) is not a string, e.g. None.
    if tiebreaker == first:
        print("Tiebreaker " + str(tiebreaker))
        return first, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + str(tiebreaker))
        return second, dissent, first

    print("Tie in spite of " + str(tiebreaker))
    win = random.choice([first, second])
    runnerup = second if win == first else first
    return win, dissent, runnerup
def maxkey(dictionary):
    '''Return the key that ranks first when the dictionary's keys are
    sorted by value in reverse (largest-first) order.

    The dictionary must not be empty.
    '''
    # Ranked entries are read as (value, key).
    ranked = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
    topentry = ranked[0]
    return topentry[1]
def censor(htid, genresequence):
    '''Compare page-level genre predictions with volume metadata, and
    overrule "fic" pages when the metadata says the volume cannot be fiction.

    htid          -- volume identifier; it is converted with
                     utils.pairtreelabel before being looked up.
    genresequence -- list of page-level genre labels. NOTE: this list is
                     modified in place when fiction is ruled out.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    Returns a 2-tuple (genresequence, reported). reported maps each of
    "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
    "modelagrees" and "modeldisagrees" to 0 or 1.

    Fiction is ruled out when the metadata lists "Biography" or
    "Autobiography", or lists "Catalog", "Dictionary" or "Bibliographies"
    while nonfiction is also the largest predicted genre. In that case
    every "fic" page is relabelled "non".
    '''
    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions,
    # initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation",
                "strongdenial", "modelagrees", "modeldisagrees"]
    reported = {symptom: 0 for symptom in symptoms}

    couldbefiction = True

    # Assess the largest genre in this volume. For this purpose biography,
    # indexes and back matter are treated as equivalent to nonfiction.
    genrecounts = dict.fromkeys(('fic', 'poe', 'dra', 'non'), 0)
    for page in genresequence:
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        # Nothing is known about this volume; leave the pages untouched.
        return genresequence, reported

    # Metadata strings that are specific enough to count as strong
    # evidence, mapped to the genre each one implies.
    stronggenres = {
        "Fiction": "fic", "Novel": "fic",
        "Biography": "non", "Autobiography": "non",
        "Poetry": "poe", "Poems": "poe",
        "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
        "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
    }
    referenceworks = ("Catalog", "Dictionary", "Bibliographies")

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        # It's a semicolon-delimited list of items.
        for info in genrestring.split(";"):

            if info in ("Biography", "Autobiography"):
                couldbefiction = False

            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1

            if info == "Not fiction":
                # Only a fiction majority counts as a denial here; other
                # non-matching genres leave both weak flags alone.
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1

            expected = stronggenres.get(info)
            if expected is not None:
                if maxgenre == expected:
                    reported["strongconfirmation"] = 1
                    if info in referenceworks:
                        couldbefiction = False
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    # The model flags start at 0 and stay there when the volume is not
    # covered by the volume-level model.
    if htid in modelindices:
        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if genre not in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        # Take the top prediction. Only that entry is needed, so no second
        # entry is looked up (doing so failed when fewer than two genre
        # columns were present).
        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
            reported["modeldisagrees"] = 0
        else:
            reported["modeldisagrees"] = 1
            reported["modelagrees"] = 0

    if not couldbefiction:
        # Overrule fiction predictions in place.
        for i, label in enumerate(genresequence):
            if label == "fic":
                genresequence[i] = "non"

    return genresequence, reported
def maxkey(dictionary):
    '''Return the key that ranks first when the dictionary's keys are
    sorted by value in reverse (largest-first) order.

    The dictionary must not be empty.
    '''
    # Ranked entries are read as (value, key).
    ranked = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
    topentry = ranked[0]
    return topentry[1]
def metadata_check(htid, inputsequence):
    '''Assesses whether previous metadata tend to deny or confirm the thrust
    of page-level genre predictions. For this purpose we use both genre
    codes extracted from the MARC record and the predictions of a volume-
    level probabilistic model.

    htid          -- volume identifier; it is converted with
                     utils.pairtreelabel before being looked up.
    inputsequence -- sequence of page-level genre labels. It is copied,
                     never modified.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    Returns a single dictionary of "confirmations" that maps each of six
    symptoms to 0 or 1, indicating whether metadata aligns with the
    page-level predictions in that specific way.
    '''
    genresequence = list(inputsequence)
    # make a defensive copy of incoming parameter

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions,
    # initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation",
                "strongdenial", "modelagrees", "modeldisagrees"]
    # The first four of these symptoms reflect metadata extracted from the
    # MARC record. Weakconfirmation and weakdenial are based on flags
    # extracted from controlfield 008 which I find are not very reliable as
    # guides. Strongconfirmation and strongdenial are based on strings
    # extracted from other fields that are more specific and reliable as
    # indications of genre. Modelagrees and modeldisagrees reflect the
    # alignment of page-level predictions with an earlier volume-level
    # model of the corpus.
    confirmations = {symptom: 0 for symptom in symptoms}

    # Only the most commonly predicted genre is needed; the counts
    # themselves are not used.
    _, maxgenre = sequence_to_counts(genresequence)

    if htid not in rowindices and htid not in modelindices:
        # Nothing is known about this volume.
        return confirmations

    # Metadata strings that count as strong evidence, mapped to the genre
    # each one implies.
    stronggenres = {
        "Fiction": "fic", "Novel": "fic",
        "Biography": "non", "Autobiography": "non",
        "Poetry": "poe", "Poems": "poe",
        "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
        "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
    }

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        # It's a semicolon-delimited list of items.
        for info in genrestring.split(";"):

            if info == "Not fiction":
                # Only a fiction majority counts as a denial here; other
                # non-matching genres leave both weak flags alone.
                if maxgenre == "non":
                    confirmations["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    confirmations["weakdenial"] = 1

            expected = stronggenres.get(info)
            if expected is not None:
                if maxgenre == expected:
                    confirmations["strongconfirmation"] = 1
                else:
                    confirmations["strongdenial"] = 1
    else:
        print("Skipped.")

    # The model flags start at 0 and stay there when the volume is not
    # covered by the volume-level model.
    if htid in modelindices:
        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if genre not in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        # Take the top prediction. Only that entry is needed, so no second
        # entry is looked up (doing so failed when fewer than two genre
        # columns were present).
        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            confirmations["modelagrees"] = 1
            confirmations["modeldisagrees"] = 0
        else:
            confirmations["modeldisagrees"] = 1
            confirmations["modelagrees"] = 0

    return confirmations
def censor(htid, genresequence):
    '''Compare page-level genre predictions with volume metadata, and
    overrule "fic" pages when the metadata says the volume cannot be fiction.

    htid          -- volume identifier; it is converted with
                     utils.pairtreelabel before being looked up.
    genresequence -- list of page-level genre labels. NOTE: this list is
                     modified in place when fiction is ruled out.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    Returns a 2-tuple (genresequence, reported). reported maps each of
    "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
    "modelagrees" and "modeldisagrees" to 0 or 1.

    Fiction is ruled out when the metadata lists "Biography" or
    "Autobiography", or lists "Catalog", "Dictionary" or "Bibliographies"
    while nonfiction is also the largest predicted genre. In that case
    every "fic" page is relabelled "non".
    '''
    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions,
    # initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation",
                "strongdenial", "modelagrees", "modeldisagrees"]
    reported = {symptom: 0 for symptom in symptoms}

    couldbefiction = True

    # Assess the largest genre in this volume. For this purpose biography,
    # indexes and back matter are treated as equivalent to nonfiction.
    genrecounts = dict.fromkeys(('fic', 'poe', 'dra', 'non'), 0)
    for page in genresequence:
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        # Nothing is known about this volume; leave the pages untouched.
        return genresequence, reported

    # Metadata strings that are specific enough to count as strong
    # evidence, mapped to the genre each one implies.
    stronggenres = {
        "Fiction": "fic", "Novel": "fic",
        "Biography": "non", "Autobiography": "non",
        "Poetry": "poe", "Poems": "poe",
        "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
        "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
    }
    referenceworks = ("Catalog", "Dictionary", "Bibliographies")

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        # It's a semicolon-delimited list of items.
        for info in genrestring.split(";"):

            if info in ("Biography", "Autobiography"):
                couldbefiction = False

            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1

            if info == "Not fiction":
                # Only a fiction majority counts as a denial here; other
                # non-matching genres leave both weak flags alone.
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1

            expected = stronggenres.get(info)
            if expected is not None:
                if maxgenre == expected:
                    reported["strongconfirmation"] = 1
                    if info in referenceworks:
                        couldbefiction = False
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    # The model flags start at 0 and stay there when the volume is not
    # covered by the volume-level model.
    if htid in modelindices:
        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if genre not in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        # Take the top prediction. Only that entry is needed, so no second
        # entry is looked up (doing so failed when fewer than two genre
        # columns were present).
        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
            reported["modeldisagrees"] = 0
        else:
            reported["modeldisagrees"] = 1
            reported["modelagrees"] = 0

    if not couldbefiction:
        # Overrule fiction predictions in place.
        for i, label in enumerate(genresequence):
            if label == "fic":
                genresequence[i] = "non"

    return genresequence, reported
def metadata_check(htid, inputsequence):
    '''Assesses whether previous metadata tend to deny or confirm the thrust
    of page-level genre predictions. For this purpose we use both genre
    codes extracted from the MARC record and the predictions of a volume-
    level probabilistic model.

    htid          -- volume identifier; it is converted with
                     utils.pairtreelabel before being looked up.
    inputsequence -- sequence of page-level genre labels. It is copied,
                     never modified.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    Returns a single dictionary of "confirmations" that maps each of six
    symptoms to 0 or 1, indicating whether metadata aligns with the
    page-level predictions in that specific way.
    '''
    genresequence = list(inputsequence)
    # make a defensive copy of incoming parameter

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions,
    # initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation",
                "strongdenial", "modelagrees", "modeldisagrees"]
    # The first four of these symptoms reflect metadata extracted from the
    # MARC record. Weakconfirmation and weakdenial are based on flags
    # extracted from controlfield 008 which I find are not very reliable as
    # guides. Strongconfirmation and strongdenial are based on strings
    # extracted from other fields that are more specific and reliable as
    # indications of genre. Modelagrees and modeldisagrees reflect the
    # alignment of page-level predictions with an earlier volume-level
    # model of the corpus.
    confirmations = {symptom: 0 for symptom in symptoms}

    # Only the most commonly predicted genre is needed; the counts
    # themselves are not used.
    _, maxgenre = sequence_to_counts(genresequence)

    if htid not in rowindices and htid not in modelindices:
        # Nothing is known about this volume.
        return confirmations

    # Metadata strings that count as strong evidence, mapped to the genre
    # each one implies.
    stronggenres = {
        "Fiction": "fic", "Novel": "fic",
        "Biography": "non", "Autobiography": "non",
        "Poetry": "poe", "Poems": "poe",
        "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
        "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
    }

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        # It's a semicolon-delimited list of items.
        for info in genrestring.split(";"):

            if info == "Not fiction":
                # Only a fiction majority counts as a denial here; other
                # non-matching genres leave both weak flags alone.
                if maxgenre == "non":
                    confirmations["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    confirmations["weakdenial"] = 1

            expected = stronggenres.get(info)
            if expected is not None:
                if maxgenre == expected:
                    confirmations["strongconfirmation"] = 1
                else:
                    confirmations["strongdenial"] = 1
    else:
        print("Skipped.")

    # The model flags start at 0 and stay there when the volume is not
    # covered by the volume-level model.
    if htid in modelindices:
        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if genre not in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        # Take the top prediction. Only that entry is needed, so no second
        # entry is looked up (doing so failed when fewer than two genre
        # columns were present).
        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            confirmations["modelagrees"] = 1
            confirmations["modeldisagrees"] = 0
        else:
            confirmations["modeldisagrees"] = 1
            confirmations["modelagrees"] = 0

    return confirmations
continue thesewords = round(float(genrecolumn[row]) * 50000) if genre == "bio": modelwords["nonfiction"] += thesewords else: key = translations[genre] modelwords[key] += thesewords modelwords["paratext"] = 1000 modelpredictions = dict() for genre, genrecolumn in modeldata.items(): if not genre in options: # this column is not a genre! continue modelpredictions[genre] = float(genrecolumn[row]) predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse = True) modelprediction = predictionlist[0][1] # Take the top prediction. # For purposes of this routine, treat biography as nonfiction: if modelprediction == "bio": modelprediction = "non" volgenrekey = translations[modelprediction] date = metadata["date"][row] try: integerdate = int(date) except: integerdate = 0 if integerdate > 1699 and integerdate < 1900: