예제 #1
0
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions and name the dominant genre.

    Each page adds one vote to its own genre. A "bio" page also adds one
    vote to "non", so every vote for biography is a vote for nonfiction
    as well.

    Returns a tuple (genrecounts, maxgenre): the dictionary of vote
    counts and the top-ranked genre. "bio" is never reported as the top
    genre; "non" is returned in its place.
    '''
    tally = {}

    for label in genresequence:
        utils.addtodict(label, 1, tally)
        if label == 'bio':
            # Biography votes are mirrored onto nonfiction.
            utils.addtodict('non', 1, tally)

    # Rank genres by descending count; the genre name is the second
    # element of each ranked tuple.
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)
    leader = ranked[0][1]

    return tally, ('non' if leader == 'bio' else leader)
예제 #2
0
def select_common_features(trainingset, n):
    '''Return the n most frequent features across the training set.

    The rawcounts of every volume are summed into one dictionary, the
    entries are ranked by descending total, and the feature names of the
    first n entries are returned as a list. If fewer than n features
    exist, n is lowered to the number available and a message is printed.
    '''
    totals = {}
    for volume in trainingset:
        # add_dicts accumulates this volume's raw counts into totals.
        utils.add_dicts(volume.rawcounts, totals)

    # Ranked list of 2-tuples (frequency, word), most frequent first.
    ranked = utils.sortkeysbyvalue(totals, whethertoreverse=True)

    available = len(ranked)
    if n > available:
        n = available
        print("We only have " + str(n) + " features.")

    # Keep only the word (second element) from each of the top n pairs.
    return [pair[1] for pair in ranked[0:n]]
예제 #3
0
def select_common_features(trainingset, n):
    '''Pick the n most common features found in the training set.

    Raw counts from all volumes are merged into a single dictionary,
    ranked from most to least frequent, and the names of the top n
    features are returned as a list. When n exceeds the number of
    distinct features, n is reduced to that number and a notice is
    printed.
    '''
    combined = {}
    for volume in trainingset:
        # Fold this volume's raw counts into the running totals.
        utils.add_dicts(volume.rawcounts, combined)

    # A list of 2-tuple (frequency, word) pairs, highest frequency first.
    byfrequency = utils.sortkeysbyvalue(combined, whethertoreverse=True)

    howmany = len(byfrequency)
    if n > howmany:
        n = howmany
        print("We only have " + str(n) + " features.")

    # The word is the second element of each pair.
    return [entry[1] for entry in byfrequency[0:n]]
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions and name the dominant genre.

    Pages labelled "bio", "index", "back" or "trv" are counted as "non".
    The counts always start with zero-valued entries for "fic", "poe",
    "dra" and "non".

    Returns a tuple (genrecounts, maxgenre): the dictionary of page
    counts and the genre ranked first when counts are sorted descending.
    '''
    # Labels that are treated as equivalent to nonfiction here.
    folded_into_non = ("bio", "index", "back", "trv")

    tally = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for label in genresequence:
        bucket = "non" if label in folded_into_non else label
        utils.addtodict(bucket, 1, tally)

    # Rank genres by descending count; the genre name is the second
    # element of each ranked tuple.
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    return tally, ranked[0][1]
예제 #5
0
def sequence_to_counts(genresequence):
    '''Count the pages assigned to each genre and report the largest.

    Every page contributes one vote to its own genre; a "bio" page
    additionally contributes one vote to "non", so biography votes always
    count toward nonfiction too.

    Returns a tuple (genrecounts, maxgenre). maxgenre is the top-ranked
    genre, except that "bio" is never returned: "non" is reported
    instead.
    '''
    votes = {}

    for pagegenre in genresequence:
        utils.addtodict(pagegenre, 1, votes)
        if pagegenre == 'bio':
            # Mirror each biography vote onto nonfiction.
            utils.addtodict('non', 1, votes)

    # Sort descending by count and read the genre name from the first
    # ranked tuple.
    ordered = utils.sortkeysbyvalue(votes, whethertoreverse=True)
    biggest = ordered[0][1]

    return votes, ('non' if biggest == 'bio' else biggest)
def sequence_to_counts(genresequence):
    '''Count the pages assigned to each genre and report the largest.

    "bio", "index", "back" and "trv" pages are all counted under "non".
    The returned counts always contain "fic", "poe", "dra" and "non",
    each starting at zero.

    Returns a tuple (genrecounts, maxgenre): the count dictionary and
    the genre that ranks first when counts are sorted descending.
    '''
    # For this purpose these labels are equivalent to nonfiction.
    nonfiction_aliases = ("bio", "index", "back", "trv")

    votes = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for pagegenre in genresequence:
        countedas = "non" if pagegenre in nonfiction_aliases else pagegenre
        utils.addtodict(countedas, 1, votes)

    # Sort descending by count; the genre name is the second element of
    # the first ranked tuple.
    ordered = utils.sortkeysbyvalue(votes, whethertoreverse=True)

    return votes, ordered[0][1]
예제 #7
0
def resolve_voting(votes, tiebreaker):
    '''Pick a winner and runner-up from a collection of votes.

    Votes are tallied as given ("bio" is not folded into "non") and
    ranked by descending count.

    Returns a tuple (winner, dissent, runnerup). dissent is the fraction
    of votes that did not go to the first-ranked candidate. With a single
    candidate, that candidate is returned as both winner and runner-up.
    When the top two candidates have equal counts, the tiebreaker wins
    if it matches either of them; otherwise the winner is drawn at
    random from the two. A message is printed whenever a tie occurs.
    '''
    total = len(votes)

    tally = {}
    for ballot in votes:
        utils.addtodict(ballot, 1, tally)
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    topcount, leader = ranked[0][0], ranked[0][1]
    dissent = (total - topcount) / total

    # Only one candidate received votes.
    if len(ranked) < 2:
        return leader, dissent, leader

    secondcount, challenger = ranked[1][0], ranked[1][1]

    # The leader is strictly ahead of the second-ranked candidate.
    if topcount > secondcount:
        return leader, dissent, challenger

    # From here on the top two are tied.
    if tiebreaker == leader:
        print("Tiebreaker " + tiebreaker)
        return leader, dissent, challenger

    if tiebreaker == challenger:
        print("Tiebreaker " + tiebreaker)
        return challenger, dissent, leader

    # The tiebreaker is neither of the tied candidates: choose randomly.
    print("Tie in spite of " + tiebreaker)
    win = random.choice([leader, challenger])
    runnerup = challenger if win == leader else leader

    return win, dissent, runnerup
def resolve_voting(votes, tiebreaker):
    '''Resolve a vote into a winner, a dissent ratio and a runner-up.

    Each vote is counted under its own label (no "bio" to "non"
    folding), and candidates are ranked by descending vote count.

    Returns a tuple (winner, dissent, runnerup), where dissent is the
    share of votes not cast for the first-ranked candidate. If only one
    candidate exists it is also returned as the runner-up. If the top two
    candidates are tied, the tiebreaker decides when it equals one of
    them; otherwise one of the two is chosen at random. Ties print a
    short message.
    '''
    electorate = len(votes)

    counts = {}
    for ballot in votes:
        utils.addtodict(ballot, 1, counts)
    standings = utils.sortkeysbyvalue(counts, whethertoreverse=True)

    firstcount, first = standings[0][0], standings[0][1]
    dissent = (electorate - firstcount) / electorate

    if len(standings) < 2:
        # There is only one candidate.
        return first, dissent, first

    secondcount, second = standings[1][0], standings[1][1]

    if firstcount > secondcount:
        # The first-ranked candidate is strictly ahead.
        return first, dissent, second

    # We have a tie between the top two candidates.
    if tiebreaker == first:
        print("Tiebreaker " + tiebreaker)
        return first, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + tiebreaker)
        return second, dissent, first

    print("Tie in spite of " + tiebreaker)
    win = random.choice([first, second])
    runnerup = second if win == first else first

    return win, dissent, runnerup
예제 #9
0
def maxkey(dictionary):
    '''Return the key ranked first when the dictionary is sorted by
    descending value. "bio" is returned as-is, not mapped to "non".
    '''
    ranked = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
    top = ranked[0]
    # The key is the second element of each ranked tuple.
    return top[1]
예제 #10
0
def censor(htid, genresequence):
    '''Compare page-level genre predictions with volume-level metadata and
    relabel fiction pages in volumes the metadata marks as non-fiction.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    Args:
        htid: volume identifier; converted with utils.pairtreelabel
            before being looked up in the metadata.
        genresequence: mutable sequence of page-level genre labels.

    Returns:
        A tuple (genresequence, reported). reported maps six symptom
        names ("weakconfirmation", "weakdenial", "strongconfirmation",
        "strongdenial", "modelagrees", "modeldisagrees") to 0 or 1.

    NOTE: genresequence is modified IN PLACE and the same object is
    returned. Every "fic" page becomes "non" when the metadata lists
    "Biography" or "Autobiography", or lists "Catalog", "Dictionary" or
    "Bibliographies" while the largest predicted genre is "non".
    '''
    # Convert the htid into a dirty pairtree label for metadata matching.
    htid = utils.pairtreelabel(htid)

    # One entry per possible condition, initially negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    reported = {symptom: 0 for symptom in symptoms}

    couldbefiction = True

    # Assess the largest genre in this volume. For this purpose biography,
    # indexes and back matter are treated as equivalent to nonfiction.
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}
    for page in genresequence:
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    # Sort the counts descending and take the genre of the top entry.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        return genresequence, reported

    if htid in rowindices:
        # Metadata genre strings that count as strong evidence, mapped to
        # the page-level genre each one implies.
        strong_expectations = {
            "Fiction": "fic", "Novel": "fic",
            "Biography": "non", "Autobiography": "non",
            "Poetry": "poe", "Poems": "poe",
            "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
            "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
        }
        reference_works = ("Catalog", "Dictionary", "Bibliographies")

        # The genre field is a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            # Biography always rules out fiction; reference works only
            # do so when the page-level predictions agree.
            if info in ("Biography", "Autobiography"):
                couldbefiction = False
            elif info in reference_works and maxgenre == "non":
                couldbefiction = False

            # Weak evidence.
            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1
            elif info == "Not fiction":
                # Only a "fic" majority counts as a denial here.
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1

            # Strong evidence.
            expected = strong_expectations.get(info)
            if expected is not None:
                if maxgenre == expected:
                    reported["strongconfirmation"] = 1
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        # Collect the volume-level model's score for each genre column;
        # columns whose name is not in options are not genres.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Take the top prediction. Only the first entry is read, so a
        # model with a single genre column no longer raises IndexError.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction.
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
        else:
            reported["modeldisagrees"] = 1
    # Without model data both model symptoms keep their initial 0.

    if not couldbefiction:
        for i, page in enumerate(genresequence):
            if page == "fic":
                genresequence[i] = "non"

    return genresequence, reported
def maxkey(dictionary):
    '''Return the key that ranks first when the dictionary's entries are
    sorted by descending value. "bio" is not converted to "non".
    '''
    ordered = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
    best = ordered[0]
    # Each ranked tuple carries the key as its second element.
    return best[1]
def metadata_check(htid, inputsequence):
    '''Assess whether previous metadata tend to deny or confirm the
    thrust of page-level genre predictions.

    Two sources are used: genre strings taken from the volume's metadata
    record, and the predictions of a volume-level probabilistic model.
    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options; none of them is modified.

    Args:
        htid: volume identifier; converted with utils.pairtreelabel
            before being looked up.
        inputsequence: iterable of page-level genre labels. It is copied
            and never modified.

    Returns:
        A dictionary of "confirmations" mapping six symptom names
        ("weakconfirmation", "weakdenial", "strongconfirmation",
        "strongdenial", "modelagrees", "modeldisagrees") to 0 or 1.
    '''
    # Make a defensive copy of the incoming parameter.
    genresequence = list(inputsequence)

    # Convert the htid into a dirty pairtree label for metadata matching.
    htid = utils.pairtreelabel(htid)

    # The first four symptoms reflect metadata extracted from the MARC
    # record. Weakconfirmation and weakdenial are based on flags from
    # controlfield 008, which are not very reliable as guides.
    # Strongconfirmation and strongdenial are based on strings from other
    # fields that are more specific and reliable as indications of genre.
    # Modelagrees and modeldisagrees reflect the alignment of page-level
    # predictions with an earlier volume-level model of the corpus.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    confirmations = {symptom: 0 for symptom in symptoms}

    # Only the most commonly predicted genre is needed here.
    _, maxgenre = sequence_to_counts(genresequence)

    if htid not in rowindices and htid not in modelindices:
        return confirmations

    if htid in rowindices:
        # Metadata genre strings that count as strong evidence, mapped to
        # the page-level genre each one implies.
        strong_expectations = {
            "Fiction": "fic", "Novel": "fic",
            "Biography": "non", "Autobiography": "non",
            "Poetry": "poe", "Poems": "poe",
            "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
            "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
        }

        # The genre field is a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            # Weak evidence: only a "fic" majority counts as a denial.
            # ("biog?" is deliberately not consulted.)
            if info == "Not fiction":
                if maxgenre == "non":
                    confirmations["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    confirmations["weakdenial"] = 1

            # Strong evidence.
            expected = strong_expectations.get(info)
            if expected is not None:
                if maxgenre == expected:
                    confirmations["strongconfirmation"] = 1
                else:
                    confirmations["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        # Collect the volume-level model's score for each genre column;
        # columns whose name is not in options are not genres.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Take the top prediction. Only the first entry is read, so a
        # model with a single genre column no longer raises IndexError.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction.
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            confirmations["modelagrees"] = 1
        else:
            confirmations["modeldisagrees"] = 1
    # Without model data both model symptoms keep their initial 0.

    return confirmations
def censor(htid, genresequence):
    '''Check page-level genre predictions against volume-level metadata
    and relabel fiction pages in volumes the metadata marks as
    non-fiction.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    Args:
        htid: volume identifier; converted with utils.pairtreelabel
            before being looked up in the metadata.
        genresequence: mutable sequence of page-level genre labels.

    Returns:
        A tuple (genresequence, reported). reported maps six symptom
        names ("weakconfirmation", "weakdenial", "strongconfirmation",
        "strongdenial", "modelagrees", "modeldisagrees") to 0 or 1.

    NOTE: genresequence is modified IN PLACE and the same object is
    returned. Every "fic" page becomes "non" when the metadata lists
    "Biography" or "Autobiography", or lists "Catalog", "Dictionary" or
    "Bibliographies" while the largest predicted genre is "non".
    '''
    # Convert the htid into a dirty pairtree label for metadata matching.
    htid = utils.pairtreelabel(htid)

    # One entry per possible condition, initially negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    reported = {symptom: 0 for symptom in symptoms}

    couldbefiction = True

    # Assess the largest genre in this volume. For this purpose biography,
    # indexes and back matter are treated as equivalent to nonfiction.
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}
    for page in genresequence:
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    # Sort the counts descending and take the genre of the top entry.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        return genresequence, reported

    if htid in rowindices:
        # Metadata genre strings that count as strong evidence, mapped to
        # the page-level genre each one implies.
        strong_expectations = {
            "Fiction": "fic", "Novel": "fic",
            "Biography": "non", "Autobiography": "non",
            "Poetry": "poe", "Poems": "poe",
            "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
            "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
        }
        reference_works = ("Catalog", "Dictionary", "Bibliographies")

        # The genre field is a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            # Biography always rules out fiction; reference works only
            # do so when the page-level predictions agree.
            if info in ("Biography", "Autobiography"):
                couldbefiction = False
            elif info in reference_works and maxgenre == "non":
                couldbefiction = False

            # Weak evidence.
            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1
            elif info == "Not fiction":
                # Only a "fic" majority counts as a denial here.
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1

            # Strong evidence.
            expected = strong_expectations.get(info)
            if expected is not None:
                if maxgenre == expected:
                    reported["strongconfirmation"] = 1
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        # Collect the volume-level model's score for each genre column;
        # columns whose name is not in options are not genres.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Take the top prediction. Only the first entry is read, so a
        # model with a single genre column no longer raises IndexError.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction.
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
        else:
            reported["modeldisagrees"] = 1
    # Without model data both model symptoms keep their initial 0.

    if not couldbefiction:
        for i, page in enumerate(genresequence):
            if page == "fic":
                genresequence[i] = "non"

    return genresequence, reported
예제 #14
0
def metadata_check(htid, inputsequence):
    '''Assess whether previous metadata tend to deny or confirm the
    thrust of page-level genre predictions.

    Two sources are used: genre strings taken from the volume's metadata
    record, and the predictions of a volume-level probabilistic model.
    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options; none of them is modified.

    Args:
        htid: volume identifier; converted with utils.pairtreelabel
            before being looked up.
        inputsequence: iterable of page-level genre labels. It is copied
            and never modified.

    Returns:
        A dictionary of "confirmations" mapping six symptom names
        ("weakconfirmation", "weakdenial", "strongconfirmation",
        "strongdenial", "modelagrees", "modeldisagrees") to 0 or 1.
    '''
    # Make a defensive copy of the incoming parameter.
    genresequence = list(inputsequence)

    # Convert the htid into a dirty pairtree label for metadata matching.
    htid = utils.pairtreelabel(htid)

    # The first four symptoms reflect metadata extracted from the MARC
    # record. Weakconfirmation and weakdenial are based on flags from
    # controlfield 008, which are not very reliable as guides.
    # Strongconfirmation and strongdenial are based on strings from other
    # fields that are more specific and reliable as indications of genre.
    # Modelagrees and modeldisagrees reflect the alignment of page-level
    # predictions with an earlier volume-level model of the corpus.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    confirmations = {symptom: 0 for symptom in symptoms}

    # Only the most commonly predicted genre is needed here.
    _, maxgenre = sequence_to_counts(genresequence)

    if htid not in rowindices and htid not in modelindices:
        return confirmations

    if htid in rowindices:
        # Metadata genre strings that count as strong evidence, mapped to
        # the page-level genre each one implies.
        strong_expectations = {
            "Fiction": "fic", "Novel": "fic",
            "Biography": "non", "Autobiography": "non",
            "Poetry": "poe", "Poems": "poe",
            "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
            "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
        }

        # The genre field is a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            # Weak evidence: only a "fic" majority counts as a denial.
            # ("biog?" is deliberately not consulted.)
            if info == "Not fiction":
                if maxgenre == "non":
                    confirmations["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    confirmations["weakdenial"] = 1

            # Strong evidence.
            expected = strong_expectations.get(info)
            if expected is not None:
                if maxgenre == expected:
                    confirmations["strongconfirmation"] = 1
                else:
                    confirmations["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        # Collect the volume-level model's score for each genre column;
        # columns whose name is not in options are not genres.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Take the top prediction. Only the first entry is read, so a
        # model with a single genre column no longer raises IndexError.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction.
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            confirmations["modelagrees"] = 1
        else:
            confirmations["modeldisagrees"] = 1
    # Without model data both model symptoms keep their initial 0.

    return confirmations
예제 #15
0
			continue
		thesewords = round(float(genrecolumn[row]) * 50000)
		if genre == "bio":
			modelwords["nonfiction"] += thesewords
		else:
			key = translations[genre]
			modelwords[key] += thesewords
		modelwords["paratext"] = 1000

	modelpredictions = dict()
	for genre, genrecolumn in modeldata.items():
		if not genre in options:
			# this column is not a genre!
			continue
		modelpredictions[genre] = float(genrecolumn[row])
	predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse = True)
	modelprediction = predictionlist[0][1]
	# Take the top prediction.

	# For purposes of this routine, treat biography as nonfiction:
	if modelprediction == "bio":
		modelprediction = "non"
	volgenrekey = translations[modelprediction]

	date = metadata["date"][row]
	try:
		integerdate = int(date)
	except:
		integerdate = 0

	if integerdate > 1699 and integerdate < 1900: