Ejemplo n.º 1
0
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions and name the dominant genre.

    Each label in genresequence adds one to its own count. A "bio" page
    additionally adds one to "non", so every biography vote is also a
    nonfiction vote.

    Returns a (counts, largest) tuple. The largest genre is read from the
    first entry of utils.sortkeysbyvalue(..., whethertoreverse=True); if
    that entry is "bio" it is reported as "non", so "bio" is never returned
    as the largest genre.
    '''

    tallies = {}

    for label in genresequence:
        utils.addtodict(label, 1, tallies)
        if label == 'bio':
            # Biography pages vote for nonfiction as well.
            utils.addtodict('non', 1, tallies)

    # Element [1] of the leading entry is used as the genre name.
    # NOTE(review): presumably entries are (count, genre) pairs in
    # descending order -- confirm against utils.sortkeysbyvalue.
    ranked = utils.sortkeysbyvalue(tallies, whethertoreverse=True)
    leader = ranked[0][1]

    return tallies, ('non' if leader == 'bio' else leader)
Ejemplo n.º 2
0
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions and name the dominant genre.

    Each label in genresequence adds one to its own count. A "bio" page
    additionally adds one to "non", so every biography vote is also a
    nonfiction vote.

    Returns a (counts, largest) tuple. The largest genre is read from the
    first entry of utils.sortkeysbyvalue(..., whethertoreverse=True); if
    that entry is "bio" it is reported as "non", so "bio" is never returned
    as the largest genre.
    '''

    tallies = {}

    for label in genresequence:
        utils.addtodict(label, 1, tallies)
        if label == 'bio':
            # Biography pages vote for nonfiction as well.
            utils.addtodict('non', 1, tallies)

    # Element [1] of the leading entry is used as the genre name.
    # NOTE(review): presumably entries are (count, genre) pairs in
    # descending order -- confirm against utils.sortkeysbyvalue.
    ranked = utils.sortkeysbyvalue(tallies, whethertoreverse=True)
    leader = ranked[0][1]

    return tallies, ('non' if leader == 'bio' else leader)
Ejemplo n.º 3
0
def comparelists(firstmap, secondmap, genremistakes, correctbygenre,
                 wordcounts):
    '''Compare two page-level genre maps, weighting each page by word count.

    firstmap, secondmap -- indexable sequences of page genre labels; each
        label is passed through translate() before comparison.
    genremistakes -- dict updated in place; for each disagreeing page the
        page's word count is added under the key
        (generalizedsecond, generalizedfirst).
    correctbygenre -- dict updated in place; for each agreeing page the
        page's word count is added under generalizedsecond.
    wordcounts -- per-page weights, indexed in parallel with the maps.

    Returns the summed word count (a float) of the pages on which the two
    maps are not effectively_equal().

    Only the pages present in both maps are compared. A secondmap longer
    than firstmap is reported with a printed warning, and the comparison
    then proceeds over the common prefix instead of failing on an
    unassigned length.
    '''
    if len(firstmap) < len(secondmap):
        print(
            "Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer."
        )

    # Always defined, whichever map is shorter.
    length = min(len(firstmap), len(secondmap))

    divergence = 0.0

    for i in range(length):
        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst),
                            wordcounts[i], genremistakes)

    return divergence
def sequence_to_counts(genresequence):
    '''Count pages per genre and report the largest genre.

    The counts for "fic", "poe", "dra" and "non" start at zero, so those
    four keys are always present in the result. Pages labelled "bio",
    "index", "back" or "trv" are counted as "non"; any other label is
    counted under its own name.

    Returns a (counts, largest) tuple, where largest is read from the first
    entry of utils.sortkeysbyvalue(..., whethertoreverse=True).
    '''

    tallies = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    # For this purpose these labels are all treated as nonfiction.
    nonfictionlike = ("bio", "index", "back", "trv")

    for label in genresequence:
        bucket = "non" if label in nonfictionlike else label
        utils.addtodict(bucket, 1, tallies)

    # Element [1] of the leading entry is used as the genre name.
    ranked = utils.sortkeysbyvalue(tallies, whethertoreverse=True)

    return tallies, ranked[0][1]
def sequence_to_counts(genresequence):
    '''Count pages per genre and report the largest genre.

    The counts for "fic", "poe", "dra" and "non" start at zero, so those
    four keys are always present in the result. Pages labelled "bio",
    "index", "back" or "trv" are counted as "non"; any other label is
    counted under its own name.

    Returns a (counts, largest) tuple, where largest is read from the first
    entry of utils.sortkeysbyvalue(..., whethertoreverse=True).
    '''

    tallies = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    # For this purpose these labels are all treated as nonfiction.
    nonfictionlike = ("bio", "index", "back", "trv")

    for label in genresequence:
        bucket = "non" if label in nonfictionlike else label
        utils.addtodict(bucket, 1, tallies)

    # Element [1] of the leading entry is used as the genre name.
    ranked = utils.sortkeysbyvalue(tallies, whethertoreverse=True)

    return tallies, ranked[0][1]
Ejemplo n.º 6
0
def sequence_to_counts(genresequence):
    '''Tally how many pages were assigned to each genre.

    Every label in genresequence is counted under its own name via
    utils.addtodict; no genre is pre-seeded with zero and no labels are
    merged, so an empty sequence yields an empty dictionary. (The original
    note contrasts this with the MetadataCascades version, which seeds
    some genres with zero.)

    Returns the dictionary of counts.
    '''

    tallies = {}

    for label in genresequence:
        utils.addtodict(label, 1, tallies)

    return tallies
Ejemplo n.º 7
0
def sequence_to_counts(genresequence):
    '''Tally how many pages were assigned to each genre.

    Every label in genresequence is counted under its own name via
    utils.addtodict; no genre is pre-seeded with zero and no labels are
    merged, so an empty sequence yields an empty dictionary. (The original
    note contrasts this with the MetadataCascades version, which seeds
    some genres with zero.)

    Returns the dictionary of counts.
    '''

    tallies = {}

    for label in genresequence:
        utils.addtodict(label, 1, tallies)

    return tallies
Ejemplo n.º 8
0
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    '''Score predicted page genres against the true ones.

    truelist, predicted -- parallel sequences of genre labels; they must
        have the same length (checked with an assert).
    wordsperpage -- per-page word counts, only read when
        whethertocountwords is true.
    whethertocountwords -- if true each page weighs wordsperpage[index],
        otherwise each page weighs 1.

    True labels found in the module-level genretranslations mapping are
    replaced by their translation before comparison; predicted labels are
    used as given. Equality is decided by genresareequal().

    Returns (totaltruegenre, correctbygenre, errorsbygenre, accurate,
    inaccurate): weight per true genre, weight of correct pages per genre,
    weight of errors keyed by (truegenre, predictedgenre), and the two
    overall totals.
    '''
    assert len(truelist) == len(predicted)

    totaltruegenre, correctbygenre, errorsbygenre = {}, {}, {}
    accurate = inaccurate = 0

    for index, rawgenre in enumerate(truelist):
        truegenre = (genretranslations[rawgenre]
                     if rawgenre in genretranslations else rawgenre)
        increment = wordsperpage[index] if whethertocountwords else 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if not genresareequal(truegenre, predictedgenre):
            utils.addtodict((truegenre, predictedgenre), increment,
                            errorsbygenre)
            inaccurate += increment
            continue

        utils.addtodict(truegenre, increment, correctbygenre)
        accurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    '''Score predicted page genres against the true ones.

    truelist, predicted -- parallel sequences of genre labels; they must
        have the same length (checked with an assert).
    wordsperpage -- per-page word counts, only read when
        whethertocountwords is true.
    whethertocountwords -- if true each page weighs wordsperpage[index],
        otherwise each page weighs 1.

    True labels found in the module-level genretranslations mapping are
    replaced by their translation before comparison; predicted labels are
    used as given. Equality is decided by genresareequal().

    Returns (totaltruegenre, correctbygenre, errorsbygenre, accurate,
    inaccurate): weight per true genre, weight of correct pages per genre,
    weight of errors keyed by (truegenre, predictedgenre), and the two
    overall totals.
    '''
    assert len(truelist) == len(predicted)

    totaltruegenre, correctbygenre, errorsbygenre = {}, {}, {}
    accurate = inaccurate = 0

    for index, rawgenre in enumerate(truelist):
        truegenre = (genretranslations[rawgenre]
                     if rawgenre in genretranslations else rawgenre)
        increment = wordsperpage[index] if whethertocountwords else 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if not genresareequal(truegenre, predictedgenre):
            utils.addtodict((truegenre, predictedgenre), increment,
                            errorsbygenre)
            inaccurate += increment
            continue

        utils.addtodict(truegenre, increment, correctbygenre)
        accurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
Ejemplo n.º 10
0
def comparelists(firstmap, secondmap, genremistakes, correctbygenre, wordcounts):
    '''Compare two page-level genre maps, weighting each page by word count.

    firstmap, secondmap -- indexable sequences of page genre labels; each
        label is passed through translate() before comparison.
    genremistakes -- dict updated in place; for each disagreeing page the
        page's word count is added under the key
        (generalizedsecond, generalizedfirst).
    correctbygenre -- dict updated in place; for each agreeing page the
        page's word count is added under generalizedsecond.
    wordcounts -- per-page weights, indexed in parallel with the maps.

    Returns the summed word count (a float) of the pages on which the two
    maps are not effectively_equal().

    Only the pages present in both maps are compared. A secondmap longer
    than firstmap is reported with a printed warning, and the comparison
    then proceeds over the common prefix instead of failing on an
    unassigned length.
    '''
    if len(firstmap) < len(secondmap):
        print("Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer.")

    # Always defined, whichever map is shorter.
    length = min(len(firstmap), len(secondmap))

    divergence = 0.0

    for i in range(length):
        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst), wordcounts[i], genremistakes)

    return divergence
Ejemplo n.º 11
0
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a list of votes, using tiebreaker to settle ties.

    Votes are tallied as given (an earlier step that folded "bio" into
    "non" is disabled) and ranked with utils.sortkeysbyvalue(...,
    whethertoreverse=True).

    Returns (winner, dissent, runnerup). dissent is the fraction of votes
    that did not go to the top-ranked candidate. With a single candidate
    the runner-up is the winner itself. When the first two ranked
    candidates have equal counts, tiebreaker wins if it names either of
    them; otherwise one of the two is chosen with random.choice. Only the
    first two ranked candidates are considered in a tie.
    '''
    electorate = len(votes)

    tally = {}
    for ballot in votes:
        utils.addtodict(ballot, 1, tally)

    # NOTE(review): entries are indexed as [0] = count, [1] = candidate;
    # presumably (count, key) pairs in descending order -- confirm in utils.
    ranking = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    topcount = ranking[0][0]
    leader = ranking[0][1]
    dissent = (electorate - topcount) / electorate

    if len(ranking) < 2:
        # There is only one candidate.
        return leader, dissent, leader

    second = ranking[1][1]

    if topcount > ranking[1][0]:
        # The leader has strictly more votes than anyone else.
        return leader, dissent, second

    # From here on the top two are tied.
    if tiebreaker == leader:
        print("Tiebreaker " + tiebreaker)
        return leader, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + tiebreaker)
        return second, dissent, leader

    print("Tie in spite of " + tiebreaker)
    win = random.choice([leader, second])
    runnerup = second if win == leader else leader

    return win, dissent, runnerup
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a list of votes, using tiebreaker to settle ties.

    Votes are tallied as given (an earlier step that folded "bio" into
    "non" is disabled) and ranked with utils.sortkeysbyvalue(...,
    whethertoreverse=True).

    Returns (winner, dissent, runnerup). dissent is the fraction of votes
    that did not go to the top-ranked candidate. With a single candidate
    the runner-up is the winner itself. When the first two ranked
    candidates have equal counts, tiebreaker wins if it names either of
    them; otherwise one of the two is chosen with random.choice. Only the
    first two ranked candidates are considered in a tie.
    '''
    electorate = len(votes)

    tally = {}
    for ballot in votes:
        utils.addtodict(ballot, 1, tally)

    # NOTE(review): entries are indexed as [0] = count, [1] = candidate;
    # presumably (count, key) pairs in descending order -- confirm in utils.
    ranking = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    topcount = ranking[0][0]
    leader = ranking[0][1]
    dissent = (electorate - topcount) / electorate

    if len(ranking) < 2:
        # There is only one candidate.
        return leader, dissent, leader

    second = ranking[1][1]

    if topcount > ranking[1][0]:
        # The leader has strictly more votes than anyone else.
        return leader, dissent, second

    # From here on the top two are tied.
    if tiebreaker == leader:
        print("Tiebreaker " + tiebreaker)
        return leader, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + tiebreaker)
        return second, dissent, leader

    print("Tie in spite of " + tiebreaker)
    win = random.choice([leader, second])
    runnerup = second if win == leader else leader

    return win, dissent, runnerup
Ejemplo n.º 13
0
def censor(htid, genresequence):
    '''Cross-check page-level genres against volume-level evidence.

    The dominant page genre is found by counting the labels in
    genresequence (with "bio", "index" and "back" counted as "non") and
    taking the first entry returned by utils.sortkeysbyvalue. It is then
    compared with two module-level sources:

    * the semicolon-delimited tags in metadata["genres"][htid], when the
      pairtree-converted htid is in rowindices;
    * the highest-scoring column of modeldata among those named in
      options, when htid is in modelindices ("bio" is treated as "non").

    Returns (genresequence, reported). reported maps "weakconfirmation",
    "weakdenial", "strongconfirmation", "strongdenial", "modelagrees" and
    "modeldisagrees" to 0 or 1. genresequence is the same object that was
    passed in: when a tag rules out fiction ("Biography", "Autobiography",
    or a "Catalog"/"Dictionary"/"Bibliographies" tag on a volume whose
    dominant genre is "non"), every "fic" page is overwritten with "non"
    in place.
    '''

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Every possible condition starts out negative.
    reported = {
        "weakconfirmation": 0,
        "weakdenial": 0,
        "strongconfirmation": 0,
        "strongdenial": 0,
        "modelagrees": 0,
        "modeldisagrees": 0,
    }

    # Assess the largest genre in this volume.
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}
    for page in genresequence:
        # Biography, indexes and back matter count as nonfiction here.
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    inmetadata = htid in rowindices
    inmodel = htid in modelindices
    if not inmetadata and not inmodel:
        return genresequence, reported

    # Tags that strongly imply one dominant genre.
    strongevidence = {
        "Fiction": "fic", "Novel": "fic",
        "Biography": "non", "Autobiography": "non",
        "Poetry": "poe", "Poems": "poe",
        "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
        "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
    }
    # Reference-work tags rule out fiction only once the pages agree.
    referenceworks = ("Catalog", "Dictionary", "Bibliographies")

    couldbefiction = True

    if inmetadata:
        # It's a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            # Life-writing tags rule out fiction whatever the pages say.
            if info in ("Biography", "Autobiography"):
                couldbefiction = False

            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1
            elif info == "Not fiction":
                # Only a fiction majority contradicts this tag; poetry or
                # drama majorities are neither confirmed nor denied.
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1
            elif info in strongevidence:
                if maxgenre == strongevidence[info]:
                    reported["strongconfirmation"] = 1
                    if info in referenceworks:
                        couldbefiction = False
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if inmodel:
        # Columns of modeldata that are not named in options are not genres.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Only the top prediction is needed, so a single genre column is
        # enough; nothing below the first entry is read.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
        else:
            reported["modeldisagrees"] = 1
    # Without a model row both model flags keep their initial 0.

    if not couldbefiction:
        # Overwrite in place so the caller's sequence is corrected too.
        for i, page in enumerate(genresequence):
            if page == "fic":
                genresequence[i] = "non"

    return genresequence, reported
Ejemplo n.º 14
0
# Load volume metadata from the CSV at metafile. Column 0 is the id code,
# column 1 an integer date and column 3 an integer word count. Fills
# dateset, id2date, id2wordcount and the per-date totals in date2wordcount.
# newline='' is what the csv module requires of the file object, so that
# line endings inside quoted fields are parsed correctly.
with open(metafile, encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than fail on row[0].
        if not row:
            continue
        # skip header
        if row[0] == "idcode":
            continue
        code = row[0]
        # Ids listed in badids are ignored entirely.
        if code in badids:
            continue
        date = int(row[1])
        dateset.add(date)
        wordcount = int(row[3])

        id2date[code] = date
        utils.addtodict(date, wordcount, date2wordcount)
        id2wordcount[code] = wordcount


def pricesymbol(keyword):
    '''Return True if keyword is exactly one of the currency symbols
    '$', '£' or '¢', and False for anything else.'''
    # NOTE(review): an earlier form of this check tested '£' twice; a
    # different symbol may have been intended for the second test.
    return keyword in ('$', '£', '¢')
def censor(htid, genresequence):
    '''Cross-check page-level genres against volume-level evidence.

    The dominant page genre is found by counting the labels in
    genresequence (with "bio", "index" and "back" counted as "non") and
    taking the first entry returned by utils.sortkeysbyvalue. It is then
    compared with two module-level sources:

    * the semicolon-delimited tags in metadata["genres"][htid], when the
      pairtree-converted htid is in rowindices;
    * the highest-scoring column of modeldata among those named in
      options, when htid is in modelindices ("bio" is treated as "non").

    Returns (genresequence, reported). reported maps "weakconfirmation",
    "weakdenial", "strongconfirmation", "strongdenial", "modelagrees" and
    "modeldisagrees" to 0 or 1. genresequence is the same object that was
    passed in: when a tag rules out fiction ("Biography", "Autobiography",
    or a "Catalog"/"Dictionary"/"Bibliographies" tag on a volume whose
    dominant genre is "non"), every "fic" page is overwritten with "non"
    in place.
    '''

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Every possible condition starts out negative.
    reported = {
        "weakconfirmation": 0,
        "weakdenial": 0,
        "strongconfirmation": 0,
        "strongdenial": 0,
        "modelagrees": 0,
        "modeldisagrees": 0,
    }

    # Assess the largest genre in this volume.
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}
    for page in genresequence:
        # Biography, indexes and back matter count as nonfiction here.
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    inmetadata = htid in rowindices
    inmodel = htid in modelindices
    if not inmetadata and not inmodel:
        return genresequence, reported

    # Tags that strongly imply one dominant genre.
    strongevidence = {
        "Fiction": "fic", "Novel": "fic",
        "Biography": "non", "Autobiography": "non",
        "Poetry": "poe", "Poems": "poe",
        "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
        "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
    }
    # Reference-work tags rule out fiction only once the pages agree.
    referenceworks = ("Catalog", "Dictionary", "Bibliographies")

    couldbefiction = True

    if inmetadata:
        # It's a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            # Life-writing tags rule out fiction whatever the pages say.
            if info in ("Biography", "Autobiography"):
                couldbefiction = False

            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1
            elif info == "Not fiction":
                # Only a fiction majority contradicts this tag; poetry or
                # drama majorities are neither confirmed nor denied.
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1
            elif info in strongevidence:
                if maxgenre == strongevidence[info]:
                    reported["strongconfirmation"] = 1
                    if info in referenceworks:
                        couldbefiction = False
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if inmodel:
        # Columns of modeldata that are not named in options are not genres.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Only the top prediction is needed, so a single genre column is
        # enough; nothing below the first entry is read.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
        else:
            reported["modeldisagrees"] = 1
    # Without a model row both model flags keep their initial 0.

    if not couldbefiction:
        # Overwrite in place so the caller's sequence is corrected too.
        for i, page in enumerate(genresequence):
            if page == "fic":
                genresequence[i] = "non"

    return genresequence, reported
Ejemplo n.º 16
0
# Load volume metadata from the CSV at metafile. Column 0 is the id code,
# column 1 an integer date and column 3 an integer word count. Fills
# dateset, id2date, id2wordcount and the per-date totals in date2wordcount.
# newline='' is what the csv module requires of the file object, so that
# line endings inside quoted fields are parsed correctly.
with open(metafile, encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than fail on row[0].
        if not row:
            continue
        # skip header
        if row[0] == "idcode":
            continue
        code = row[0]
        # Ids listed in badids are ignored entirely.
        if code in badids:
            continue
        date = int(row[1])
        dateset.add(date)
        wordcount = int(row[3])

        id2date[code] = date
        utils.addtodict(date, wordcount, date2wordcount)
        id2wordcount[code] = wordcount

def pricesymbol(keyword):
    '''Return True if keyword is exactly one of the currency symbols
    '$', '£' or '¢', and False for anything else.'''
    # NOTE(review): an earlier form of this check tested '£' twice; a
    # different symbol may have been intended for the second test.
    return keyword in ('$', '£', '¢')

def priceinsnippet(snippet):
Ejemplo n.º 17
0
    # Read the whole source file into memory as a list of lines.
    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding='utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    # The wordcounter module expects a list of pages, each of which is a list of lines.
    # Ebooks have no pages -- at least as I currently receive them -- so we treat it
    # all as one giant page.

    tokenstream = wordcounter.makestream(pagelist)
    # Of the four results, only wordcounts and alphanum_tokens are used below.
    wordcounts, wordsfused, triplets, alphanum_tokens = wordcounter.count_tokens(
        tokenstream,
        targetwords=targetwords,
        targetphrases=targetphrases,
        verbose=verbose)

    # Fold this file's counts into the totals kept for its date. Every entry
    # in wordcounts contributes to "moneywords"; the word itself is not
    # inspected here.
    # NOTE(review): presumably wordcounts holds only the requested target
    # words/phrases -- confirm against wordcounter.count_tokens.
    thisyear = summedbydate[date]
    for word, count in wordcounts.items():
        utils.addtodict("moneywords", count, thisyear)

    utils.addtodict("totalwords", alphanum_tokens, thisyear)

# Write one tab-separated row per date in datelist: the date followed by its
# "moneywords" and "totalwords" totals from summedbydate, under a header row.
outfile = "/Users/tunder/Dropbox/Documents/Conferences (UIUC)/Rutgers talk/1950data.tsv"
with open(outfile, mode='w', encoding='utf-8') as f:
    f.write('year\tmoneywords\ttotalwords\n')
    for date in datelist:
        columns = (date, summedbydate[date]["moneywords"],
                   summedbydate[date]["totalwords"])
        outline = '\t'.join(map(str, columns)) + '\n'
        f.write(outline)
# Tally rows per LC value (the result of letterpart() on the "LOCnum"
# metadata field): once overall in allLCs, and once in each genre dictionary
# whose flag choose_cascade() sets for the row.
# NOTE(review): LC is presumably the letter prefix of a Library of Congress
# call number -- confirm against letterpart.
allLCs = {}
bio = {}
fic = {}
poe = {}
dra = {}

ctr = 0
for rowidx in rowindices:

    loc = metadata["LOCnum"][rowidx]

    flags = choose_cascade(rowidx)
    probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction = flags
    LC = letterpart(loc)

    utils.addtodict(LC, 1, allLCs)

    for flagged, weight, tally in (
            (probablybiography, 1, bio),
            (probablydrama, 1, dra),
            (probablyfiction, 1, fic),
            (probablypoetry, 1, poe),
            # A "maybe" adds only a tenth of a vote to fiction.
            (maybefiction, 0.1, fic),
    ):
        if flagged:
            utils.addtodict(LC, weight, tally)

    ctr += 1
    # Progress report on the 1st, 1001st, 2001st ... row.
    if ctr % 1000 == 1:
        print(ctr)
Ejemplo n.º 19
0
        continue
    # Read the whole source file into memory as a list of lines.
    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding = 'utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    # The wordcounter module expects a list of pages, each of which is a list of lines.
    # Ebooks have no pages -- at least as I currently receive them -- so we treat it
    # all as one giant page.

    tokenstream = wordcounter.makestream(pagelist)
    # Of the four results, only wordcounts and alphanum_tokens are used below.
    wordcounts, wordsfused, triplets, alphanum_tokens = wordcounter.count_tokens(tokenstream, targetwords=targetwords, targetphrases=targetphrases, verbose = verbose)

    # Fold this file's counts into the totals kept for its date. Every entry
    # in wordcounts contributes to "moneywords"; the word itself is not
    # inspected here.
    # NOTE(review): presumably wordcounts holds only the requested target
    # words/phrases -- confirm against wordcounter.count_tokens.
    thisyear = summedbydate[date]
    for word, count in wordcounts.items():
        utils.addtodict("moneywords", count, thisyear)

    utils.addtodict("totalwords", alphanum_tokens, thisyear)

# Write one tab-separated row per date in datelist: the date followed by its
# "moneywords" and "totalwords" totals from summedbydate, under a header row.
outfile = "/Users/tunder/Dropbox/Documents/Conferences (UIUC)/Rutgers talk/1950data.tsv"
with open(outfile, mode='w', encoding='utf-8') as f:
    f.write('year\tmoneywords\ttotalwords\n')
    for date in datelist:
        columns = (date, summedbydate[date]["moneywords"],
                   summedbydate[date]["totalwords"])
        outline = '\t'.join(map(str, columns)) + '\n'
        f.write(outline)





Ejemplo n.º 20
0
# Tally rows per LC value (the result of letterpart() on the "LOCnum"
# metadata field): once overall in allLCs, and once in each genre dictionary
# whose flag choose_cascade() sets for the row.
# NOTE(review): LC is presumably the letter prefix of a Library of Congress
# call number -- confirm against letterpart.
allLCs = {}
bio = {}
fic = {}
poe = {}
dra = {}

ctr = 0
for rowidx in rowindices:

    loc = metadata["LOCnum"][rowidx]

    flags = choose_cascade(rowidx)
    probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction = flags
    LC = letterpart(loc)

    utils.addtodict(LC, 1, allLCs)

    for flagged, weight, tally in (
            (probablybiography, 1, bio),
            (probablydrama, 1, dra),
            (probablyfiction, 1, fic),
            (probablypoetry, 1, poe),
            # A "maybe" adds only a tenth of a vote to fiction.
            (maybefiction, 0.1, fic),
    ):
        if flagged:
            utils.addtodict(LC, weight, tally)

    ctr += 1
    # Progress report on the 1st, 1001st, 2001st ... row.
    if ctr % 1000 == 1:
        print(ctr)