import os

import utils  # project-local helper module (add_dicts, sortkeysbyvalue)


def select_common_features(trainingset, n):
	''' Very simply, selects the top n features in the training set.
	Not a sophisticated feature-selection strategy, but in many
	cases it gets the job done.
	'''
	allwordcounts = dict()

	for avolume in trainingset:
		utils.add_dicts(avolume.rawcounts, allwordcounts)
		# The add_dicts function will add up all the raw counts into
		# a single master dictionary.

	descendingbyfreq = utils.sortkeysbyvalue(allwordcounts, whethertoreverse=True)
	# This returns a list of 2-tuple (frequency, word) pairs.

	if n > len(descendingbyfreq):
		n = len(descendingbyfreq)
		print("We only have " + str(n) + " features.")

	# List comprehension that gets the second element of each tuple, up to
	# a total of n tuples.

	topfeatures = [x[1] for x in descendingbyfreq[0 : n]]

	return topfeatures
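
# Neither utils.add_dicts nor utils.sortkeysbyvalue appears on this page.
# The following is a rough sketch only, inferred from the call sites
# above; the project's real implementations may differ.

def add_dicts(source, target):
	''' Adds every count in source into target, in place. '''
	for key, value in source.items():
		target[key] = target.get(key, 0) + value

def sortkeysbyvalue(dictionary, whethertoreverse=False):
	''' Returns a list of (value, key) 2-tuples sorted by value;
	whethertoreverse=True puts the largest values first.
	'''
	pairs = [(value, key) for key, value in dictionary.items()]
	pairs.sort(reverse=whethertoreverse)
	return pairs
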
def main(listofmodels=[
    "newfeatures6", "newfeatures2", "newfeatures3", "newfeatures4",
    "newfeatures9", "forest", "bycallno", "forest2", "forest4"
]):

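    # genretranslations collapses the fine-grained page-level genre codes
    # used in the ground truth into the coarser categories being compared
    # here: front matter, back matter, fiction, poetry, drama, biography,
    # and nonfiction.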
    genretranslations = {
        'subsc': 'front',
        'argum': 'non',
        'pref': 'non',
        'aut': 'bio',
        'bio': 'bio',
        'toc': 'front',
        'title': 'front',
        'bookp': 'front',
        'bibli': 'back',
        'gloss': 'back',
        'epi': 'fic',
        'errat': 'non',
        'notes': 'non',
        'ora': 'non',
        'let': 'bio',
        'trv': 'non',
        'lyr': 'poe',
        'nar': 'poe',
        'vdr': 'dra',
        'pdr': 'dra',
        'clo': 'dra',
        'impri': 'front',
        'libra': 'back',
        'index': 'back'
    }

    predictroot = "/Volumes/TARDIS/output/"
    firstdir = predictroot + listofmodels[0] + "/"
    predictfiles = os.listdir(firstdir)

    validfiles = list()

    for filename in predictfiles:
        if filename.endswith(".predict"):
            validfiles.append(filename)

    groundtruthdir = "/Users/tunder/Dropbox/pagedata/newfeatures/genremaps/"

    groundtruthfiles = os.listdir(groundtruthdir)

    groundtruths = dict()
    htidtable = dict()
    for filename in validfiles:
        gt = get_ground_truth_file(filename)
        if gt == "" or gt not in groundtruthfiles:
            continue
        # Strip the file extension to get the volume id (htid).
        htid = gt[0:-4]
        htidtable[filename] = htid
        groundtruths[htid] = get_ground_truth(gt, groundtruthdir,
                                              genretranslations)

    dissensus = dict()
    pageprobsforfile = dict()

    for filename in validfiles:
        if filename not in htidtable:
            # No ground truth for this volume; it was skipped above.
            continue
        htid = htidtable[filename]
        versions = list()
        pageprobs = list()
        for model in listofmodels:
            try:
                thispath = predictroot + model + "/" + filename
                with open(thispath, encoding="utf-8") as f:
                    filelines = f.readlines()

                if len(pageprobs) < len(filelines):
                    # Initialize page probabilities to correct length.
                    if len(pageprobs) > 0:
                        print("Initializing more than once. Error condition.")
                    for i in range(len(filelines)):
                        newdict = dict()
                        pageprobs.append(newdict)

                smoothlist = list()
                roughlist = list()
                for i, line in enumerate(filelines):
                    line = line.rstrip()
                    fields = line.split('\t')
                    rough = fields[1]
                    smoothed = fields[2]
                    smoothlist.append(smoothed)
                    roughlist.append(rough)
                    if len(fields) > 5:
                        probdict = interpret_probabilities(fields[5:])
                        utils.add_dicts(probdict, pageprobs[i])
                        # This will add all the probabilities for this page to the
                        # record of per-page probabilities.

                versions.append(smoothlist)
                versions.append(roughlist)
            except (OSError, IndexError):
                # Missing prediction file or malformed line for this
                # model; skip it rather than aborting the whole run.
                pass
        pageprobsforfile[htid] = pageprobs

        # Transpose: one tuple of votes (two per model) for each page.
        dissensus[htid] = list(zip(*versions))

    consensus = dict()
    secondthoughts = dict()
    dissentsequences = dict()

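    # Page-by-page voting across model versions: resolve_voting picks a
    # winner and runner-up for each page and reports the degree of
    # disagreement (dissent) among the models.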
    for htid, pagelist in dissensus.items():
        winners = list()
        runnersup = list()
        dissentseq = list()
        pageprobs = pageprobsforfile[htid]
        for i, page in enumerate(pagelist):
            floatwinner = maxkey(pageprobs[i])
            winner, dissent, runnerup = resolve_voting(page, floatwinner)
            winners.append(winner)
            runnersup.append(runnerup)
            dissentseq.append(dissent)
        consensus[htid] = winners
        secondthoughts[htid] = runnersup
        dissentsequences[htid] = dissentseq

    return consensus, secondthoughts, pageprobsforfile, dissentsequences, groundtruths
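
# Neither interpret_probabilities nor resolve_voting is included in this
# excerpt. The sketches below are reconstructions from the call sites,
# not the project's actual code; in particular the "fic::0.35" field
# format is a hypothetical stand-in.

from collections import Counter

def interpret_probabilities(fields):
    ''' Hypothetical: parses fields assumed to look like "fic::0.35"
    into a {genre: probability} dictionary.
    '''
    probdict = dict()
    for field in fields:
        genre, value = field.split("::")
        probdict[genre] = float(value)
    return probdict

def resolve_voting(page, floatwinner):
    ''' Hypothetical reconstruction. page is a tuple of genre votes (one
    rough and one smoothed prediction per model); floatwinner is the
    genre with the largest summed probability, used to break ties.
    Returns (winner, dissent, runnerup).
    '''
    votes = Counter(page)
    ranked = votes.most_common()
    winner = ranked[0][0]
    if len(ranked) > 1 and ranked[1][1] == ranked[0][1]:
        # A tie among the models; defer to the probability winner.
        winner = floatwinner
    runnerup = ranked[1][0] if len(ranked) > 1 else winner
    # Dissent is the fraction of votes that disagree with the winner.
    dissent = 1 - votes[winner] / len(page)
    return winner, dissent, runnerup
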
def maxkey(dictionary):
	''' Returns the key with the largest value in the dictionary. '''
	tuplelist = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
	winner = tuplelist[0][1]
	return winner
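
# Illustrative usage: for a page whose summed probabilities are
# {'fic': 0.7, 'poe': 0.2, 'dra': 0.1}, maxkey returns 'fic'.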