def select_common_features(trainingset, n):
    '''Select the n most frequent features (words) in the training set.

    Very simply takes the top n by summed raw count. Not a sophisticated
    feature-selection strategy, but in many cases it gets the job done.

    Parameters
    ----------
    trainingset : iterable
        Volumes, each exposing a ``rawcounts`` mapping of word -> count.
    n : int
        Number of features requested; clamped to the vocabulary size
        (with a console notice) when fewer features exist.

    Returns
    -------
    list
        The n most frequent words, in descending order of frequency.
    '''
    # Sum every volume's raw counts into one master Counter
    # (replaces the hand-rolled utils.add_dicts accumulation).
    allwordcounts = Counter()
    for avolume in trainingset:
        allwordcounts.update(avolume.rawcounts)

    if n > len(allwordcounts):
        n = len(allwordcounts)
        print("We only have " + str(n) + " features.")

    # most_common yields (word, count) pairs in descending-frequency order;
    # keep only the words.
    return [word for word, _ in allwordcounts.most_common(n)]
def select_common_features(trainingset, n):
    '''Pick the n highest-frequency features in the training set.

    A deliberately naive strategy — just the most common words — but it is
    often all that's needed.
    '''
    # Accumulate every volume's raw counts into a single master dictionary.
    combined_counts = dict()
    for volume in trainingset:
        utils.add_dicts(volume.rawcounts, combined_counts)

    # Sorted list of (frequency, word) tuples, most frequent first.
    ranked = utils.sortkeysbyvalue(combined_counts, whethertoreverse=True)

    # Clamp the request to the number of features actually available.
    if n > len(ranked):
        n = len(ranked)
        print("We only have " + str(n) + " features.")

    # Keep the word (second tuple element) from each of the top n pairs.
    topfeatures = []
    for pair in ranked[:n]:
        topfeatures.append(pair[1])
    return topfeatures
def main(listofmodels=None):
    '''Ensemble the page-level genre predictions of several models.

    For every volume that has a prediction file and a ground-truth genre map,
    collect each model's rough and smoothed page-genre sequences, accumulate
    per-page genre probabilities across models, and take a vote for each page.

    Parameters
    ----------
    listofmodels : sequence of str, optional
        Names of model output directories under the prediction root.
        Defaults to the standard ensemble of nine models.

    Returns
    -------
    tuple of dict
        (consensus, secondthoughts, pageprobsforfile, dissentsequences,
        groundtruths), each keyed by HathiTrust volume id.
    '''
    # Avoid a mutable default argument; bind the standard ensemble here.
    if listofmodels is None:
        listofmodels = ["newfeatures6", "newfeatures2", "newfeatures3",
                        "newfeatures4", "newfeatures9", "forest",
                        "bycallno", "forest2", "forest4"]

    # Map raw page-genre codes onto the coarser scheme used for voting.
    genretranslations = {'subsc': 'front', 'argum': 'non', 'pref': 'non',
                         'aut': 'bio', 'bio': 'bio', 'toc': 'front',
                         'title': 'front', 'bookp': 'front', 'bibli': 'back',
                         'gloss': 'back', 'epi': 'fic', 'errat': 'non',
                         'notes': 'non', 'ora': 'non', 'let': 'bio',
                         'trv': 'non', 'lyr': 'poe', 'nar': 'poe',
                         'vdr': 'dra', 'pdr': 'dra', 'clo': 'dra',
                         'impri': 'front', 'libra': 'back', 'index': 'back'}

    predictroot = "/Volumes/TARDIS/output/"
    firstdir = predictroot + listofmodels[0] + "/"
    validfiles = [filename for filename in os.listdir(firstdir)
                  if filename.endswith(".predict")]

    groundtruthdir = "/Users/tunder/Dropbox/pagedata/newfeatures/genremaps/"
    groundtruthfiles = os.listdir(groundtruthdir)

    groundtruths = dict()
    htidtable = dict()
    for filename in validfiles:
        gt = get_ground_truth_file(filename)
        if gt not in groundtruthfiles:
            continue
        # Strip the four-character extension to recover the volume id.
        htid = gt[0:-4]
        htidtable[filename] = htid
        if gt != "":
            groundtruths[htid] = get_ground_truth(gt, groundtruthdir,
                                                  genretranslations)

    dissensus = dict()
    pageprobsforfile = dict()

    for filename in validfiles:
        # Files with no ground truth were never entered in htidtable;
        # indexing it unconditionally (as the original did) raised KeyError.
        if filename not in htidtable:
            continue
        htid = htidtable[filename]
        versions = list()
        pageprobs = list()
        for model in listofmodels:
            thispath = predictroot + model + "/" + filename
            try:
                with open(thispath, encoding="utf-8") as f:
                    filelines = f.readlines()
            except OSError:
                # Best-effort ensemble: a model that did not predict this
                # volume simply contributes no vote.
                continue

            if len(pageprobs) < len(filelines):
                # Initialize one probability dict per page, exactly once.
                if len(pageprobs) > 0:
                    print("Initializing more than once. Error condition.")
                for _ in range(len(filelines)):
                    pageprobs.append(dict())

            smoothlist = list()
            roughlist = list()
            try:
                for i, line in enumerate(filelines):
                    fields = line.rstrip().split('\t')
                    roughlist.append(fields[1])
                    smoothlist.append(fields[2])
                    if len(fields) > 5:
                        probdict = interpret_probabilities(fields[5:])
                        # Add this model's probabilities for the page to the
                        # running per-page totals.
                        utils.add_dicts(probdict, pageprobs[i])
            except IndexError:
                # Malformed prediction line: drop this model's vote, as the
                # original bare `except: pass` did, without silencing
                # unrelated errors.
                continue

            versions.append(smoothlist)
            versions.append(roughlist)

        pageprobsforfile[htid] = pageprobs
        # Transpose: one tuple of model votes per page.
        dissensus[htid] = [x for x in zip(*versions)]

    consensus = dict()
    secondthoughts = dict()
    dissentsequences = dict()

    for htid, pagelist in dissensus.items():
        winners = list()
        runnersup = list()
        dissentseq = list()
        pageprobs = pageprobsforfile[htid]
        for i, page in enumerate(pagelist):
            # Tie-break voting with the genre whose summed probability is
            # highest across models.
            floatwinner = maxkey(pageprobs[i])
            winner, dissent, runnerup = resolve_voting(page, floatwinner)
            winners.append(winner)
            runnersup.append(runnerup)
            dissentseq.append(dissent)
        consensus[htid] = winners
        secondthoughts[htid] = runnersup
        dissentsequences[htid] = dissentseq

    return (consensus, secondthoughts, pageprobsforfile,
            dissentsequences, groundtruths)
def main(listofmodels=None):
    '''Ensemble the page-level genre predictions of several models.

    For every volume that has both a prediction file and a ground-truth
    genre map, gather each model's rough and smoothed page-genre sequences,
    sum per-page genre probabilities across models, and vote page by page.

    Parameters
    ----------
    listofmodels : sequence of str, optional
        Names of model output directories under the prediction root.

    Returns
    -------
    tuple of dict
        (consensus, secondthoughts, pageprobsforfile, dissentsequences,
        groundtruths), each keyed by HathiTrust volume id.
    '''
    # Replace the original mutable default argument with a None sentinel.
    if listofmodels is None:
        listofmodels = ["newfeatures6", "newfeatures2", "newfeatures3",
                        "newfeatures4", "newfeatures9", "forest",
                        "bycallno", "forest4", "forest7"]

    # Translate raw page-genre codes into the coarser voting scheme.
    genretranslations = {'subsc': 'front', 'argum': 'non', 'pref': 'non',
                         'aut': 'bio', 'bio': 'bio', 'toc': 'front',
                         'title': 'front', 'bookp': 'front', 'bibli': 'back',
                         'gloss': 'back', 'epi': 'fic', 'errat': 'non',
                         'notes': 'non', 'ora': 'non', 'let': 'bio',
                         'trv': 'non', 'lyr': 'poe', 'nar': 'poe',
                         'vdr': 'dra', 'pdr': 'dra', 'clo': 'dra',
                         'impri': 'front', 'libra': 'back', 'index': 'back'}

    predictroot = "/Volumes/TARDIS/output/"
    firstdir = predictroot + listofmodels[0] + "/"
    validfiles = [filename for filename in os.listdir(firstdir)
                  if filename.endswith(".predict")]

    groundtruthdir = "/Users/tunder/Dropbox/pagedata/newfeatures/genremaps/"
    groundtruthfiles = os.listdir(groundtruthdir)

    groundtruths = dict()
    htidtable = dict()
    for filename in validfiles:
        gt = get_ground_truth_file(filename)
        if gt not in groundtruthfiles:
            continue
        # Drop the four-character extension to recover the volume id.
        htid = gt[0:-4]
        htidtable[filename] = htid
        if gt != "":
            groundtruths[htid] = get_ground_truth(gt, groundtruthdir,
                                                  genretranslations)

    dissensus = dict()
    pageprobsforfile = dict()

    for filename in validfiles:
        # Skip files that have no ground truth: they were never entered in
        # htidtable, and the original's unconditional lookup raised KeyError.
        if filename not in htidtable:
            continue
        htid = htidtable[filename]
        versions = list()
        pageprobs = list()
        for model in listofmodels:
            thispath = predictroot + model + "/" + filename
            try:
                with open(thispath, encoding="utf-8") as f:
                    filelines = f.readlines()
            except OSError:
                # A model with no prediction for this volume contributes
                # nothing to the vote; keep going.
                continue

            if len(pageprobs) < len(filelines):
                # Set up one probability dict per page; should happen once.
                if len(pageprobs) > 0:
                    print("Initializing more than once. Error condition.")
                for _ in range(len(filelines)):
                    pageprobs.append(dict())

            smoothlist = list()
            roughlist = list()
            try:
                for i, line in enumerate(filelines):
                    fields = line.rstrip().split('\t')
                    roughlist.append(fields[1])
                    smoothlist.append(fields[2])
                    if len(fields) > 5:
                        probdict = interpret_probabilities(fields[5:])
                        # Fold this model's page probabilities into the
                        # running per-page totals.
                        utils.add_dicts(probdict, pageprobs[i])
            except IndexError:
                # A malformed line costs this model its vote — mirroring the
                # original bare `except: pass`, but only for parse errors.
                continue

            versions.append(smoothlist)
            versions.append(roughlist)

        pageprobsforfile[htid] = pageprobs
        # Transpose so each entry is one page's tuple of model votes.
        dissensus[htid] = [x for x in zip(*versions)]

    consensus = dict()
    secondthoughts = dict()
    dissentsequences = dict()

    for htid, pagelist in dissensus.items():
        winners = list()
        runnersup = list()
        dissentseq = list()
        pageprobs = pageprobsforfile[htid]
        for i, page in enumerate(pagelist):
            # The genre with the highest summed probability breaks ties.
            floatwinner = maxkey(pageprobs[i])
            winner, dissent, runnerup = resolve_voting(page, floatwinner)
            winners.append(winner)
            runnersup.append(runnerup)
            dissentseq.append(dissent)
        consensus[htid] = winners
        secondthoughts[htid] = runnersup
        dissentsequences[htid] = dissentseq

    return (consensus, secondthoughts, pageprobsforfile,
            dissentsequences, groundtruths)
smoothlist = list() roughlist = list() for i in range(len(filelines)): line = filelines[i] line = line.rstrip() fields = line.split('\t') rough = fields[1] smoothed = fields[2] smoothlist.append(smoothed) roughlist.append(rough) if len(fields) > 5: probdict = interpret_probabilities(fields[5:]) # probdict = normalize(probdict) # make them all sum to 1 utils.add_dicts(probdict, pageprobs[i]) # This will add all the probabilities for this page to the # record of per-page probabilities. versions.append(smoothlist) versions.append(roughlist) except: pass pageprobsforfile[filename] = pageprobs dissensus[filename] = [x for x in zip(*versions)] def maxkey(dictionary): tuplelist = utils.sortkeysbyvalue(dictionary, whethertoreverse = True) winner = tuplelist[0][1]