def reliable_genre_comparisons():
    '''
    This function was used in the current version of the article.

    It addresses weaknesses in earlier versions of genre comparison
    by comparing only models *with no shared instances*.

    [Edit Jan 1: To be even more careful about leakage, make that
    *no shared authors.*]

    Doing that required a ----load of complexity I'm afraid. I have to
    first split each genre into disjoint sets, then create
    self-comparisons between those disjoint sets, as well as
    cross-comparisons between genres, and then finally compare the
    self-comparisons to the cross-comparisons.
    '''

    outmodels = '../results/reliable_models.tsv'
    outcomparisons = '../results/reliable_comparisons.tsv'
    columns = ['testype', 'name1', 'name2', 'ceiling', 'floor',
               'meandate1', 'meandate2', 'acc1', 'acc2',
               'alienacc1', 'alienacc2', 'spearman', 'spear1on2',
               'spear2on1', 'loss', 'loss1on2', 'loss2on1']

    # Write headers for the two output files if they don't exist yet;
    # every later write is an append.
    if not os.path.isfile(outcomparisons):
        with open(outcomparisons, mode='a', encoding='utf-8') as f:
            scribe = csv.DictWriter(f, delimiter='\t', fieldnames=columns)
            scribe.writeheader()

    if not os.path.isfile(outmodels):
        with open(outmodels, mode='a', encoding='utf-8') as f:
            outline = 'name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n'
            f.write(outline)

    def compare_and_record(testype, part1, part2, floor, ceiling, i):
        # Compare two previously trained models (named by genre-part,
        # period ceiling, and iteration) and append one row of divergence
        # statistics to outcomparisons. Extracted because the original
        # repeated this stanza four times verbatim.
        r = dict()
        r['testype'] = testype
        r['ceiling'] = ceiling
        r['floor'] = floor
        r['name1'] = 'temp_' + part1 + str(ceiling) + '_' + str(i)
        r['name2'] = 'temp_' + part2 + str(ceiling) + '_' + str(i)
        (r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'],
         r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'],
         r['alienacc1'], r['alienacc2'], r['meandate1'],
         r['meandate2']) = get_divergence(r['name1'], r['name2'])
        write_a_row(r, outcomparisons, columns)

    sourcefolder = '../data/'
    sizecap = 72

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1500
    featureend = 6500
    featurestep = 300
    modelparams = 'logistic', 15, featurestart, featureend, featurestep, c_range

    master = pd.read_csv('../metadata/mastermetadata.csv', index_col='docid')
    # endpoints both inclusive
    periods = [(1800, 1909), (1880, 1924), (1900, 1949), (1910, 1959),
               (1930, 1969), (1950, 1979), (1970, 1989), (1980, 1999),
               (1990, 2010)]
    forbiddenwords = {'fantasy', 'fiction', 'science', 'horror'}

    for i in range(15):
        for floor, ceiling in periods:

            # split_metadata does the real work of preventing leakage,
            # by splitting the genre into two disjoint sets. This allows
            # self-comparisons that avoid shared authors, and are thus
            # strictly comparable to cross-comparisons.
            split_metadata(master, floor, ceiling, sizecap)

            metaoptions = ['sf1', 'sf2', 'fant1', 'fant2']

            for m in metaoptions:
                metadatapath = '../temp/' + m + '.csv'
                vocabpath = '../lexica/' + m + '.txt'
                name = 'temp_' + m + str(ceiling) + '_' + str(i)

                if m == 'sf1' or m == 'sf2':
                    tags4positive = {'sf_loc', 'sf_oclc', 'sf_bailey'}
                else:
                    tags4positive = {'fantasy_loc', 'fantasy_oclc', 'supernat'}
                tags4negative = {'random', 'randomB'}

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, excludebelow=floor,
                    excludeabove=ceiling, forbid4positive={'juv'},
                    forbid4negative={'juv'}, force_even_distribution=False,
                    numfeatures=6500, forbiddenwords=forbiddenwords)

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../modeloutput/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(sizecap) + '\t' + str(floor) + '\t' + str(ceiling) + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\t' + str(i) + '\n'
                    f.write(outline)

                os.remove(vocabpath)

            # Two self-comparisons (within a genre, across disjoint
            # author sets) and two cross-comparisons (between genres).
            compare_and_record('sfself', 'sf1', 'sf2', floor, ceiling, i)
            compare_and_record('fantasyself', 'fant1', 'fant2', floor, ceiling, i)
            compare_and_record('cross', 'sf1', 'fant2', floor, ceiling, i)
            compare_and_record('cross', 'sf2', 'fant1', floor, ceiling, i)
def create_cross_models():
    '''
    For every genre tag found in ../genremeta.csv, trains one model of
    that genre against each of two random contrast sets ('randomA' and
    'randomB'), skipping any model whose output file already exists, and
    appends a summary line per model to ../results/crossmodels.tsv.
    '''

    meta = pd.read_csv('../genremeta.csv')
    # Flatten the pipe-delimited tag column into a deduplicated list of
    # genre labels (comprehension replaces the original nested loops).
    allgenres = list({g for tagstring in meta.tags for g in tagstring.split('|')})
    print(allgenres)

    # Loop-invariant configuration, hoisted out of the genre loop.
    sourcefolder = '../data/'
    sizecap = 100
    outmodels = '../results/crossmodels.tsv'

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1000
    featureend = 7000
    featurestep = 100
    modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
    metadatapath = '../genremeta.csv'

    for g in allgenres:
        print()
        print(g)
        print()

        for contrast in ['randomA', 'randomB']:
            name = g + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = {g}
            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, excludebelow=floor,
                    excludeabove=ceiling, force_even_distribution=False,
                    negative_strategy='closely match', numfeatures=7000,
                    forbid4positive=set(), forbid4negative=set())

                # notice that I am excluding children's lit this time!

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../models/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def implement_assignment(assignment_file):
    '''
    Reads a tab-separated assignment file whose first column is a genre
    name and whose second column is either 'self' or a second
    '<X>-Not-<Y>' positive tag, builds a dict mapping a cleaned model
    name to its list of positive tags, and trains each model against
    two random contrast sets, skipping models already on disk.
    '''

    assignments = dict()

    with open(assignment_file, encoding='utf-8') as f:
        for line in f:
            row = line.strip().split('\t')
            # Strip punctuation/spaces so the name is filesystem-safe.
            name = row[0].replace(': ', '')
            name = name.replace(' ', '')
            name = name.replace(',', '')
            positive_genres = [row[0]]
            if row[1] != 'self':
                positive_genres.append(row[1])
                exclusion = row[1].split('-Not-')[1]
                excludename = exclusion.replace(' ', '')
                excludename = excludename.replace(':', '')
                excludename = excludename.replace(',', '')
                name = name + '-Not-' + excludename
            assignments[name] = positive_genres

    # Loop-invariant configuration, hoisted out of the model loop.
    sourcefolder = '../data/'
    sizecap = 100
    outmodels = '../results/crossmodels.tsv'

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 500
    featureend = 6800
    featurestep = 100
    modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
    metadatapath = '../metadata/genremeta.csv'

    for posname, assigned_positives in assignments.items():
        print()
        # BUG FIX: the original printed the stale loop variable `name`
        # (left over from the parsing loop / previous iteration) instead
        # of the key actually being processed.
        print(posname, assigned_positives)
        print()

        if len(assigned_positives) > 1:
            # Forbid the excluded genre (and its 'B' variant) from the
            # positive set.
            exclusion = assigned_positives[1].split('-Not-')[1]
            exclusionB = exclusion + ' B'
            set2exclude = {exclusion, exclusionB}
        else:
            set2exclude = set()

        for contrast in ['randomA', 'randomB']:
            name = posname + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = set(assigned_positives)
            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, excludebelow=floor,
                    excludeabove=ceiling, force_even_distribution=False,
                    negative_strategy='closely match', numfeatures=6900,
                    forbid4positive=set2exclude, forbid4negative=set())

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../models/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def implement_assignment(assignment_file):
    '''
    Trains one model per line of a tab-separated assignment file:
    column 1 is a genre name, column 2 is either 'self' or an extra
    '<X>-Not-<Y>' positive tag. Each assignment is modeled against two
    random contrast sets; existing models are skipped.

    NOTE(review): this is a duplicate definition of implement_assignment;
    at import time it shadows the earlier one in this file.
    '''

    assignments = dict()

    with open(assignment_file, encoding='utf-8') as f:
        for line in f:
            row = line.strip().split('\t')
            # Strip punctuation/spaces so the name is filesystem-safe.
            name = row[0].replace(': ', '')
            name = name.replace(' ', '')
            name = name.replace(',', '')
            positive_genres = [row[0]]
            if row[1] != 'self':
                positive_genres.append(row[1])
                exclusion = row[1].split('-Not-')[1]
                excludename = exclusion.replace(' ', '')
                excludename = excludename.replace(':', '')
                excludename = excludename.replace(',', '')
                name = name + '-Not-' + excludename
            assignments[name] = positive_genres

    # Loop-invariant configuration, hoisted out of the model loop.
    sourcefolder = '../data/'
    sizecap = 100
    outmodels = '../results/crossmodels.tsv'

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 500
    featureend = 6800
    featurestep = 100
    modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
    metadatapath = '../metadata/genremeta.csv'

    for posname, assigned_positives in assignments.items():
        print()
        # BUG FIX: the original printed the stale variable `name` rather
        # than the dict key being processed.
        print(posname, assigned_positives)
        print()

        if len(assigned_positives) > 1:
            # Forbid the excluded genre (and its 'B' variant) from the
            # positive set.
            exclusion = assigned_positives[1].split('-Not-')[1]
            exclusionB = exclusion + ' B'
            set2exclude = {exclusion, exclusionB}
        else:
            set2exclude = set()

        for contrast in ['randomA', 'randomB']:
            name = posname + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = set(assigned_positives)
            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, excludebelow=floor,
                    excludeabove=ceiling, force_even_distribution=False,
                    negative_strategy='closely match', numfeatures=6900,
                    forbid4positive=set2exclude, forbid4negative=set())

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../models/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def reliable_genre_comparisons():
    '''
    This function was used in the current version of the article.

    It addresses weaknesses in earlier versions of genre comparison
    by comparing only models *with no shared instances*.

    [Edit Jan 1: To be even more careful about leakage, make that
    *no shared authors.*]

    Doing that required a ----load of complexity I'm afraid. I have to
    first split each genre into disjoint sets, then create
    self-comparisons between those disjoint sets, as well as
    cross-comparisons between genres, and then finally compare the
    self-comparisons to the cross-comparisons.

    NOTE(review): this is a duplicate definition; at import time it
    shadows the earlier reliable_genre_comparisons in this file.
    '''

    outmodels = '../results/reliable_models.tsv'
    outcomparisons = '../results/reliable_comparisons.tsv'
    columns = ['testype', 'name1', 'name2', 'ceiling', 'floor',
               'meandate1', 'meandate2', 'acc1', 'acc2',
               'alienacc1', 'alienacc2', 'spearman', 'spear1on2',
               'spear2on1', 'loss', 'loss1on2', 'loss2on1']

    # Write headers for the two output files if they don't exist yet;
    # every later write is an append.
    if not os.path.isfile(outcomparisons):
        with open(outcomparisons, mode='a', encoding='utf-8') as f:
            scribe = csv.DictWriter(f, delimiter='\t', fieldnames=columns)
            scribe.writeheader()

    if not os.path.isfile(outmodels):
        with open(outmodels, mode='a', encoding='utf-8') as f:
            outline = 'name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n'
            f.write(outline)

    def compare_and_record(testype, part1, part2, floor, ceiling, i):
        # Compare two previously trained models and append one row of
        # divergence statistics to outcomparisons. Extracted because the
        # original repeated this stanza four times verbatim.
        r = dict()
        r['testype'] = testype
        r['ceiling'] = ceiling
        r['floor'] = floor
        r['name1'] = 'temp_' + part1 + str(ceiling) + '_' + str(i)
        r['name2'] = 'temp_' + part2 + str(ceiling) + '_' + str(i)
        (r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'],
         r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'],
         r['alienacc1'], r['alienacc2'], r['meandate1'],
         r['meandate2']) = get_divergence(r['name1'], r['name2'])
        write_a_row(r, outcomparisons, columns)

    sourcefolder = '../data/'
    sizecap = 72

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1500
    featureend = 6500
    featurestep = 300
    modelparams = 'logistic', 15, featurestart, featureend, featurestep, c_range

    master = pd.read_csv('../metadata/mastermetadata.csv', index_col='docid')
    # endpoints both inclusive
    periods = [(1800, 1909), (1880, 1924), (1900, 1949), (1910, 1959),
               (1930, 1969), (1950, 1979), (1970, 1989), (1980, 1999),
               (1990, 2010)]
    forbiddenwords = {'fantasy', 'fiction', 'science', 'horror'}

    for i in range(15):
        for floor, ceiling in periods:

            # split_metadata does the real work of preventing leakage,
            # by splitting the genre into two disjoint sets. This allows
            # self-comparisons that avoid shared authors, and are thus
            # strictly comparable to cross-comparisons.
            split_metadata(master, floor, ceiling, sizecap)

            metaoptions = ['sf1', 'sf2', 'fant1', 'fant2']

            for m in metaoptions:
                metadatapath = '../temp/' + m + '.csv'
                vocabpath = '../lexica/' + m + '.txt'
                name = 'temp_' + m + str(ceiling) + '_' + str(i)

                if m == 'sf1' or m == 'sf2':
                    tags4positive = {'sf_loc', 'sf_oclc', 'sf_bailey'}
                else:
                    tags4positive = {'fantasy_loc', 'fantasy_oclc', 'supernat'}
                tags4negative = {'random', 'randomB'}

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, excludebelow=floor,
                    excludeabove=ceiling, forbid4positive={'juv'},
                    forbid4negative={'juv'}, force_even_distribution=False,
                    numfeatures=6500, forbiddenwords=forbiddenwords)

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../modeloutput/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(sizecap) + '\t' + str(floor) + '\t' + str(ceiling) + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\t' + str(i) + '\n'
                    f.write(outline)

                os.remove(vocabpath)

            # Two self-comparisons (within a genre, across disjoint
            # author sets) and two cross-comparisons (between genres).
            compare_and_record('sfself', 'sf1', 'sf2', floor, ceiling, i)
            compare_and_record('fantasyself', 'fant1', 'fant2', floor, ceiling, i)
            compare_and_record('cross', 'sf1', 'fant2', floor, ceiling, i)
            compare_and_record('cross', 'sf2', 'fant1', floor, ceiling, i)
def vary_fantasy_ratio_against_sf():
    '''
    Models fantasy against SF at a series of dilution ratios (pct of the
    positive class mixed), recording name, size, ratio, accuracy,
    feature count and regularization for each model in
    ../measuredivergence/modeldata.tsv.
    '''

    if not os.path.isfile('../measuredivergence/modeldata.tsv'):
        with open('../measuredivergence/modeldata.tsv', mode='w', encoding='utf-8') as f:
            outline = 'name\tsize\tratio\taccuracy\tfeatures\tregularization\n'
            f.write(outline)

    size = 80

    for iteration in [8, 9, 10]:
        ceiling = 105
        # The last iteration only runs the zero-dilution condition.
        if iteration == 10:
            ceiling = 5

        for pct in range(0, ceiling, 5):
            ratio = pct / 100
            name = 'iter' + str(iteration) + '_size' + str(size) + '_ratio' + str(pct)
            vocabpath = '../measuredivergence/vocabularies/' + name + '.txt'
            tags4positive = {'fantasy_loc', 'fantasy_oclc'}
            tags4negative = {'sf_loc', 'sf_oclc'}

            metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = get_ratio_data(
                vocabpath, size, ratio, tags4positive, tags4negative,
                excludebelow=0, excludeabove=3000)

            c_range = [.00005, .0003, .001, .004, .012, 0.2, 0.8, 3]
            featurestart = 2000
            featureend = 8000
            featurestep = 200
            modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range

            # BUG FIX: the original reassigned tags4positive = size and
            # tags4negative = ratio here, passing bare ints where the
            # sibling functions (vary_fantasy_ratio_against_random,
            # vary_sf_ratio_against_random) pass the real tag sets.
            # The genuine tag sets are now forwarded to tune_a_model.
            matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                metadata, masterdata, classvector, classdictionary,
                orderedIDs, authormatches, vocablist, tags4positive,
                tags4negative, modelparams, name,
                '../measuredivergence/modeloutput/' + name + '.csv',
                write_fullmodel=True)

            with open('../measuredivergence/modeldata.tsv', mode='a', encoding='utf-8') as f:
                outline = name + '\t' + str(size) + '\t' + str(ratio) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                f.write(outline)
def create_cross_models():
    '''
    Trains one model per genre tag in ../genremeta.csv against each of
    two random contrast sets, skipping models already on disk, and
    appends a summary line per model to ../results/crossmodels.tsv.

    NOTE(review): this is a duplicate definition of create_cross_models;
    at import time it shadows the earlier one in this file.
    '''

    meta = pd.read_csv('../genremeta.csv')
    # Flatten the pipe-delimited tag column into a deduplicated list of
    # genre labels (comprehension replaces the original nested loops).
    allgenres = list({g for tagstring in meta.tags for g in tagstring.split('|')})
    print(allgenres)

    # Loop-invariant configuration, hoisted out of the genre loop.
    sourcefolder = '../data/'
    sizecap = 100
    outmodels = '../results/crossmodels.tsv'

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1000
    featureend = 7000
    featurestep = 100
    modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
    metadatapath = '../genremeta.csv'

    for g in allgenres:
        print()
        print(g)
        print()

        for contrast in ['randomA', 'randomB']:
            name = g + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = {g}
            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, excludebelow=floor,
                    excludeabove=ceiling, force_even_distribution=False,
                    negative_strategy='closely match', numfeatures=7000,
                    forbid4positive=set(), forbid4negative=set())

                # notice that I am excluding children's lit this time!

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../models/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def train_nonmodel():
    '''
    Trains a single model separating 'notfiction' from several fiction
    categories, reading features from a precomputed sample matrix and
    writing model output to output/nonmodel.csv.
    '''

    # Data sources and model identity.
    name = 'nonmodel'
    sourcefolder = 'samplematrix.csv'
    metadatapath = '../union_of_subsets.csv'
    vocabpath = 'dummyvariable'

    # Class definitions.
    tags4positive = {'notfiction'}
    tags4negative = {'longfiction', 'shortfiction', 'juvenile', 'poetry', 'drama'}

    # Sampling and date limits.
    sizecap = 700
    floor = 1800
    ceiling = 2011

    # Tuning grid: regularization constants plus a narrow feature sweep.
    regularization_grid = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1003
    featureend = 1103
    featurestep = 100
    modelparams = 'logistic', 12, featurestart, featureend, featurestep, regularization_grid

    (metadata, masterdata, classvector, classdictionary, orderedIDs,
     authormatches, vocablist) = versatiletrainer2.get_simple_data(
        sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative,
        sizecap, excludebelow=floor, excludeabove=ceiling,
        forbid4positive={'nothing'}, forbid4negative={'nothing'},
        force_even_distribution=False, negative_strategy='random',
        numfeatures=7500)

    # Return values are not used; the function is run for its side
    # effect of writing output/nonmodel.csv.
    versatiletrainer2.tune_a_model(
        metadata, masterdata, classvector, classdictionary, orderedIDs,
        authormatches, vocablist, tags4positive, tags4negative,
        modelparams, name, 'output/' + name + '.csv')
def first_experiment():
    '''
    Early exploratory run: models fantasy against SF on up to 200
    volumes per class, then displays the resulting accuracy matrix
    as a heatmap.
    '''

    sourcefolder = '../data/'
    metadatapath = '../metadata/mastermetadata.csv'
    vocabpath = '../modeloutput/experimentalvocab.txt'
    tags4positive = {'fantasy_loc', 'fantasy_oclc'}
    tags4negative = {'sf_loc', 'sf_oclc'}
    sizecap = 200

    (metadata, masterdata, classvector, classdictionary, orderedIDs,
     authormatches, vocablist) = versatiletrainer2.get_simple_data(
        sourcefolder, metadatapath, vocabpath,
        tags4positive, tags4negative, sizecap)

    # Tuning grid: regularization constants and a 3000-4400 feature
    # sweep in steps of 100, with 10 folds.
    c_range = [.004, .012, 0.3, 0.8, 2]
    modelparams = 'logistic', 10, 3000, 4400, 100, c_range

    (matrix, maxaccuracy, metadata, coefficientuples, features4max,
     best_regularization_coef) = versatiletrainer2.tune_a_model(
        metadata, masterdata, classvector, classdictionary, orderedIDs,
        authormatches, vocablist, tags4positive, tags4negative,
        modelparams, 'first_experiment',
        '../modeloutput/first_experiment.csv')

    # Visualize accuracy across the (features x regularization) grid.
    plt.rcParams["figure.figsize"] = [9.0, 6.0]
    plt.matshow(matrix, origin='lower', cmap=plt.cm.YlOrRd)
    plt.show()
def vary_fantasy_ratio_against_random():
    '''
    Models fantasy against a random contrast set at a series of dilution
    ratios, appending one summary line per model to
    ../measuredivergence/modeldata.tsv.
    '''

    datapath = '../measuredivergence/modeldata.tsv'
    if not os.path.isfile(datapath):
        with open(datapath, mode='w', encoding='utf-8') as f:
            f.write('name\tsize\tratio\taccuracy\tfeatures\tregularization\n')

    size = 80

    for iteration in [11, 12, 13]:
        # The last iteration only runs the zero-dilution condition.
        ceiling = 5 if iteration == 13 else 105

        for pct in range(0, ceiling, 5):
            ratio = pct / 100
            name = 'iter' + str(iteration) + '_size' + str(size) + '_ratio' + str(pct)
            vocabpath = '../measuredivergence/vocabularies/' + name + '.txt'
            tags4positive = {'fantasy_loc', 'fantasy_oclc'}
            tags4negative = {'random'}

            (metadata, masterdata, classvector, classdictionary, orderedIDs,
             authormatches, vocablist) = get_ratio_data(
                vocabpath, size, ratio, tags4positive, tags4negative,
                excludebelow=0, excludeabove=3000)

            c_range = [.00005, .0003, .001, .004, .012, 0.2, 0.8, 3]
            modelparams = 'logistic', 16, 1600, 6400, 400, c_range

            # write_fullmodel = False forces crossvalidation.
            (matrix, maxaccuracy, metadata, coefficientuples, features4max,
             best_regularization_coef) = versatiletrainer2.tune_a_model(
                metadata, masterdata, classvector, classdictionary,
                orderedIDs, authormatches, vocablist, tags4positive,
                tags4negative, modelparams, name,
                '../measuredivergence/modeloutput/' + name + '.csv',
                write_fullmodel=False)

            with open(datapath, mode='a', encoding='utf-8') as f:
                fields = [name, str(size), str(ratio), str(maxaccuracy),
                          str(features4max), str(best_regularization_coef)]
                f.write('\t'.join(fields) + '\n')
def new_experiment():
    '''
    Trains mixed fantasy/detective models at a range of mixing ratios,
    plus per-iteration gold-standard fantasy and detective models, and
    records a summary row per model in
    ../measuredivergence/results/newexperimentmodels.csv.
    '''

    # The first time I ran this, I used partition 2 to build the
    # mixed data, and partition 1 as a gold standard. Now reversing.

    outmodelpath = '../measuredivergence/results/newexperimentmodels.csv'
    columns = ['name', 'size', 'ratio', 'iteration', 'meandate',
               'maxaccuracy', 'features', 'regularization']

    if not os.path.isfile(outmodelpath):
        with open(outmodelpath, mode='w', encoding='utf-8') as f:
            scribe = csv.DictWriter(f, fieldnames=columns)
            scribe.writeheader()

    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1500
    featureend = 6000
    featurestep = 300
    modelparams = 'logistic', 10, featurestart, featureend, featurestep, c_range
    sizecap = 75

    def train_and_record(sourcefolder, metadatapath, name, tags4positive,
                         tags4negative, ratio, i):
        # Train one model, append its summary row to outmodelpath, and
        # remove the temporary vocabulary file. Extracted because the
        # original repeated this stanza three times verbatim.
        vocabpath = '../lexica/' + name + '.txt'
        floor = 1800
        ceiling = 1930

        metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
            sourcefolder, metadatapath, vocabpath, tags4positive,
            tags4negative, sizecap, excludebelow=floor,
            excludeabove=ceiling, force_even_distribution=False,
            numfeatures=6000)

        matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
            metadata, masterdata, classvector, classdictionary, orderedIDs,
            authormatches, vocablist, tags4positive, tags4negative,
            modelparams, name,
            '../measuredivergence/newmodeloutput/' + name + '.csv')

        meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

        row = dict()
        row['name'] = name
        row['size'] = sizecap
        row['ratio'] = ratio
        row['iteration'] = i
        row['meandate'] = meandate
        row['maxaccuracy'] = maxaccuracy
        row['features'] = features4max
        row['regularization'] = best_regularization_coef

        with open(outmodelpath, mode='a', encoding='utf-8') as f:
            scribe = csv.DictWriter(f, fieldnames=columns)
            scribe.writerow(row)

        os.remove(vocabpath)

    for i in range(3, 6):
        for ratio in [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]:
            train_and_record(
                '../measuredivergence/mix/' + str(ratio) + '/',
                '../measuredivergence/partitionmeta/meta' + str(ratio) + '.csv',
                'mixeddata_' + str(i) + '_' + str(ratio),
                {'fantasy', 'detective'}, {'random'}, ratio, i)

        # Gold-standard models, trained once per iteration on partition 2
        # (note that this is changed if you create mix data with
        # partition 2; it will be the other partition).
        # NOTE(review): the gold rows record the leftover `ratio` value
        # from the loop above, as in the original; confirm this is the
        # intended bookkeeping.
        train_and_record(
            '../data/', '../measuredivergence/partitionmeta/part2.csv',
            'goldfantasy_' + str(i), {'fantasy'}, {'random', 'randomB'},
            ratio, i)

        train_and_record(
            '../data/', '../measuredivergence/partitionmeta/part2.csv',
            'golddetective_' + str(i), {'detective'}, {'random', 'randomB'},
            ratio, i)
def first_experiment():
    '''
    Early exploratory run: fantasy vs SF with up to 200 volumes per
    class, ending in a heatmap of the tuning matrix.

    (Duplicate definition; at import time it shadows the earlier
    first_experiment in this file.)
    '''

    # Inputs.
    sourcefolder = '../data/'
    metadatapath = '../metadata/mastermetadata.csv'
    vocabpath = '../modeloutput/experimentalvocab.txt'
    tags4positive = {'fantasy_loc', 'fantasy_oclc'}
    tags4negative = {'sf_loc', 'sf_oclc'}
    sizecap = 200

    (metadata, masterdata, classvector, classdictionary, orderedIDs,
     authormatches, vocablist) = versatiletrainer2.get_simple_data(
        sourcefolder, metadatapath, vocabpath,
        tags4positive, tags4negative, sizecap)

    # Tuning grid.
    c_range = [.004, .012, 0.3, 0.8, 2]
    featurestart = 3000
    featureend = 4400
    featurestep = 100
    modelparams = 'logistic', 10, featurestart, featureend, featurestep, c_range

    (matrix, maxaccuracy, metadata, coefficientuples, features4max,
     best_regularization_coef) = versatiletrainer2.tune_a_model(
        metadata, masterdata, classvector, classdictionary, orderedIDs,
        authormatches, vocablist, tags4positive, tags4negative,
        modelparams, 'first_experiment',
        '../modeloutput/first_experiment.csv')

    # Show accuracy across the tuning grid.
    plt.rcParams["figure.figsize"] = [9.0, 6.0]
    plt.matshow(matrix, origin='lower', cmap=plt.cm.YlOrRd)
    plt.show()
def vary_sf_ratio_against_random():
    '''
    Models SF against a random contrast set at a series of dilution
    ratios, appending one summary line per model to
    ../measuredivergence/modeldata.tsv.
    '''

    datapath = '../measuredivergence/modeldata.tsv'
    if not os.path.isfile(datapath):
        with open(datapath, mode='w', encoding='utf-8') as f:
            f.write('name\tsize\tratio\taccuracy\tfeatures\tregularization\n')

    size = 80

    for iteration in [5, 6, 7]:
        # The last iteration only runs the zero-dilution condition.
        ceiling = 5 if iteration == 7 else 105

        for pct in range(0, ceiling, 5):
            ratio = pct / 100
            name = 'iter' + str(iteration) + '_size' + str(size) + '_ratio' + str(pct)
            vocabpath = '../measuredivergence/vocabularies/' + name + '.txt'
            tags4positive = {'sf_loc', 'sf_oclc'}
            tags4negative = {'random'}

            (metadata, masterdata, classvector, classdictionary, orderedIDs,
             authormatches, vocablist) = get_ratio_data(
                vocabpath, size, ratio, tags4positive, tags4negative,
                excludebelow=0, excludeabove=3000)

            c_range = [.00005, .0003, .001, .004, .012, 0.2, 0.8]
            modelparams = 'logistic', 16, 1000, 6000, 300, c_range

            # It's important not to write fullmodel if you want the csvs
            # to accurately reflect terrible accuracy on diluted datasets.
            # write_fullmodel = False forces crossvalidation.
            (matrix, maxaccuracy, metadata, coefficientuples, features4max,
             best_regularization_coef) = versatiletrainer2.tune_a_model(
                metadata, masterdata, classvector, classdictionary,
                orderedIDs, authormatches, vocablist, tags4positive,
                tags4negative, modelparams, name,
                '../measuredivergence/modeloutput/' + name + '.csv',
                write_fullmodel=False)

            with open(datapath, mode='a', encoding='utf-8') as f:
                fields = [name, str(size), str(ratio), str(maxaccuracy),
                          str(features4max), str(best_regularization_coef)]
                f.write('\t'.join(fields) + '\n')
def repeatedly_model(modelname, tags4positive, tags4negative, sizecap, sourcefolder, metadatapath):
    """Train the same genre model ten times and log each run's summary.

    Each iteration samples afresh (up to sizecap volumes per class),
    tunes a logistic model, and appends one row to
    ../results/<modelname>_models.tsv recording the observed date range,
    mean publication date, best accuracy, and chosen hyperparameters.

    Parameters are passed straight through to
    versatiletrainer2.get_simple_data; modelname also prefixes output
    file names.
    """
    outmodels = '../results/' + modelname + '_models.tsv'

    if not os.path.isfile(outmodels):
        with open(outmodels, mode='w', encoding='utf-8') as f:
            f.write('name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n')

    # Settings invariant across iterations, hoisted out of the loop.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    modelparams = 'logistic', 10, 200, 5200, 200, c_range
    # Fixed: was `{}`, which is an empty *dict*; an empty set is what's meant.
    forbiddenwords = set()
    # Date bounds used for exclusion; the *observed* range of each sample
    # is computed separately below, so these stay constant.
    floor = 1700
    ceiling = 2020

    for i in range(10):
        name = modelname + str(i)
        vocabpath = '../lexica/' + name + '.txt'

        metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
            sourcefolder, metadatapath, vocabpath, tags4positive,
            tags4negative, sizecap, extension='.fic.tsv',
            excludebelow=floor, excludeabove=ceiling,
            forbid4positive={'juv'}, forbid4negative={'juv'},
            force_even_distribution=False, numfeatures=6000,
            forbiddenwords=forbiddenwords)

        matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
            metadata, masterdata, classvector, classdictionary,
            orderedIDs, authormatches, vocablist,
            tags4positive, tags4negative, modelparams, name,
            '../modeloutput/' + name + '.csv')

        meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))
        # Observed publication range of this iteration's sample; distinct
        # names so the exclusion bounds above are not clobbered (the
        # original reassigned floor/ceiling here and silently reset them
        # at the top of each iteration).
        obs_floor = np.min(metadata.firstpub)
        obs_ceiling = np.max(metadata.firstpub)

        with open(outmodels, mode='a', encoding='utf-8') as f:
            fields = [name, str(sizecap), str(obs_floor), str(obs_ceiling),
                      str(meandate), str(maxaccuracy), str(features4max),
                      str(best_regularization_coef), str(i)]
            f.write('\t'.join(fields) + '\n')

        os.remove(vocabpath)
def create_variant_models(modelname, tags4positive, tags4negative, splityear):
    '''
    Creates variant models that are then used by measure_parallax.

    For each of ten iterations, trains one model on works before
    splityear and one on works from splityear onward, appending a
    summary row per model to ../results/<modelname>_models.tsv and
    writing full model output to ../modeloutput/.
    '''
    outmodels = '../results/' + modelname + '_models.tsv'

    if not os.path.isfile(outmodels):
        with open(outmodels, mode='a', encoding='utf-8') as f:
            f.write('name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n')

    sourcefolder = '../newdata/'
    metadatapath = '../meta/finalmeta.csv'
    sizecap = 75

    # Grid-search settings, invariant across all models.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    modelparams = 'logistic', 15, 1000, 6500, 300, c_range
    # Fixed: was `{}`, an empty *dict*; an empty set is what's meant.
    forbiddenwords = set()
    # Removed unused locals from the original: `columns` (a header list
    # never written anywhere), `names` (appended to, never read), and
    # `master` (a pd.read_csv of finalmeta.csv whose result was never
    # used — a wasted file read each call).

    # Pre-split and post-split periods; endpoints inclusive.
    periods = [(1700, splityear - 1), (splityear, 2010)]

    for i in range(10):
        for floor, ceiling in periods:
            name = modelname + str(floor) + '_' + str(ceiling) + '_' + str(i)
            vocabpath = '../lexica/' + name + '.txt'

            metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                sourcefolder, metadatapath, vocabpath, tags4positive,
                tags4negative, sizecap, extension='.fic.tsv',
                excludebelow=floor, excludeabove=ceiling,
                forbid4positive={'juv'}, forbid4negative={'juv'},
                force_even_distribution=False, numfeatures=6500,
                forbiddenwords=forbiddenwords)

            matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                metadata, masterdata, classvector, classdictionary,
                orderedIDs, authormatches, vocablist, tags4positive,
                tags4negative, modelparams, name,
                '../modeloutput/' + name + '.csv')

            meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

            with open(outmodels, mode='a', encoding='utf-8') as f:
                fields = [name, str(sizecap), str(floor), str(ceiling),
                          str(meandate), str(maxaccuracy), str(features4max),
                          str(best_regularization_coef), str(i)]
                f.write('\t'.join(fields) + '\n')

            os.remove(vocabpath)