def project_tags(tagstoproject, tagtargets):
    """Model ``tagtargets`` on their own, then predict them from ``tagstoproject``.

    Two models are built through ``logisticpredict.create_model``:

    1. a model whose positive class is ``tagtargets`` alone;
    2. a model whose positive class is ``tagtargets`` plus ``tagstoproject``,
       with ``tagtargets`` supplied as the test conditions.

    The raw accuracy of each model is printed, and the two output paths are
    finally handed to ``comparemodels.compare_untrained``.

    Args:
        tagstoproject: Iterable of tag strings used to predict the targets.
        tagtargets: Iterable of tag strings being modeled.
    """
    target_label = ','.join(tagtargets)
    project_label = ','.join(tagstoproject)

    # Settings shared by both models.
    size_limit = 400
    contrast_tags = ['random', 'chirandom']

    def fit(paths, exclusions, positives, held_out):
        """Run one model and return only its raw accuracy."""
        conditions = (positives, contrast_tags, "firstpub", 10000, .000075,
                      held_out)
        accuracy, _, _ = logisticpredict.create_model(paths, exclusions,
                                                      conditions)
        return accuracy

    # ---- Model 1: the target tags by themselves -------------------------
    print('First we create a model of ' + target_label)

    solo_paths = make_paths(target_label + 'byitself')
    _, _, _, outputpath1, _ = solo_paths

    # Original author's note: tagstoproject is excluded from the negative
    # contrast set here so that both models end up with identical contrast
    # sets.
    solo_exclusions = make_exclusions(0, 2000, size_limit, tagstoproject)

    solo_accuracy = fit(solo_paths, solo_exclusions, tagtargets, set())
    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
          str(solo_accuracy))
    print()

    # ---- Model 2: projected tags predicting the target tags -------------
    print('Then we create a model of ' + project_label
          + ' and use it to predict ' + target_label)

    joint_paths = make_paths(project_label + 'predicts' + target_label)
    _, _, _, outputpath2, _ = joint_paths
    joint_exclusions = make_exclusions(0, 2000, size_limit, 'nonegatives')

    joint_positives = list(tagtargets)
    joint_positives.extend(tagstoproject)

    # Original author's note: passing the target tags as test conditions is
    # what actually keeps them out of training.
    held_out_tags = set(tagtargets)

    joint_accuracy = fit(joint_paths, joint_exclusions, joint_positives,
                         held_out_tags)
    print('If we divide the second dataset at 0.5, accuracy is: ',
          str(joint_accuracy))
    print()

    # Compare the two models' predictions, restricted (per the original
    # comment) to volumes present in both models but left out of training
    # in the second one.
    comparemodels.compare_untrained(outputpath1, outputpath2)
def project_gothic_beyond_date(dividedate):
    """Compare a gothic model built after ``dividedate`` with a blind projection.

    The first model is built with exclusions
    ``make_exclusions(dividedate, 2000, ...)`` and no test conditions. The
    second uses ``make_exclusions(0, 2001, ...)`` and passes
    ``{'1700', str(dividedate)}`` as test conditions. Each model's raw
    accuracy is printed, then both output paths go to
    ``comparemodels.compare_untrained``.

    Args:
        dividedate: The dividing date; it is stringified for messages, model
            names and the test conditions, and passed as-is to
            ``make_exclusions`` for the first model.
    """
    date_label = str(dividedate)

    # Settings shared by both models.
    size_limit = 300
    gothic_tags = ['lochorror', 'pbgothic', 'locghost', 'stangothic',
                   'chihorror']
    contrast_tags = ['random', 'chirandom']

    def fit(paths, exclusions, held_out):
        """Run one model and return only its raw accuracy."""
        conditions = (gothic_tags, contrast_tags, "firstpub", 10000,
                      .000075, held_out)
        accuracy, _, _ = logisticpredict.create_model(paths, exclusions,
                                                      conditions)
        return accuracy

    # ---- Model 1: only the period after the dividing date ---------------
    print('First we create a model of gothic fiction only after ' + date_label)

    late_paths = make_paths('gothicjustpost' + date_label)
    _, _, _, outputpath1, _ = late_paths
    late_exclusions = make_exclusions(dividedate, 2000, size_limit,
                                      'nonegatives')

    late_accuracy = fit(late_paths, late_exclusions, set())
    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
          str(late_accuracy))
    print()

    # ---- Model 2: blind prediction past the dividing date ---------------
    print('Then we create a model of gothic fiction blindly predicting after '
          + date_label)

    blind_paths = make_paths('gothicpredictpost' + date_label)
    _, _, _, outputpath2, _ = blind_paths
    blind_exclusions = make_exclusions(0, 2001, size_limit, 'nonegatives')

    blind_accuracy = fit(blind_paths, blind_exclusions, {'1700', date_label})
    print('If we divide the second dataset at 0.5, accuracy is: ',
          str(blind_accuracy))
    print()

    # Compare the two models' predictions, restricted (per the original
    # comment) to volumes present in both models but left out of training
    # in the second one.
    comparemodels.compare_untrained(outputpath1, outputpath2)
def compare(dividedate):
    """Train two gender models and compare their predictions.

    Both models use ``['f']`` as positive tags and ``['m']`` as negative
    tags. The first is built with ``firstpub`` bounds of 1900
    (``excludebelow``) and 1950 (``excludeabove``) and a size cap of 500.
    The second uses bounds of 1780 and 2000, a size cap of 1000, and test
    conditions ``{'1700', '1880'}``. Each model's raw accuracy is printed and
    the two result files are passed to ``comparemodels.compare_untrained``.

    This file previously defined ``compare`` twice with identical bodies;
    the later definition simply rebound the name, so a single definition is
    kept here.

    NOTE(review): ``dividedate`` only affects the printed messages and the
    output file names. The date windows above are hard-coded, so the
    messages are accurate only for a matching argument -- confirm whether
    the windows should be derived from ``dividedate``.

    Args:
        dividedate: Value stringified into progress messages and model
            (output file) names.
    """
    date_label = str(dividedate)

    # Input and output locations are hard-coded to the original author's
    # machine.
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    results_root = '/Users/tunder/Dropbox/character/results/'

    # Classification settings shared by both models.
    positive_tags = ['f']
    negative_tags = ['m']
    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    # ---- Model 1 --------------------------------------------------------
    print('First we create a model of gender only after ' + date_label)

    outputpath1 = (results_root + 'post' + date_label
                   + str(datetime.date.today()) + '.csv')
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    # Tuple layout: (excludeif, excludeifnot, excludebelow, excludeabove,
    # sizecap). Each model gets its own dicts so that configuring the second
    # model never mutates objects already handed to the first.
    exclusions = ({}, {}, {'firstpub': 1900}, {'firstpub': 1950}, 500)

    classifyconditions = (positive_tags, negative_tags, datetype,
                          numfeatures, regularization, set())
    rawaccuracy, _, _ = logisticpredict.create_model(paths, exclusions,
                                                     classifyconditions)
    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
          str(rawaccuracy))
    print()

    # ---- Model 2 --------------------------------------------------------
    # The message previously said "detective fiction", a leftover from
    # another experiment; this function models gender.
    print('Then we create a model of gender blindly predicting after '
          + date_label)

    outputpath2 = (results_root + 'predictpost' + date_label
                   + str(datetime.date.today()) + '.csv')
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)
    exclusions = ({}, {}, {'firstpub': 1780}, {'firstpub': 2000}, 1000)

    # Date bounds are given as strings, matching how the sibling
    # project_gothic_beyond_date() passes them; the original mixed the
    # string '1700' with the integer 1880 in one set.
    testconditions = {'1700', '1880'}

    classifyconditions = (positive_tags, negative_tags, datetype,
                          numfeatures, regularization, testconditions)
    rawaccuracy, _, _ = logisticpredict.create_model(paths, exclusions,
                                                     classifyconditions)
    print('If we divide the second dataset at 0.5, accuracy is: ',
          str(rawaccuracy))
    print()

    # Compare the two models' predictions, restricted (per the original
    # comment) to volumes present in both models but left out of training
    # in the second one.
    comparemodels.compare_untrained(outputpath1, outputpath2)