Beispiel #1
0
def project_tags(tagstoproject, tagtargets):
    """Model the target tags on their own, then predict them from other tags.

    Two models are built with ``logisticpredict.create_model`` and their
    output files are handed to ``comparemodels.compare_untrained``:

    1. ``tagtargets`` is the positive class; ``tagstoproject`` is passed to
       ``make_exclusions``.
    2. ``tagtargets`` plus ``tagstoproject`` form the positive class, and
       ``tagtargets`` is passed as the set of test conditions.

    Args:
        tagstoproject: iterable of tag strings used to build the second model.
        tagtargets: iterable of tag strings that both models are judged on.
    """
    target_label = ','.join(tagtargets)
    project_label = ','.join(tagstoproject)

    print(f'First we create a model of {target_label}')

    # Settings shared by both models.
    max_size = 400
    contrast_tags = ['random', 'chirandom']
    date_field = "firstpub"
    feature_count = 10000
    penalty = .000075

    # ---- Model 1: the target tags by themselves. ----
    solo_paths = make_paths(target_label + 'byitself')
    _, _, _, outputpath1, _ = solo_paths

    # Note that we exclude tagstoproject from the negative contrast set, so
    # the contrast sets for the two models will be identical.
    solo_exclusions = make_exclusions(0, 2000, max_size, tagstoproject)

    solo_conditions = (
        tagtargets,
        contrast_tags,
        date_field,
        feature_count,
        penalty,
        set(),
    )

    accuracy, _volumes, _coefficients = logisticpredict.create_model(
        solo_paths, solo_exclusions, solo_conditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(accuracy))
    print()
    print(f'Then we create a model of {project_label} and use it to predict {target_label}')

    # ---- Model 2: projected tags predicting the target tags. ----
    projected_paths = make_paths(project_label + 'predicts' + target_label)
    _, _, _, outputpath2, _ = projected_paths

    projected_exclusions = make_exclusions(0, 2000, max_size, 'nonegatives')

    combined_tags = [*tagtargets, *tagstoproject]
    # Passing the target tags as test conditions is what actually excludes
    # them from training.
    held_out = set(tagtargets)

    projected_conditions = (
        combined_tags,
        contrast_tags,
        date_field,
        feature_count,
        penalty,
        held_out,
    )

    accuracy, _volumes, _coefficients = logisticpredict.create_model(
        projected_paths, projected_exclusions, projected_conditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(accuracy))
    print()

    # Compare the predictions of the two models, looking only at the volumes
    # that are in both models but excluded from the training process in the
    # second model.
    comparemodels.compare_untrained(outputpath1, outputpath2)
Beispiel #2
0
def project_gothic_beyond_date(dividedate):
    """Compare a post-``dividedate`` gothic model with a blind prediction.

    Two models are built with ``logisticpredict.create_model`` over the same
    positive and negative tag lists, and their output files are handed to
    ``comparemodels.compare_untrained``:

    1. ``dividedate`` is passed to ``make_exclusions`` as its first argument
       and the test-condition set is empty.
    2. ``make_exclusions`` is called with 0 and 2001, and the test conditions
       are the pair of strings ``'1700'`` and ``str(dividedate)``.

    Args:
        dividedate: the dividing year; it is stringified for messages, model
            names and the second model's test conditions.
    """
    date_label = str(dividedate)

    print('First we create a model of gothic fiction only after ' + date_label)

    # Settings shared by both models.
    max_size = 300
    gothic_tags = ['lochorror', 'pbgothic', 'locghost', 'stangothic', 'chihorror']
    contrast_tags = ['random', 'chirandom']
    date_field = "firstpub"
    feature_count = 10000
    penalty = .000075

    # ---- Model 1: gothic fiction after the dividing date only. ----
    later_paths = make_paths('gothicjustpost' + date_label)
    _, _, _, outputpath1, _ = later_paths

    later_exclusions = make_exclusions(dividedate, 2000, max_size, 'nonegatives')

    later_conditions = (
        gothic_tags,
        contrast_tags,
        date_field,
        feature_count,
        penalty,
        set(),
    )

    accuracy, _volumes, _coefficients = logisticpredict.create_model(
        later_paths, later_exclusions, later_conditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(accuracy))
    print()
    print('Then we create a model of gothic fiction blindly predicting after ' + date_label)

    # ---- Model 2: blind prediction past the dividing date. ----
    blind_paths = make_paths('gothicpredictpost' + date_label)
    _, _, _, outputpath2, _ = blind_paths

    blind_exclusions = make_exclusions(0, 2001, max_size, 'nonegatives')

    blind_conditions = (
        gothic_tags,
        contrast_tags,
        date_field,
        feature_count,
        penalty,
        {'1700', date_label},
    )

    accuracy, _volumes, _coefficients = logisticpredict.create_model(
        blind_paths, blind_exclusions, blind_conditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(accuracy))
    print()

    # Compare the predictions of the two models, looking only at the volumes
    # that are in both models but excluded from the training process in the
    # second model.
    comparemodels.compare_untrained(outputpath1, outputpath2)
def compare(dividedate):
    """Build two gender models and compare them on volumes unseen in training.

    Both models use ``'f'`` as the positive tag and ``'m'`` as the negative
    tag and are built with ``logisticpredict.create_model``. The two output
    files are then handed to ``comparemodels.compare_untrained``.

    Args:
        dividedate: the dividing year. It is stringified for the progress
            messages, the output file names, and the second model's test
            conditions.
    """
    date_label = str(dividedate)

    print('First we create a model of gender only after ' + date_label)

    sizecap = 500

    modelname = 'post' + date_label
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    outputpath1 = '/Users/tunder/Dropbox/character/results/' + modelname + str(
        datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    ## EXCLUSIONS.

    # NOTE(review): the first model's 'firstpub' window is fixed at 1900-1950
    # and does not depend on dividedate, although the message and the file
    # name above mention dividedate. Left unchanged; confirm this is intended.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1900
    excludeabove['firstpub'] = 1950
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    positive_tags = ['f']
    negative_tags = ['m']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, _allvolumes, _coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print(
        'If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
        str(rawaccuracy))
    print()
    # This is a gender model; the message previously said "detective fiction".
    print(
        'Then we create a model of gender blindly predicting after '
        + date_label)

    modelname = 'predictpost' + date_label
    outputpath2 = '/Users/tunder/Dropbox/character/results/' + modelname + str(
        datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)

    # The same dict objects are updated in place for the second model.
    excludebelow['firstpub'] = 1780
    excludeabove['firstpub'] = 2000
    sizecap = 1000
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    # The date pair follows dividedate and both entries are strings, so the
    # set is homogeneous. It previously mixed the string '1700' with the
    # hard-coded integer 1880, ignoring the argument.
    testconditions = {'1700', date_label}

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, _allvolumes, _coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ',
          str(rawaccuracy))
    print()

    # Now we compare the predictions made by these two models, comparing only
    # the volumes that are in both models but excluded from the training process
    # in the second model.

    comparemodels.compare_untrained(outputpath1, outputpath2)
def compare(dividedate):
    """Build two gender models and compare them on volumes unseen in training.

    Both models use ``'f'`` as the positive tag and ``'m'`` as the negative
    tag and are built with ``logisticpredict.create_model``. The two output
    files are then handed to ``comparemodels.compare_untrained``.

    Args:
        dividedate: the dividing year. It is stringified for the progress
            messages, the output file names, and the second model's test
            conditions.
    """
    date_label = str(dividedate)

    print('First we create a model of gender only after ' + date_label)

    sizecap = 500

    modelname = 'post' + date_label
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    outputpath1 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    ## EXCLUSIONS.

    # NOTE(review): the first model's 'firstpub' window is fixed at 1900-1950
    # and does not depend on dividedate, although the message and the file
    # name above mention dividedate. Left unchanged; confirm this is intended.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1900
    excludeabove['firstpub'] = 1950
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    positive_tags = ['f']
    negative_tags = ['m']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    rawaccuracy, _allvolumes, _coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()
    # This is a gender model; the message previously said "detective fiction".
    print('Then we create a model of gender blindly predicting after ' + date_label)

    modelname = 'predictpost' + date_label
    outputpath2 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)

    # The same dict objects are updated in place for the second model.
    excludebelow['firstpub'] = 1780
    excludeabove['firstpub'] = 2000
    sizecap = 1000
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    # The date pair follows dividedate and both entries are strings, so the
    # set is homogeneous. It previously mixed the string '1700' with the
    # hard-coded integer 1880, ignoring the argument.
    testconditions = {'1700', date_label}

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    rawaccuracy, _allvolumes, _coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Now we compare the predictions made by these two models, comparing only
    # the volumes that are in both models but excluded from the training process
    # in the second model.

    comparemodels.compare_untrained(outputpath1, outputpath2)