import os

import train  # assumed companion module providing tune_a_model() and diachronic_tilt()


def genre_gridsearch(modelname, c_range, ftstart, ftend, ftstep, positive_tags = ['elite'], negative_tags = ['vulgar'], excl_below = 1700, excl_above = 2000):
    # Does a gridsearch to identify an optimal number of features and an optimal
    # setting of the regularization constant; then produces that model.

    sourcefolder = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/fromEF/'
    extension = '.tsv'
    #metadatapath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/prestigeficmeta.csv'
    metadatapath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/littlemagazines.csv'
    vocabpath = '/Users/tunder/Dropbox/fiction/lexicon/' + modelname + '.txt'
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/' + modelname + '.csv'

    # We can simply exclude volumes from consideration on the basis of any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = excl_below
    excludeabove['firstpub'] = excl_above
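
    # Illustrative only (hypothetical metadata fields, not settings from the
    # original experiments): the other two dictionaries work the same way, e.g.
    #   excludeif['language'] = 'ger'    # drop volumes whose 'language' field is 'ger'
    #   excludeifnot['nation'] = 'us'    # keep only volumes whose 'nation' field is 'us'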

    sizecap = 700

    # CLASSIFY CONDITIONS

    # print()
    # print("You can also specify positive tags to be excluded from training, and/or a pair")
    # print("of integer dates outside of which vols should be excluded from training.")
    # print("If you add 'donotmatch' to the list of tags, these volumes will not be")
    # print("matched with corresponding negative volumes.")
    # print()
    # testphrase = input("Comma-separated list of such tags: ")
    testphrase = ''
    testconditions = set([x.strip() for x in testphrase.split(',') if len(x) > 0])
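    # E.g. testphrase = 'donotmatch, 1900, 1930' would yield the set
    # {'donotmatch', '1900', '1930'}: a tag plus a pair of integer dates, as
    # described in the commented-out prompt above.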

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    modelparams = ('logistic', 12, ftstart, ftend, ftstep, c_range)

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(paths, exclusions, classifyconditions, modelparams)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    tiltaccuracy = train.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
Example #2
def genre_gridsearch(metadatapath, modelname, c_range, ftstart, ftend, ftstep, positive_tags = ['fic'], negative_tags = ['bio'], excl_below = 1700, excl_above = 2000):

    # Does a gridsearch to identify an optimal number of features and an optimal
    # setting of the regularization constant; then produces that model.

    sourcefolder = '../sourcefiles/'
    extension = '.tsv'

    vocabpath = '../lexicons/' + modelname + '.txt'
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = '../modeloutput/' + modelname + '.csv'

    # We can simply exclude volumes from consideration on the basis of any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = excl_below
    excludeabove['firstpub'] = excl_above

    sizecap = 75

    # CLASSIFY CONDITIONS

    # print()
    # print("You can also specify positive tags to be excluded from training, and/or a pair")
    # print("of integer dates outside of which vols should be excluded from training.")
    # print("If you add 'donotmatch' to the list of tags, these volumes will not be")
    # print("matched with corresponding negative volumes.")
    # print()
    # testphrase = input("Comma-separated list of such tags: ")
    testphrase = ''
    testconditions = set([x.strip() for x in testphrase.split(',') if len(x) > 0])

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    modelparams = ('logistic', 12, ftstart, ftend, ftstep, c_range)

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(paths, exclusions, classifyconditions, modelparams)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    tiltaccuracy = train.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
Example #3
def gridsearch_a_model(metadatapath,
                       sourcefolder,
                       c_range,
                       ftstart,
                       ftend,
                       ftstep,
                       positive_tags=['f'],
                       negative_tags=['m']):
    ''' Does a gridsearch to identify an optimal number of features and an
    optimal setting of the regularization constant; then produces that model.
    Note that we do not use this for models of specific decades, only for
    initial model selection.'''

    modelname = metadatapath.replace('.//models/', '').replace('_meta.csv', '')
    extension = '.tsv'
    vocabpath = metadatapath.replace('_meta', '_vocab')
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = metadatapath.replace('_meta', '')
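
    # Path convention (illustrative): given metadatapath = './/models/sf_meta.csv',
    # the replace() calls above yield
    #   modelname  = 'sf'
    #   vocabpath  = './/models/sf_vocab.csv'
    #   outputpath = './/models/sf.csv'
    # where 'sf' is a hypothetical model name, not one from the original project.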

    ## EXCLUSIONS. # not used in this project

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    sizecap = 2000

    # CLASSIFY CONDITIONS # not used in this project

    testconditions = set()

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    modelparams = ('logistic', 12, ftstart, ftend, ftstep, c_range)

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(
        paths, exclusions, classifyconditions, modelparams)

    print(
        'If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
        str(rawaccuracy))
    tiltaccuracy = train.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ",
          str(tiltaccuracy))
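
# A usage sketch for the function above; the metadata path, source folder, and
# grid values are illustrative placeholders.
if __name__ == '__main__':
    gridsearch_a_model('.//models/sf_meta.csv', '../sourcefiles/',
                       c_range=[.0001, .0003, .001],
                       ftstart=1000, ftend=3000, ftstep=500)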
Example #4
def genre_gridsearch(modelname, c_range, ftstart, ftend, ftstep, positive_tags, negative_tags = ['random', 'grandom', 'chirandom'], excl_below = 1700, excl_above = 2000, metadatapath = '../metadata/concatenatedmeta.csv'):

    '''
    This is at present the workhorse function of the script. It does a
    gridsearch to identify an optimal number of features and an optimal
    setting of the regularization constant. Then it produces that model
    and writes it to disk.
    '''

    #sourcefolder = '/Users/tunder/Dropbox/fiction/newtsvs'
    sourcefolder = '../sourcefiles'

    extension = '.tsv'
    vocabpath = '../lexicons/' + modelname + '.txt'
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = '../modeloutput/' + modelname + '.csv'

    # We can simply exclude volumes from consideration on the basis of any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = excl_below
    excludeabove['firstpub'] = excl_above

    sizecap = 400

    # CLASSIFY CONDITIONS

    testphrase = ''
    testconditions = set([x.strip() for x in testphrase.split(',') if len(x) > 0])

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    modelparams = ('logistic', 10, ftstart, ftend, ftstep, c_range)

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(paths, exclusions, classifyconditions, modelparams)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))