Example #1
def collect_gcnn(name, most_similar_name, perc):
    gcnn_results = []

    data = dataTools.Authorship(name, ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0}'.format(name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    phi, indices = load_phi(name, data, percentage=perc)

    if indices.shape[0] < 2:
        # Not enough selected features to train on; return a zeroed metrics row.
        return [{'acc': 0, 'f1': 0, 'auc': 0, 'prec': 0, 'time': 0}]

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, ratioTrain, ratioValid)

        data.reduce_dim(indices)

        start = time.perf_counter()

        gcnn = train_helper.train_net(data, h_params, phi=phi)

        end = time.perf_counter()

        gcnn_eval = evaluate_gcnn(gcnn, data)
        gcnn_eval['time'] = end - start  # elapsed training time in seconds
        gcnn_results.append(gcnn_eval)

        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
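
collect_gcnn returns one metrics dict per split, so a caller typically averages them. A minimal sketch of that aggregation, assuming each dict carries the keys seen above; summarize_splits is a hypothetical helper, not part of the original code:

from statistics import mean

def summarize_splits(results):
    # Average each metric across the per-split dicts returned by collect_gcnn.
    keys = ['acc', 'f1', 'auc', 'prec', 'time']
    return {key: mean(r[key] for r in results) for key in keys}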
Example #2
def collect_gcnn(name, most_similar_name, phi, ind):
    gcnn_results = []

    data = dataTools.AuthorshipOneVsOne(name, most_similar_name, ratioTrain,
                                        ratioValid, dataPath)
    logging.info('Training GCNN on {0} vs. {1}'.format(
        name, most_similar_name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, most_similar_name, ratioTrain, ratioValid)

        data.reduce_dim(ind)

        start = time.perf_counter()

        gcnn = train_helper.train_net(data, h_params, phi=phi)

        end = time.perf_counter()

        gcnn_eval = evaluate_gcnn(gcnn, data)
        gcnn_eval['time'] = end - start  # elapsed training time in seconds
        gcnn_results.append(gcnn_eval)

        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
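
Examples #1 and #2 both time training with two bare perf_counter calls and a manual subtraction. A small context manager keeps that bookkeeping in one place; this Timer is a hypothetical stdlib-only helper, not part of the original code:

import time

class Timer:
    # Context manager that records elapsed wall-clock seconds in .elapsed.
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.elapsed = time.perf_counter() - self.start

Wrapping the train_net call in `with Timer() as t:` would then let the loop set gcnn_eval['time'] = t.elapsed.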
Example #3
def collect_gcnn(name, most_similar_name):
    gcnn_results = []

    data = dataTools.AuthorshipOneVsOne(name, most_similar_name, ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0} vs. {1}'.format(name, most_similar_name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, most_similar_name, ratioTrain, ratioValid)
        gcnn = train_helper.train_net(data, h_params)

        gcnn_results.append(evaluate_gcnn(gcnn, data))

        logging.info('SPLIT {0}: GCNN results successfully collected: {1}'.format(split_n, gcnn_results[split_n]))

    return gcnn_results
Example #4
def delete_active_author(name, active_file, signal, frame):
    # Cleanup handler: removes `name` from the active-authors file. The extra
    # (signal, frame) parameters let it double as a signal handler.
    print('EXIT FUNCTION')
    ClusterUtils.delete_from_active(active_file, name)
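
Example #5 below registers this same handler twice: once with atexit for normal interpreter shutdown, and once per catchable signal via functools.partial.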
Example #5
# randomStates[0]['state'] = numpyState
# randomStates.append({})
# randomStates[1]['module'] = 'torch'
# randomStates[1]['state'] = torchState
# randomStates[1]['seed'] = torchSeed
#   This list of dictionaries follows the format expected by the loadSeed
#   function in Utils.miscTools, so the saved state can be reloaded later.
# saveSeed(randomStates, saveDir)

########
# DATA #
########
# find the next author
combinations = [1, 3, 5, 7, 9]

authorName = ClusterUtils.get_author_name(ACTIVE_AUTHORS_FILE, BASE_FILE_NAME,
                                          combinations)

try:
    atexit.register(delete_active_author, authorName, ACTIVE_AUTHORS_FILE,
                    None, None)

    for sig in signal.Signals:
        try:
            # signal.signal(sig, test_fn)
            signal.signal(
                sig,
                partial(delete_active_author, authorName, ACTIVE_AUTHORS_FILE))
        except (ValueError, OSError):
            print('invalid: ' + str(sig))

    # signal.signal(signal.SIGHUP, partial(delete_active_author, authorName, ACTIVE_AUTHORS_FILE))
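
The loop above tries to install the same cleanup handler for every signal the platform defines and skips the ones that cannot be caught (ValueError for invalid entries, OSError for e.g. SIGKILL). A minimal standalone sketch of that pattern; cleanup is a hypothetical stand-in for delete_active_author:

import signal
from functools import partial

def cleanup(label, signum, frame):
    # partial() pre-binds `label`; the signal machinery supplies (signum, frame).
    print('cleaning up {0} on signal {1}'.format(label, signum))

for sig in signal.Signals:
    try:
        signal.signal(sig, partial(cleanup, 'job-42'))
    except (ValueError, OSError):
        pass  # uncatchable (e.g. SIGKILL) or unsupported on this platform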
Example #6
randomStates[0]['state'] = numpyState
randomStates.append({})
randomStates[1]['module'] = 'torch'
randomStates[1]['state'] = torchState
randomStates[1]['seed'] = torchSeed
#   This list of dictionaries follows the format expected by the loadSeed
#   function in Utils.miscTools, so the saved state can be reloaded later.
saveSeed(randomStates, saveDir)
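# NOTE (sketch, not from the original snippet): numpyState, torchState and
# torchSeed used above would have been captured beforehand, presumably via
#   numpyState = np.random.get_state()
#   torchState = torch.get_rng_state()
#   torchSeed = torch.initial_seed()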

########
# DATA #
########
# find the next author

authorName = ClusterUtils.get_author_name(ACTIVE_AUTHORS_FILE,
                                          BASE_FILE_NAME, [],
                                          hyperparams_path=HYPER_PARAM_FILE)
if authorName is None:
    exit()
# Possible authors: (just use the names in ' ')
# jacob 'abbott', robert louis 'stevenson', louisa may 'alcott',
# horatio 'alger', james 'allen', jane 'austen', emily 'bronte', james 'cooper',
# charles 'dickens', hamlin 'garland', nathaniel 'hawthorne', henry 'james',
# herman 'melville', 'page', henry 'thoreau', mark 'twain',
# arthur conan 'doyle', washington 'irving', edgar allan 'poe',
# sarah orne 'jewett', edith 'wharton'

try:
    atexit.register(delete_active_author, authorName, ACTIVE_AUTHORS_FILE,
                    None, None)
Example #7
    file_name = "{0}{1}.txt".format(BASE_FILE_NAME, authorName)

    # create an empty file so that other jobs skip this author
    with open(file_name, mode='w+') as f:
        pass

    # Possible authors: (just use the names in ' ')
    # jacob 'abbott', robert louis 'stevenson', louisa may 'alcott',
    # horatio 'alger', james 'allen', jane 'austen', emily 'bronte', james 'cooper',
    # charles 'dickens', hamlin 'garland', nathaniel 'hawthorne', henry 'james',
    # herman 'melville', 'page', henry 'thoreau', mark 'twain',
    # arthur conan 'doyle', washington 'irving', edgar allan 'poe',
    # sarah orne 'jewett', edith 'wharton'

    nFeatures, nShifts = ClusterUtils.load_best_hyperparams(authorName)

    if doPrint:
        print('Author: {0}, Combination: {1}'.format(authorName, str((nFeatures, nShifts))))

    # set training params
    nClasses = 1  # binary task: authorName vs. everyone else (one output unit)
    ratioTrain = 0.6  # Ratio of training samples
    ratioValid = 0.2  # Ratio of validation samples (out of the total
                      # training samples)
    # Final split is:
    #   nValidation = round(ratioValid * ratioTrain * nTotal)
    #   nTrain = round((1 - ratioValid) * ratioTrain * nTotal)
    #   nTest = nTotal - nTrain - nValidation
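    # For example, with nTotal = 1000, ratioTrain = 0.6 and ratioValid = 0.2:
    #   nValidation = round(0.2 * 0.6 * 1000) = 120
    #   nTrain = round(0.8 * 0.6 * 1000) = 480
    #   nTest = 1000 - 480 - 120 = 400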

    nDataSplits = 7  # Number of data realizations