def collect_gcnn(name, most_similar_name, perc):
    """Train and evaluate the GCNN for `name` over NO_OF_SPLITS splits,
    reducing the data to the node indices returned by load_phi.
    `most_similar_name` is unused in this variant."""
    gcnn_results = []
    data = dataTools.Authorship(name, ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0}'.format(name))
    h_params = ClusterUtils.load_best_hyperparams(name)
    phi, indices = load_phi(name, data, percentage=perc)

    # Too few nodes survive the thresholding to build a graph.
    if indices.shape[0] < 2:
        return [{'acc': 0, 'f1': 0, 'auc': 0, 'prec': 0, 'time': 0}]

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, ratioTrain, ratioValid)
        data.reduce_dim(indices)

        start = time.perf_counter()
        gcnn = train_helper.train_net(data, h_params, phi=phi)
        end = time.perf_counter()

        gcnn_eval = evaluate_gcnn(gcnn, data)
        gcnn_eval['time'] = end - start  # elapsed training time in seconds
        gcnn_results.append(gcnn_eval)

        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
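
# `load_phi` is not defined in this excerpt. A rough sketch of one way it
# might behave, assuming `phi` is a precomputed graph support stored on
# disk and `percentage` controls how many of the best-connected nodes are
# kept (the file layout, the percentile rule, and all identifiers below
# are hypothetical, not the original implementation):
import numpy as np

def load_phi_sketch(name, data, percentage):
    phi = np.load('phi_{0}.npy'.format(name))      # hypothetical file layout
    degree = phi.sum(axis=0)                       # node connectivity
    threshold = np.percentile(degree, 100 - percentage)
    indices = np.nonzero(degree >= threshold)[0]   # surviving node indices
    phi = phi[np.ix_(indices, indices)]            # restrict the support
    return phi, indices                            # indices may be tiny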
def collect_gcnn(name, most_similar_name, phi, ind):
    """One-vs-one variant: train `name` against `most_similar_name`,
    using a precomputed `phi` and node indices `ind`."""
    gcnn_results = []
    data = dataTools.AuthorshipOneVsOne(name, most_similar_name,
                                        ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0} vs. {1}'.format(
        name, most_similar_name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, most_similar_name, ratioTrain, ratioValid)
        data.reduce_dim(ind)

        start = time.perf_counter()
        gcnn = train_helper.train_net(data, h_params, phi=phi)
        end = time.perf_counter()

        gcnn_eval = evaluate_gcnn(gcnn, data)
        gcnn_eval['time'] = end - start  # elapsed training time in seconds
        gcnn_results.append(gcnn_eval)

        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
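
# `evaluate_gcnn` is also not defined here; the metric keys it must return
# ('acc', 'f1', 'auc', 'prec') can be read off the degenerate result in the
# first collector above. A minimal sketch, assuming the trained model
# exposes a score-returning `predict` and the data object a `getSamples`
# accessor (both names are assumptions):
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             roc_auc_score)

def evaluate_gcnn_sketch(gcnn, data):
    x_test, y_test = data.getSamples('test')  # hypothetical accessor
    scores = gcnn.predict(x_test)             # assumed scores in [0, 1]
    y_pred = (scores > 0.5).astype(int)       # binary decision threshold
    return {
        'acc': accuracy_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, scores),
        'prec': precision_score(y_test, y_pred),
    }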
def collect_gcnn(name, most_similar_name):
    """One-vs-one variant without precomputed `phi` or dimension reduction."""
    gcnn_results = []
    data = dataTools.AuthorshipOneVsOne(name, most_similar_name,
                                        ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0} vs. {1}'.format(
        name, most_similar_name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, most_similar_name, ratioTrain, ratioValid)
        gcnn = train_helper.train_net(data, h_params)
        gcnn_results.append(evaluate_gcnn(gcnn, data))
        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
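
# Hypothetical usage: each collector returns one metrics dict per split,
# which a caller might summarize like this (not part of the original
# script; the author pair is taken from the list further below):
import numpy as np

results = collect_gcnn('austen', 'bronte')
for metric in ('acc', 'f1', 'auc', 'prec'):
    values = [r[metric] for r in results]
    print('{0}: {1:.3f} +/- {2:.3f}'.format(metric, np.mean(values),
                                            np.std(values)))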
def delete_active_author(name, active_file, signal, frame):
    """Cleanup hook: remove `name` from the active-authors file.
    `signal` and `frame` are accepted so the same function can be
    registered both with atexit and as a signal handler."""
    print('Exiting: releasing author {0}'.format(name))
    ClusterUtils.delete_from_active(active_file, name)
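
# The blocks below store RNG states in `randomStates`, but the capture of
# numpyState, torchState and torchSeed is not shown in this excerpt. It
# presumably happens earlier in the script roughly as follows (assumed
# reconstruction, not the original code):
import numpy as np
import torch

numpyState = np.random.get_state()  # full numpy RNG state
torchState = torch.get_rng_state()  # full torch RNG state
torchSeed = torch.initial_seed()    # seed torch's default RNG started from

randomStates = []
randomStates.append({})
randomStates[0]['module'] = 'numpy'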
# randomStates[0]['state'] = numpyState
# randomStates.append({})
# randomStates[1]['module'] = 'torch'
# randomStates[1]['state'] = torchState
# randomStates[1]['seed'] = torchSeed
# This list of dictionaries follows the format expected by the loadSeed
# function in Utils.miscTools, so the states can be reloaded if needed.
# saveSeed(randomStates, saveDir)

########
# DATA #
########

# Find the next author to process.
combinations = [1, 3, 5, 7, 9]
authorName = ClusterUtils.get_author_name(ACTIVE_AUTHORS_FILE, BASE_FILE_NAME,
                                          combinations)

try:
    atexit.register(delete_active_author, authorName, ACTIVE_AUTHORS_FILE,
                    None, None)

    # Register the cleanup handler for every signal this platform allows,
    # so the claimed author is released even if the job is killed.
    for sig in signal.Signals:
        try:
            signal.signal(
                sig,
                partial(delete_active_author, authorName,
                        ACTIVE_AUTHORS_FILE))
        except (ValueError, OSError):
            print('cannot install handler for: ' + str(sig))
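
# ClusterUtils is not included in this excerpt. One plausible reading of
# the claim/release protocol used above, sketched under the assumption
# that the active file holds one claimed author name per line (the real
# implementation may differ, and a robust version would need file locking
# to avoid races between concurrent jobs):
import os

def get_author_name_sketch(active_file, base_file_name, combinations,
                           **kwargs):
    """Return the first author no other job has claimed, or None."""
    all_authors = ['abbott', 'stevenson', 'alcott']  # abbreviated list
    claimed = set()
    if os.path.exists(active_file):
        with open(active_file) as f:
            claimed = {line.strip() for line in f}
    for author in all_authors:
        # A results file left by a finished job also counts as claimed.
        done = os.path.exists('{0}{1}.txt'.format(base_file_name, author))
        if author not in claimed and not done:
            with open(active_file, 'a') as f:
                f.write(author + '\n')  # claim the author
            return author
    return None

def delete_from_active_sketch(active_file, name):
    """Release `name` so another job can claim it."""
    with open(active_file) as f:
        lines = [l for l in f if l.strip() != name]
    with open(active_file, 'w') as f:
        f.writelines(lines)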
randomStates[0]['state'] = numpyState
randomStates.append({})
randomStates[1]['module'] = 'torch'
randomStates[1]['state'] = torchState
randomStates[1]['seed'] = torchSeed
# This list of dictionaries follows the format expected by the loadSeed
# function in Utils.miscTools, so the states can be reloaded if needed.
saveSeed(randomStates, saveDir)

########
# DATA #
########

# Find the next author to process; exit if none remain.
authorName = ClusterUtils.get_author_name(ACTIVE_AUTHORS_FILE, BASE_FILE_NAME,
                                          [], hyperparams_path=HYPER_PARAM_FILE)
if authorName is None:
    exit()

# Possible authors (use the names in quotes):
# jacob 'abbott', robert louis 'stevenson', louisa may 'alcott',
# horatio 'alger', james 'allen', jane 'austen', emily 'bronte',
# james 'cooper', charles 'dickens', hamlin 'garland',
# nathaniel 'hawthorne', henry 'james', herman 'melville', 'page',
# henry 'thoreau', mark 'twain', arthur conan 'doyle',
# washington 'irving', edgar allan 'poe', sarah orne 'jewett',
# edith 'wharton'

try:
    atexit.register(delete_active_author, authorName, ACTIVE_AUTHORS_FILE,
                    None, None)
file_name = "{0}{1}.txt".format(BASE_FILE_NAME, authorName) # create empty files so that other jobs would skip this author with open(file_name, mode='w+') as f: pass # Possible authors: (just use the names in ' ') # jacob 'abbott', robert louis 'stevenson', louisa may 'alcott', # horatio 'alger', james 'allen', jane 'austen', emily 'bronte', james 'cooper', # charles 'dickens', hamlin 'garland', nathaniel 'hawthorne', henry 'james', # herman 'melville', 'page', herny 'thoreau', mark 'twain', # arthur conan 'doyle', washington 'irving', edgar allan 'poe', # sarah orne 'jewett', edith 'wharton' nFeatures, nShifts = ClusterUtils.load_best_hyperparams(authorName) if doPrint: print('Author: {0}, Combination: {1}'.format(authorName, str((nFeatures, nShifts)))) # set training params nClasses = 1 # Either authorName or not ratioTrain = 0.6 # Ratio of training samples ratioValid = 0.2 # Ratio of validation samples (out of the total training # samples) # Final split is: # nValidation = round(ratioValid * ratioTrain * nTotal) # nTrain = round((1 - ratioValid) * ratioTrain * nTotal) # nTest = nTotal - nTrain - nValidation nDataSplits = 7 # Number of data realizations