def init_glv():
    """Lazily build the module-level GloVe matrix and vocabulary.

    Returns immediately when ``GLOVE_MAT.any()`` is truthy. Otherwise the
    vectors file for dimension ``_glv_dim`` under ``root_dir`` is handed to
    ``build`` and the first two values it returns are stored in the globals
    ``GLOVE_MAT`` and ``GLOVE_VOCAB``; the third value is discarded.
    """
    global GLOVE_MAT, GLOVE_VOCAB

    already_built = GLOVE_MAT.any()
    if already_built:
        return

    prettyPrint("Building GloVe vectors: ", color.CYAN)

    glove_path = root_dir + '/nli-data/glove.6B.{0}d.txt'.format(_glv_dim)
    GLOVE_MAT, GLOVE_VOCAB, _ = build(
        glove_path,
        delimiter=' ',
        header=False,
        quoting=csv.QUOTE_NONE)

    # Report the second axis of the freshly loaded matrix.
    loaded_dim = np.shape(GLOVE_MAT)[1]
    prettyPrint("Loaded vectors, dimension {0} ".format(loaded_dim), color.CYAN)
def run(args):
    """Train once, then evaluate on 'train_dev' and on 'test'.

    ``args.conf`` is the configuration file path passed to ``set_config``.
    ``params['load_vectors']`` is forced to True for the 'train_dev'
    evaluation only and put back to its configured value before the 'test'
    evaluation.
    """
    params = set_config(args.conf)
    model, feat_vec, labels = train_model(params)

    # Temporarily override the flag, remembering what the config asked for.
    configured_load = params['load_vectors']
    params['load_vectors'] = True
    test_model(params, 'train_dev', model, feat_vec, labels)

    params['load_vectors'] = configured_load
    test_model(params, 'test', model, feat_vec, labels)

    separator = "-" * 80
    prettyPrint(separator, color.YELLOW)
def main(args):
    """Run an ablation study with one worker job per configured feature.

    The configuration named by ``args.conf`` is loaded once. For every entry
    of ``params['features']`` an independent deep copy of the parameters is
    made and tagged with that entry's position under the key ``'index'``.
    The copies are mapped over ``single_ablation`` in a pool of ``args.mp``
    processes.
    """
    params = set_config(args.conf)

    # Each job gets its own copy because single_ablation mutates its params.
    jobs = []
    for index in range(len(params['features'])):
        job = deepcopy(params)
        job['index'] = index
        jobs.append(job)

    # NOTE(review): Pool only consumes ``initargs`` when an ``initializer`` is
    # also supplied, so this argument currently has no effect. It is kept so
    # the pool is constructed exactly as before.
    proc_pool = Pool(args.mp, initargs=(_console_lock,))
    try:
        proc_pool.map(single_ablation, jobs)
    finally:
        # Release the worker processes instead of leaving them to be
        # reclaimed at interpreter exit.
        proc_pool.close()
        proc_pool.join()

    prettyPrint("-" * 80, color.YELLOW)
def init_glv():
    """Populate the global GloVe matrix and vocabulary on first use.

    When ``GLOVE_MAT.any()`` is falsy, the file
    ``root_dir + '/nli-data/glove.6B.<_glv_dim>d.txt'`` is passed to
    ``build`` and the globals ``GLOVE_MAT`` / ``GLOVE_VOCAB`` are rebound to
    the first two values it returns. Otherwise the function does nothing.
    """
    global GLOVE_MAT, GLOVE_VOCAB

    if not GLOVE_MAT.any():
        prettyPrint("Building GloVe vectors: ", color.CYAN)

        vectors_file = root_dir + '/nli-data/glove.6B.{0}d.txt'.format(_glv_dim)
        # The third value returned by build is not needed here.
        GLOVE_MAT, GLOVE_VOCAB, _ = build(vectors_file,
                                          delimiter=' ',
                                          header=False,
                                          quoting=csv.QUOTE_NONE)

        summary = "Loaded vectors, dimension {0} ".format(np.shape(GLOVE_MAT)[1])
        prettyPrint(summary, color.CYAN)
def evaluate_model(pipeline=None, reader=sick_dev_reader, features=None,
                   file_name="", load_vec=None):
    """Evaluate a fitted pipeline on one data split and print a report.

    Prints the size of the feature set, obtains feature vectors and gold
    labels through ``obtain_vectors``, predicts with ``pipeline`` and prints
    ``metrics.classification_report`` (5 digits) through ``prettyPrint``.

    Args:
        pipeline: Fitted pipeline exposing ``steps`` and ``predict``.
        reader: Split selector. The string names ``'sick_dev_reader'``,
            ``'sick_train_dev_reader'`` and ``'sick_train_reader'`` select
            the matching reader and file suffix; any other value selects
            ``sick_test_reader`` with the ``.test`` suffix.
        features: Forwarded unchanged to ``obtain_vectors``.
        file_name: Base name; the split suffix is appended to it.
        load_vec: Forwarded unchanged to ``obtain_vectors``.
    """
    # The label accepts either the reader callable or its string name.
    # Callers such as test_model pass the string form, which previously
    # matched none of the callable comparisons and was always labelled
    # 'Train'.
    if reader in (sick_dev_reader, 'sick_dev_reader'):
        reader_name = 'Dev'
    elif reader in (sick_train_reader, 'sick_train_reader'):
        reader_name = 'Train + Dev'
    elif reader in (sick_test_reader, 'sick_test_reader'):
        reader_name = 'Test'
    else:
        reader_name = 'Train'

    if len(pipeline.steps) == 2:
        # Only a vectorizer and a classifier step are in the pipeline.
        dict_vectorizer = pipeline.steps[0][1]
        feature_count = len(dict_vectorizer.feature_names_)
    else:
        # Otherwise the second step is treated as the feature selector and
        # the indices of the features it keeps are counted.
        feature_selector = pipeline.steps[1][1]
        feature_count = len(feature_selector.get_support(True))
    # Joined by hand so the output is the same under the Python 2 print
    # statement and the Python 3 print function.
    print(' '.join([reader_name + ' Feature Set Size: ', str(feature_count)]))

    # NOTE(review): only string names are recognised below, so a reader
    # passed as a callable (including the default) is evaluated on the test
    # split. Confirm whether that is intended.
    prettyColor = color.RED
    if reader == 'sick_dev_reader':
        reader = sick_dev_reader
        file_name += ".dev"
    elif reader == 'sick_train_dev_reader':
        reader = sick_train_dev_reader
        file_name += ".train_dev"
    elif reader == 'sick_train_reader':
        reader = sick_train_reader
        file_name += ".train"
        prettyColor = color.CYAN
    else:
        reader = sick_test_reader
        file_name += ".test"

    feat_vec, gold_labels = obtain_vectors(file_name, load_vec, reader, features)
    predicted_labels = pipeline.predict(feat_vec)
    prettyPrint(
        metrics.classification_report(gold_labels, predicted_labels, digits=5),
        prettyColor)
def save_vectors(feat_vec=None, labels=None, file_extension=None):
    """Pickle the feature vectors and labels under ``output/``.

    Args:
        feat_vec: Object written to ``output/<file_extension>.feature``.
        labels: Object written to ``output/<file_extension>.label``.
        file_extension: Base name shared by both output files.
    """
    feat_file_name = 'output/' + file_extension + '.feature'
    label_file_name = 'output/' + file_extension + '.label'
    prettyPrint(
        'Saving feature vector file: {0} ... \n'
        'Saving Labels file: {1} ... '.format(feat_file_name, label_file_name),
        color.CYAN)

    # Pickle data is a byte stream: binary mode avoids newline translation
    # and is required by pickle on Python 3.
    targets = ((feat_file_name, feat_vec), (label_file_name, labels))
    for path, payload in targets:
        with open(path, 'wb') as out_file:
            pickle.dump(payload, out_file)
def train_model(params):
    """Build and tune the configured model, reporting progress and timing.

    Args:
        params: Parameter mapping; the keys 'plot', 'model', 'features',
            'feature_file', 'load_vectors' and 'param_grid' are read.

    Returns:
        A ``(best_model, feat_vec, labels)`` tuple, where ``best_model`` is
        the result of ``parameter_tune`` and the other two come from
        ``build_model``.
    """
    # LSA compression is requested only when plotting is enabled.
    if params['plot']:
        compression = 'lsa'
    else:
        compression = None

    banner = "-" * 80 + "\nTraining model '{0}' ... ".format(params['model'])
    prettyPrint(banner, color.YELLOW)
    prettyPrint("With features: {0}".format(params['features']), color.YELLOW)

    start_train = time.time()

    vector_file = params['feature_file'] + ".train_dev"
    selector = SelectKBest(chi2, k='all')
    model, feat_vec, labels = build_model(clf=params['model'],
                                          train_reader=sick_train_reader,
                                          features=params['features'],
                                          file_name=vector_file,
                                          load_vec=params['load_vectors'],
                                          feature_selector=selector,
                                          compression=compression)
    best_model = parameter_tune(params['model'], model, feat_vec, labels,
                                grid=params['param_grid'])

    elapsed = time.time() - start_train
    prettyPrint("Finished training. Took {0:.2f} seconds".format(elapsed),
                color.RED)
    return best_model, feat_vec, labels
def set_config(config_file):
    """Parse a configuration file into a parameter mapping.

    Each non-blank line is split on runs of spaces, commas, colons and
    semicolons. The first token is the key. The value is:

    * the evaluated expression after the first ':' for ``param_grid``;
    * the list of remaining tokens for ``features`` or when more than one
      value token is present;
    * the single value token otherwise;
    * an empty list when the line carries a key and nothing else.

    ``load_vectors`` and ``plot`` are then converted to booleans (True only
    for the case-insensitive string 'true').

    Args:
        config_file: Path of the configuration file.

    Returns:
        A ``collections.defaultdict(list)`` of the parsed parameters.
    """
    params = collections.defaultdict(list)

    with open(config_file, 'r') as f:
        for line in f:
            if not line.strip():
                # Blank lines carry no key; skip them instead of failing.
                continue

            # '+' rather than '*': the separator pattern must never match the
            # empty string, which newer ``re`` versions would split on.
            kv = re.split(r'[ ,:;]+', line.rstrip())
            key = kv[0]

            if key == 'param_grid':
                # SECURITY: eval executes arbitrary code from the config file
                # (it allows numpy expressions there). Only load trusted
                # configuration files.
                val = eval(':'.join(line.split(':')[1:]).strip())
            elif len(kv) > 2 or key == 'features':
                val = kv[1:]
            elif len(kv) == 2:
                val = kv[1]
            else:
                # Key without a value: same as an unset defaultdict entry.
                val = []
            params[key] = val

    # Special-case parsing of boolean arguments.
    for arg in ('load_vectors', 'plot'):
        raw = params[arg]
        params[arg] = bool(raw) and raw.lower() == 'true'

    prettyPrint('{0}'.format(params), color.YELLOW)
    prettyPrint('Configuration file used: ' + config_file, color.YELLOW)
    return params
def set_config(config_file):
    """Read ``config_file`` and return its settings as a parameter mapping.

    Lines are tokenised on runs of the characters space, ',', ':' and ';'.
    The first token names the parameter. ``features`` and any parameter with
    several value tokens map to a list of tokens; a parameter with exactly
    one value token maps to that string; a parameter with no value maps to
    an empty list. ``param_grid`` is special: everything after the first ':'
    on its line is evaluated as a Python expression.

    After loading, ``load_vectors`` and ``plot`` are normalised to booleans
    and are True only when their value is the string 'true' in any casing.

    Args:
        config_file: Path of the configuration file.

    Returns:
        A ``collections.defaultdict(list)`` holding the parsed parameters.
    """
    params = collections.defaultdict(list)

    with open(config_file, 'r') as f:
        for line in f:
            # A blank line has no key and used to raise IndexError.
            if not line.strip():
                continue

            # The quantifier is '+' so the pattern cannot match an empty
            # string; with '*' newer ``re`` versions split between every
            # character.
            kv = re.split(r'[ ,:;]+', line.rstrip())
            name, values = kv[0], kv[1:]

            if name == 'param_grid':
                # SECURITY: the expression is evaluated with eval so that
                # numpy definitions can be used; never point this at an
                # untrusted configuration file.
                expression = ':'.join(line.split(':')[1:]).strip()
                params[name] = eval(expression)
            elif name == 'features' or len(values) != 1:
                # Multi-valued entry, or a bare key (empty list).
                params[name] = values
            else:
                params[name] = values[0]

    # Special-case parsing of boolean arguments.
    for arg in ('load_vectors', 'plot'):
        raw = params[arg]
        params[arg] = bool(raw) and raw.lower() == 'true'

    prettyPrint('{0}'.format(params), color.YELLOW)
    prettyPrint('Configuration file used: ' + config_file, color.YELLOW)
    return params
def single_ablation(params):
    """Train and evaluate 'word_overlap' plus one feature (pool worker job).

    ``params['index']`` selects the feature from ``params['features']``. The
    mapping is then modified in place: 'feature_file' becomes
    ``'wo+' + feature`` and 'features' becomes ``['word_overlap', feature]``.
    While the model is trained and tested on 'dev', standard output is
    redirected through ``pipe_stdout`` to a file named after 'feature_file'.

    Args:
        params: Parameter mapping for this job; mutated as described above.
    """
    print(params)
    index = params['index']
    feature = params['features'][index]
    params['feature_file'] = 'wo+' + feature
    params['features'] = ['word_overlap', feature]
    # params['param_grid']['feature_selector__k'] = ['all'] # Use all features

    # The lock serialises console output; ``with`` releases it even if
    # printing raises.
    with _console_lock:
        prettyPrint("Starting job for word overlap + {0}".format(feature),
                    color.YELLOW)

    old_stdout = pipe_stdout(params['feature_file'])
    log_file = sys.stdout  # the stream pipe_stdout just installed
    try:
        results = train_model(params)
        test_model(params, 'dev', *results)
    finally:
        # Always restore the console and close the redirect target, so a
        # failing job neither leaks the file nor leaves stdout redirected.
        sys.stdout = old_stdout
        if log_file is not old_stdout:
            log_file.close()

    with _console_lock:
        # color.RED belongs to prettyPrint; it used to be passed to
        # str.format by a misplaced parenthesis.
        prettyPrint(
            "Done with job for feature set {0}".format(params['feature_file']),
            color.RED)
def test_model(params=None, data_set='test', best_model=None, feat_vec=None,
               labels=None):
    """Evaluate a trained model on ``data_set``, or plot it instead.

    When ``params['plot']`` is truthy and ``data_set`` is not 'train', the
    vectors for ``<feature_file>.<data_set>`` are obtained with
    ``sick_dev_reader`` and handed to ``bp.plot_boundary``; the function
    then returns a separator string. In every other case the model is
    passed to ``evaluate_model`` with the reader name
    ``'sick_<data_set>_reader'`` and nothing is returned.
    """
    wants_plot = params['plot'] and data_set != 'train'

    if not wants_plot:
        prettyPrint("Testing on data set: {0}".format(data_set), color.YELLOW)
        reader_name = 'sick_{0}_reader'.format(data_set)
        evaluate_model(best_model,
                       reader=reader_name,
                       features=params['features'],
                       file_name=params['feature_file'],
                       load_vec=params['load_vectors'])
        prettyPrint("Finished training and evaluating model\n" + "-" * 80,
                    color.YELLOW)
        return None

    prettyPrint("Generating decision boundary graph ...", color.YELLOW)
    filename = params['feature_file'] + '.{0}'.format(data_set)
    # The incoming feat_vec / labels arguments are replaced by the vectors
    # obtained for this split.
    feat_vec, labels = obtain_vectors(file_extension=filename,
                                      load_vec=params['load_vectors'],
                                      reader=sick_dev_reader,
                                      features=params['features'])
    bp.plot_boundary(best_model, feat_vec, labels)
    prettyPrint("Saved in output/foo.png\n" + "-" * 80, color.YELLOW)
    return '-----------------'
def train_model(params):
    """Train the configured model with progress output.

    Reads 'plot', 'model', 'features', 'feature_file', 'load_vectors' and
    'param_grid' from ``params``. The model comes from ``build_model`` and
    is then tuned with ``parameter_tune``.

    Returns:
        The tuple ``(best_model, feat_vec, labels)``, ready for evaluation.
    """
    # 'lsa' compression accompanies plotting; otherwise none is applied.
    compression = None
    if params['plot']:
        compression = 'lsa'

    header = "-" * 80 + "\nTraining model '{0}' ... ".format(params['model'])
    prettyPrint(header, color.YELLOW)
    prettyPrint("With features: {0}".format(params['features']), color.YELLOW)

    start_train = time.time()
    built = build_model(
        clf=params['model'],
        train_reader=sick_train_reader,
        features=params['features'],
        file_name=params['feature_file'] + ".train_dev",
        load_vec=params['load_vectors'],
        feature_selector=SelectKBest(chi2, k='all'),
        compression=compression)
    model, feat_vec, labels = built

    best_model = parameter_tune(params['model'], model, feat_vec, labels,
                                grid=params['param_grid'])
    end_train = time.time()

    duration = end_train - start_train
    prettyPrint("Finished training. Took {0:.2f} seconds".format(duration),
                color.RED)
    return best_model, feat_vec, labels
def load_vectors(file_extension=None):
    """Load pickled feature vectors and labels from ``output/``.

    Args:
        file_extension: Base name; ``output/<file_extension>.feature`` and
            ``output/<file_extension>.label`` are read.

    Returns:
        ``(feat_vec, labels)`` as unpickled from the two files, or
        ``(None, None)`` when either file does not exist.
    """
    feat_file_name = 'output/' + file_extension + '.feature'
    label_file_name = 'output/' + file_extension + '.label'

    prettyPrint("Loading feature vectors and labels from disk ... ", color.CYAN)

    both_present = (os.path.isfile(feat_file_name)
                    and os.path.isfile(label_file_name))
    if not both_present:
        prettyPrint(
            "Feature vector files {0} could not be found. Generating from scratch instead ..."
            .format(feat_file_name), color.CYAN)
        return None, None

    # Pickle data is a byte stream: binary mode avoids newline translation
    # and is required by pickle on Python 3.
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(feat_file_name, 'rb') as f:
        feat_vec = pickle.load(f)
    with open(label_file_name, 'rb') as f:
        labels = pickle.load(f)

    prettyPrint("Done loading feature vectors.", color.CYAN)
    return feat_vec, labels
sys.stdout = old_stdout _console_lock.acquire() prettyPrint("Done with job for feature set {0}".format(params['feature_file'], color.RED)) _console_lock.release() def pipe_stdout(file_name): ''' Pipes stdout to a file. Returns the old file stream. ''' old_stdout = sys.stdout sys.stdout = open('output/' + file_name, 'w+') return old_stdout if __name__ == '__main__': parser = ArgumentParser('description = provide arguments for running model pipeline') parser.add_argument('--conf', help = 'name of configuration file ') parser.add_argument('--mp', help = 'Number of processes to spawn') arguments = parser.parse_args() if not arguments.mp or int(arguments.mp) < 1 or int(arguments.mp) > 8: prettyPrint("Valid multiprocessing argument not found: defaulting to single process", color.YELLOW) arguments.mp = 1 arguments.mp = int(arguments.mp) main(arguments)