Beispiel #1
0
def init_glv():
    ''' Lazily initializes GloVe vectors if they don't already exist.

        Rebinds the module-level GLOVE_MAT and GLOVE_VOCAB with the first
        two values returned by build() for the GloVe text file of the
        configured dimension (_glv_dim) under root_dir/nli-data.  The third
        value returned by build() is discarded.

        Returns None immediately, without reading anything, when
        GLOVE_MAT.any() is already truthy.
    '''
    global GLOVE_MAT, GLOVE_VOCAB
    # NOTE(review): .any() is the "already loaded" test, so GLOVE_MAT must be
    # bound to an object with an .any() method before the first call --
    # presumably an empty/zero numpy array; confirm at the definition site.
    if GLOVE_MAT.any():
        return

    prettyPrint("Building GloVe vectors: ", color.CYAN)
    glove_path = root_dir + '/nli-data/glove.6B.{0}d.txt'.format(_glv_dim)
    GLOVE_MAT, GLOVE_VOCAB, _ = build(glove_path,
                                      delimiter=' ',
                                      header=False,
                                      quoting=csv.QUOTE_NONE)
    prettyPrint("Loaded vectors, dimension {0} ".format(np.shape(GLOVE_MAT)[1]), color.CYAN)
Beispiel #2
0
def run(args):
    ''' Command-line entry point for a single train/evaluate cycle.

        Reads the configuration named by args.conf, trains a model, then
        evaluates it on the 'train_dev' and 'test' data sets in that order.
    '''
    params = set_config(args.conf)
    model, feat_vec, labels = train_model(params)

    # The 'train_dev' evaluation always runs with load_vectors switched on;
    # the configured value is put back before the 'test' evaluation.
    # NOTE(review): presumably this reuses the vectors written during
    # training -- confirm against obtain_vectors.
    configured_load = params['load_vectors']
    params['load_vectors'] = True
    test_model(params, 'train_dev', model, feat_vec, labels)
    params['load_vectors'] = configured_load

    test_model(params, 'test', model, feat_vec, labels)

    separator = "-" * 80
    prettyPrint(separator, color.YELLOW)
Beispiel #3
0
def main(args):
    ''' Runs an ablation study on the features.

        Loads the configuration named by args.conf and builds one deep copy
        of it per entry in params['features'], each tagged with that entry's
        position under the 'index' key.  single_ablation is then mapped over
        the copies using a pool of args.mp worker processes.  The pool is
        closed and joined before returning, even if a job raises.
    '''
    params = set_config(args.conf)

    # Every job gets its own copy, so mutations made by single_ablation in
    # one job cannot leak into another.
    subproc_args = []
    for i, _ in enumerate(params['features']):
        job_params = deepcopy(params)
        job_params['index'] = i
        subproc_args.append(job_params)

    # NOTE(review): multiprocessing.Pool only consumes `initargs` when an
    # `initializer` is supplied, so the lock is not actually handed to the
    # workers by this call.  Left unchanged; confirm how _console_lock is
    # meant to reach the subprocesses.
    proc_pool = Pool(args.mp, initargs=(_console_lock,))
    try:
        proc_pool.map(single_ablation, subproc_args)
    finally:
        # Release the worker processes instead of leaving them to be reaped
        # at interpreter exit.
        proc_pool.close()
        proc_pool.join()
    prettyPrint("-" * 80, color.YELLOW)
Beispiel #4
0
def init_glv():
    ''' Lazily initializes GloVe vectors if they don't already exist.

        Rebinds the module-level GLOVE_MAT and GLOVE_VOCAB with the first
        two values returned by build() for the GloVe text file of the
        configured dimension (_glv_dim) under root_dir/nli-data.  The third
        value returned by build() is discarded.

        Returns None immediately, without reading anything, when
        GLOVE_MAT.any() is already truthy.
    '''
    global GLOVE_MAT, GLOVE_VOCAB
    # NOTE(review): .any() is the "already loaded" test, so GLOVE_MAT must be
    # bound to an object with an .any() method before the first call --
    # presumably an empty/zero numpy array; confirm at the definition site.
    if GLOVE_MAT.any():
        return

    prettyPrint("Building GloVe vectors: ", color.CYAN)
    GLOVE_MAT, GLOVE_VOCAB, _ = build(
        root_dir + '/nli-data/glove.6B.{0}d.txt'.format(_glv_dim),
        delimiter=' ',
        header=False,
        quoting=csv.QUOTE_NONE)
    loaded_dim = np.shape(GLOVE_MAT)[1]
    prettyPrint("Loaded vectors, dimension {0} ".format(loaded_dim),
                color.CYAN)
Beispiel #5
0
def evaluate_model(pipeline=None,
                   reader=sick_dev_reader,
                   features=None,
                   file_name="",
                   load_vec=None):
    """Evaluates the given model on one data split and outputs statistics.

    Args:
        pipeline: object exposing `steps` (a list of (name, step) pairs) and
            `predict`.
        reader: which split to evaluate on, given either as one of the
            sick_*_reader callables or as that callable's name as a string
            (e.g. 'sick_dev_reader').  Anything unrecognised falls back to
            the test split.
        features: forwarded unchanged to obtain_vectors.
        file_name: vector file prefix; the split suffix ('.dev',
            '.train_dev', '.train' or '.test') is appended to it.
        load_vec: forwarded unchanged to obtain_vectors.

    Prints the feature-set size, then a classification report comparing the
    gold labels with the pipeline's predictions.
    """
    # One row per split: (string name, reader callable, display label,
    # file suffix, report color).  Previously the display label was chosen
    # by comparing `reader` with the callables while the reader/suffix were
    # chosen by comparing it with strings, so a string reader was always
    # labelled 'Train' and a callable reader was silently replaced by the
    # test reader.  Both forms now resolve through this single table.
    # NOTE(review): the display labels per split are inferred from the
    # reader names -- confirm they match what each reader actually loads.
    splits = (
        ('sick_dev_reader', sick_dev_reader, 'Dev', '.dev', color.RED),
        ('sick_train_dev_reader', sick_train_dev_reader, 'Train + Dev',
         '.train_dev', color.RED),
        ('sick_train_reader', sick_train_reader, 'Train', '.train',
         color.CYAN),
        ('sick_test_reader', sick_test_reader, 'Test', '.test', color.RED),
    )
    chosen = splits[-1]  # unrecognised readers fall back to the test split
    for entry in splits:
        if reader == entry[0] or reader == entry[1]:
            chosen = entry
            break
    _, reader, reader_name, suffix, prettyColor = chosen
    file_name += suffix

    if len(pipeline.steps) == 2:
        # Only have a vectorizer and a classifier step in pipeline
        dict_vectorizer = pipeline.steps[0][1]
        feature_count = len(dict_vectorizer.feature_names_)
    else:
        # Otherwise the second step is taken to be the feature selector and
        # the size is the number of features it keeps.
        feature_selector = pipeline.steps[1][1]
        feature_count = len(feature_selector.get_support(True))
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('{0} Feature Set Size:  {1}'.format(reader_name, feature_count))

    feat_vec, gold_labels = obtain_vectors(file_name, load_vec, reader,
                                           features)

    predicted_labels = pipeline.predict(feat_vec)
    prettyPrint(
        metrics.classification_report(gold_labels, predicted_labels, digits=5),
        prettyColor)
Beispiel #6
0
def save_vectors(feat_vec=None, labels=None, file_extension=None):
    """ Saves the feature vectors and classification labels under the given file extension.

        feat_vec is pickled to output/<file_extension>.feature and labels to
        output/<file_extension>.label.  file_extension must be a string.
    """

    feat_file_name = 'output/' + file_extension + '.feature'
    label_file_name = 'output/' + file_extension + '.label'

    prettyPrint(
        'Saving feature vector file: {0} ... \n'
        'Saving Labels file: {1} ... '.format(feat_file_name, label_file_name),
        color.CYAN)

    # Pickle streams are bytes, so the files are opened in binary mode.
    # Text mode can corrupt the stream through newline translation on some
    # platforms and is rejected outright by Python 3's pickle.
    for path, payload in ((feat_file_name, feat_vec),
                          (label_file_name, labels)):
        with open(path, 'wb') as f:
            pickle.dump(payload, f)
Beispiel #7
0
def train_model(params):
    ''' Builds and tunes the model described by params, reporting progress
        and elapsed time along the way.

        Returns a (tuned model, feature vectors, labels) tuple that can be
        handed straight to the evaluation step. '''
    # 'lsa' compression is requested only when plotting is switched on.
    if params['plot']:
        compression = 'lsa'
    else:
        compression = None

    banner = "-" * 80 + "\nTraining model '{0}' ... ".format(params['model'])
    prettyPrint(banner, color.YELLOW)
    prettyPrint("With features: {0}".format(params['features']), color.YELLOW)

    start_train = time.time()

    build_kwargs = dict(
        clf=params['model'],
        train_reader=sick_train_reader,
        features=params['features'],
        file_name=params['feature_file'] + ".train_dev",
        load_vec=params['load_vectors'],
        feature_selector=SelectKBest(chi2, k='all'),
        compression=compression)
    model, feat_vec, labels = build_model(**build_kwargs)

    best_model = parameter_tune(
        params['model'], model, feat_vec, labels, grid=params['param_grid'])

    elapsed = time.time() - start_train
    prettyPrint("Finished training.  Took {0:.2f} seconds".format(elapsed),
                color.RED)

    return best_model, feat_vec, labels
Beispiel #8
0
def set_config(config_file):
    ''' Sets the configuration file.  Returns a parameter hash.

        Each non-blank line is split on runs of spaces, commas, colons and
        semicolons; the first token is the key.  The value is:
          * the evaluated Python expression following the first ':' when the
            key is 'param_grid';
          * the list of remaining tokens when there is more than one of
            them, or when the key is 'features';
          * the single remaining token otherwise ('' if there is none).
        'load_vectors' and 'plot' are finally converted to booleans: True
        only when the configured value is the string 'true' (any case).

        The result is a collections.defaultdict(list), so looking up a key
        that was never configured yields an empty list.
    '''
    params = collections.defaultdict(list)
    # Loads the parameters set in the conf file into the returned dict.
    with open(config_file, 'r') as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                continue  # blank lines carry no key
            # '+' rather than '*': the pattern must not match the empty
            # string, otherwise Python 3.7+ splits between every character.
            kv = re.split(r'[ ,:;]+', stripped)
            key = kv[0]

            if key == 'param_grid':  # Need to re-parse the expression
                # Eval to allow for numpy definitions in the config file.
                # SECURITY: this executes arbitrary code from the config
                # file -- only load configuration files you trust.
                val = eval(':'.join(line.split(':')[1:]).strip())
            elif len(kv) > 2 or key == 'features':
                val = kv[1:]
            else:
                # A bare key with no value becomes an empty string.
                val = kv[1] if len(kv) > 1 else ''

            params[key] = val

    # Special-case parsing of arguments
    for arg in ('load_vectors', 'plot'):
        raw = params[arg]
        params[arg] = bool(raw) and raw.lower() == 'true'
    prettyPrint('{0}'.format(params), color.YELLOW)
    prettyPrint('Configuration file used: ' + config_file, color.YELLOW)

    return params
Beispiel #9
0
def set_config(config_file):
    ''' Sets the configuration file.  Returns a parameter hash.

        Each non-blank line is split on runs of spaces, commas, colons and
        semicolons; the first token is the key.  The value is:
          * the evaluated Python expression following the first ':' when the
            key is 'param_grid';
          * the list of remaining tokens when there is more than one of
            them, or when the key is 'features';
          * the single remaining token otherwise ('' if there is none).
        'load_vectors' and 'plot' are finally converted to booleans: True
        only when the configured value is the string 'true' (any case).

        The result is a collections.defaultdict(list), so looking up a key
        that was never configured yields an empty list.
    '''
    params = collections.defaultdict(list)
    # Loads the parameters set in the conf file into the returned dict.
    with open(config_file, 'r') as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                continue  # blank lines carry no key
            # '+' rather than '*': the pattern must not match the empty
            # string, otherwise Python 3.7+ splits between every character.
            kv = re.split(r'[ ,:;]+', stripped)
            key = kv[0]

            if key == 'param_grid':  # Need to re-parse the expression
                # Eval to allow for numpy definitions in the config file.
                # SECURITY: this executes arbitrary code from the config
                # file -- only load configuration files you trust.
                val = eval(':'.join(line.split(':')[1:]).strip())
            elif len(kv) > 2 or key == 'features':
                val = kv[1:]
            else:
                # A bare key with no value becomes an empty string.
                val = kv[1] if len(kv) > 1 else ''

            params[key] = val

    # Special-case parsing of arguments
    for arg in ('load_vectors', 'plot'):
        raw = params[arg]
        params[arg] = bool(raw) and raw.lower() == 'true'
    prettyPrint('{0}'.format(params), color.YELLOW)
    prettyPrint('Configuration file used: ' + config_file, color.YELLOW)

    return params
Beispiel #10
0
def single_ablation(params):
    ''' Called by a subprocess: trains and evaluates one ablation job.

        params must carry an 'index' key selecting one entry of
        params['features'].  The job trains on 'word_overlap' plus that
        feature and evaluates on the 'dev' data set.  While the job runs,
        stdout is redirected by pipe_stdout to a file named after
        params['feature_file'] ('wo+<feature>').

        params is modified in place ('feature_file' and 'features' are
        overwritten).
    '''
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print(params)
    index = params['index']

    feature = params['features'][index]
    params['feature_file'] = 'wo+' + feature
    params['features'] = ['word_overlap', feature]

    # params['param_grid']['feature_selector__k'] = ['all'] # Use all features

    with _console_lock:
        prettyPrint("Starting job for word overlap + {0}".format(feature), color.YELLOW)

    old_stdout = pipe_stdout(params['feature_file'])
    try:
        results = train_model(params)
        test_model(params, 'dev', *results)
    finally:
        # Always put the real stdout back and close the per-job log, even
        # when training or evaluation raises.
        job_log = sys.stdout
        sys.stdout = old_stdout
        if job_log is not old_stdout:
            job_log.close()

    with _console_lock:
        # color.RED belongs to prettyPrint; it used to be passed to
        # str.format by a misplaced parenthesis and was silently ignored.
        prettyPrint("Done with job for feature set {0}".format(params['feature_file']), color.RED)
Beispiel #11
0
def test_model(params=None, data_set='test', best_model=None, feat_vec=None, labels=None):
    ''' Evaluates a trained model on the named data set.  When
        params['plot'] is set and data_set is not 'train', the decision
        boundary is plotted instead and no evaluation is run. '''
    wants_plot = params['plot'] and data_set != 'train'

    if not wants_plot:
        prettyPrint("Testing on data set: {0}".format(data_set), color.YELLOW)

        # evaluate_model receives the reader by name, not as a callable.
        reader_name = 'sick_{0}_reader'.format(data_set)
        evaluate_model(best_model,
                       reader=reader_name,
                       features=params['features'],
                       file_name=params['feature_file'],
                       load_vec=params['load_vectors'])

        prettyPrint("Finished training and evaluating model\n" + "-" * 80, color.YELLOW)
        return

    prettyPrint("Generating decision boundary graph ...", color.YELLOW)

    # The vectors to plot are always read through sick_dev_reader; only the
    # file name varies with data_set.
    filename = params['feature_file'] + '.{0}'.format(data_set)
    feat_vec, labels = obtain_vectors(file_extension=filename,
                                      load_vec=params['load_vectors'],
                                      reader=sick_dev_reader,
                                      features=params['features'])
    bp.plot_boundary(best_model, feat_vec, labels)
    prettyPrint("Saved in output/foo.png\n" + "-" * 80, color.YELLOW)
Beispiel #12
0
def train_model(params):
    ''' Builds the model named in params, tunes it over params['param_grid']
        and reports how long that took.

        Returns the tuned model together with the feature vectors and
        labels it was built from, ready for evaluation. '''
    model_name = params['model']
    # 'lsa' compression is requested only when plotting is switched on.
    compression = None
    if params['plot']:
        compression = 'lsa'

    header = "-" * 80 + "\nTraining model '{0}' ... ".format(params['model'])
    prettyPrint(header, color.YELLOW)
    prettyPrint("With features: {0}".format(params['features']), color.YELLOW)

    start_train = time.time()
    built = build_model(
        clf=model_name,
        train_reader=sick_train_reader,
        features=params['features'],
        file_name=params['feature_file'] + ".train_dev",
        load_vec=params['load_vectors'],
        feature_selector=SelectKBest(chi2, k='all'),
        compression=compression,
    )
    model, feat_vec, labels = built

    best_model = parameter_tune(params['model'], model, feat_vec, labels,
                                grid=params['param_grid'])

    end_train = time.time()
    took = end_train - start_train
    prettyPrint("Finished training.  Took {0:.2f} seconds".format(took), color.RED)

    return best_model, feat_vec, labels
Beispiel #13
0
def load_vectors(file_extension=None):
    """ Loads the feature vector and classification labels from the
        canonical output files in output.  If the file does not exist,
        the load is aborted.

        Reads output/<file_extension>.feature and
        output/<file_extension>.label.  Returns a (feat_vec, labels) tuple,
        or (None, None) when either file is missing.
    """

    feat_file_name = 'output/' + file_extension + '.feature'
    label_file_name = 'output/' + file_extension + '.label'

    prettyPrint("Loading feature vectors and labels from disk ... ",
                color.CYAN)
    if not (os.path.isfile(feat_file_name)
            and os.path.isfile(label_file_name)):
        prettyPrint(
            "Feature vector files {0} could not be found.  Generating from scratch instead ..."
            .format(feat_file_name), color.CYAN)
        return None, None

    # Pickle streams are bytes, so the files are opened in binary mode.
    # Text mode can corrupt the stream through newline translation on some
    # platforms and is rejected outright by Python 3's pickle.
    # NOTE: unpickling executes code embedded in the file -- only load
    # files this project wrote itself.
    with open(feat_file_name, 'rb') as f:
        feat_vec = pickle.load(f)
    with open(label_file_name, 'rb') as f:
        labels = pickle.load(f)

    prettyPrint("Done loading feature vectors.", color.CYAN)
    return feat_vec, labels
Beispiel #14
0
    sys.stdout = old_stdout

    _console_lock.acquire()
    prettyPrint("Done with job for feature set {0}".format(params['feature_file'], color.RED))
    _console_lock.release()
        
def pipe_stdout(file_name):
    ''' Redirects sys.stdout into output/<file_name> (opened in 'w+' mode)
        and hands back the stream that was installed before the call, so
        the caller can restore it afterwards. '''
    previous_stream, sys.stdout = sys.stdout, open('output/' + file_name, 'w+')
    return previous_stream


if __name__ == '__main__':
    # The description has to be passed by keyword: ArgumentParser's first
    # positional parameter is `prog`, so the text used to replace the
    # program name in usage messages instead of describing the tool.
    parser = ArgumentParser(description='provide arguments for running model pipeline')
    parser.add_argument('--conf', help='name of configuration file ')
    parser.add_argument('--mp', help='Number of processes to spawn')
    arguments = parser.parse_args()

    # Accept 1..8 worker processes.  Anything else -- missing, non-numeric,
    # or out of range -- falls back to a single process with a warning.
    try:
        num_procs = int(arguments.mp)
    except (TypeError, ValueError):
        num_procs = 0
    if not 1 <= num_procs <= 8:
        prettyPrint("Valid multiprocessing argument not found: defaulting to single process", color.YELLOW)
        num_procs = 1
    arguments.mp = num_procs
    main(arguments)