Example #1
def load_features_for_all_tasks():
	# log_dir, tasks, saves_directory and num_datapoints are module-level
	# globals in the original script
	# log_file = log_dir + 'loading_all_tasks_to_save_fid.txt'
	log_file = log_dir + 'loading_full_task_1_to_save_fid.txt'
	load_logger = logger(log_file, {'MISSION': 'Save fid for all tasks'})
	for task_id in range(1, 2):  # only task 1 for now; widen the range for all tasks
		load_logger.log('----------- TASK ' + str(task_id) + ' -----------')
		task = tasks[task_id]
		prefix = 'task_' + str(task_id)
		X, y = load_features_and_save_id(task, load_logger)
		util.save_matrices_to_disk(
			X, y, [0.2, 0.2], saves_directory, prefix, num_datapoints)
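
The helper util.save_matrices_to_disk is used in every example but never shown. As a minimal sketch of what it might do, assuming the list argument holds the validation and test fractions and that the splits are written as .npy files (both assumptions, not confirmed by these snippets):

import os
import numpy as np

def save_matrices_to_disk(X, y, split_fractions, directory, prefix, num_datapoints):
    # split_fractions = [val_fraction, test_fraction]; the remainder is train
    n = X.shape[0]
    num_val = int(split_fractions[0] * n)
    num_test = int(split_fractions[1] * n)
    split_at = n - num_val - num_test
    X_train, y_train = X[:split_at], y[:split_at]
    X_val, y_val = X[split_at:split_at + num_val], y[split_at:split_at + num_val]
    X_test, y_test = X[split_at + num_val:], y[split_at + num_val:]
    for name, mat in [('X_train', X_train), ('y_train', y_train),
                      ('X_val', X_val), ('y_val', y_val),
                      ('X_test', X_test), ('y_test', y_test)]:
        path = os.path.join(directory, '{}_{}_{}.npy'.format(prefix, num_datapoints, name))
        np.save(path, mat)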
Example #2
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler


def load_features():
    # max_fields, features_directory, num_datapoints, saves_directory and
    # RANDOM_STATE are module-level globals in the original script
    features_array = []
    outcomes_array = []
    features_sizes = [0]

    # As we load each by_field/features_{} file, we record the shape of each
    # pandas DataFrame returned and combine the DataFrames into one giant matrix.

    # Each DataFrame has a common set of aggregate features but differs in the
    # number of field features, so we merge the DataFrames on the aggregate
    # features and impute the missing features with the average value from
    # other examples.

    # After imputing is done, we re-separate our giant DataFrame by number of
    # columns, so that all examples with the same number of columns in the
    # original data are saved to the same file.

    for num_fields in range(1, max_fields + 1):
        features_df_file_name = 'by_field/features_{}.csv'.format(num_fields)
        outcomes_df_file_name = 'by_field/outcomes_{}.csv'.format(num_fields)
        features_df = pd.read_csv(os.path.join(features_directory,
                                               features_df_file_name),
                                  nrows=num_datapoints)
        outcomes_df = pd.read_csv(
            os.path.join(features_directory, outcomes_df_file_name))
        features_sizes.append(features_df.shape[0])
        features_array.append(features_df)
        outcomes_array.append(outcomes_df)

    # Combine features_array and outcomes_array, then delete the original
    # lists to save memory; columns missing from any DataFrame are filled with NaN
    features_df = pd.concat(features_array, axis=0, ignore_index=True)
    outcomes_df = pd.concat(outcomes_array, axis=0, ignore_index=True)
    del features_array, outcomes_array

    # drop the fid, impute any NaN entries, and re-add fid
    features_id_column = features_df[['fid']]
    features_df = features_df.drop(['fid'],
                                   axis=1,
                                   inplace=False,
                                   errors='ignore')
    features_df = process_features_df(features_df)
    features_df = pd.concat([features_df, features_id_column], axis=1)

    # add feature representing num_fields for each training example
    # and concat it with our features_df
    num_fields_array = []
    for num_fields in range(1, max_fields + 1):
        np_array = np.full((features_sizes[num_fields],), num_fields,
                           dtype=np.int64)
        num_fields_array.append(np_array)
    num_fields_array = np.concatenate(num_fields_array)

    assert num_fields_array.shape[0] == features_df.shape[0]
    num_fields_array = pd.DataFrame(
        {"special_original_num_fields": num_fields_array})
    features_df = pd.concat([features_df, num_fields_array], axis=1)

    # process outcomes
    outcome_variable_name = 'all_one_trace_type'
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    # Join features and outcomes
    final_df = pd.merge(features_df, outcomes_df_subset, on='fid', how='inner')
    final_df = final_df.drop(['fid'], axis=1, inplace=False, errors='ignore')
    del features_df, outcomes_df_subset

    # filter out examples with same num fields
    # drop our special_original_num_fields column
    # and save the matrices to disk
    for num_fields in range(1, max_fields + 1):
        X_with_field = final_df[final_df['special_original_num_fields'] ==
                                num_fields]
        X_with_field = X_with_field.drop(['special_original_num_fields'],
                                         axis=1,
                                         inplace=False,
                                         errors='ignore')
        X = X_with_field.iloc[:, :-1]
        y = X_with_field.iloc[:, -1]
        y = pd.get_dummies(y).values.argmax(1)

        res = RandomOverSampler(random_state=RANDOM_STATE)
        # fit_resample is the current imbalanced-learn API (fit_sample is deprecated)
        X, y = res.fit_resample(X, y)
        X, y = util.unison_shuffle(X, y)
        util.save_matrices_to_disk(X, y, [0.1, 0.1], saves_directory,
                                   'field_' + str(num_fields), num_datapoints)
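
process_features_df is called above but not defined in these snippets. Going by the surrounding comments ("impute the missing features with the average value from other examples"), a minimal sketch could be plain column-mean imputation; the real helper may well do more:

import pandas as pd

def process_features_df(features_df):
    # Fill missing entries with the column mean, as the comments above
    # describe; non-numeric columns are left untouched.
    numeric_cols = features_df.select_dtypes(include='number').columns
    features_df[numeric_cols] = features_df[numeric_cols].fillna(
        features_df[numeric_cols].mean())
    return features_df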
Example #3
import sys


def main():
    # All of the parameters for the script can be adjusted here.

    # num_epochs: the max number of epochs we will train the NN for
    # hidden_sizes: the number of neurons in each hidden layer, given as a list
    # output_dim: the dimension of the output; since the outcomes are 'line',
    #   'scatter', and 'bar', it is 3
    # weight_decay: the optimizer's L2 weight-decay coefficient. We leave it at 0
    #   since the learning rate is already reduced via the ReduceLROnPlateau()
    #   scheduler
    # dropout: the dropout rate in each layer
    # patience: how many epochs without improvement (measured against
    #   threshold) we wait before dropping the learning rate by a factor of 10
    # model_prefix: all models are loaded/saved with this filename prefix
    # save_model: save each epoch's model into the models/ folder
    # print_test: print test accuracies to test.txt
    # test_best: report the test accuracy of the best model found so far
    #   (best is determined by validation accuracy)

    # note: training stops automatically once the learning rate falls below
    # 0.01 * the starting learning rate
    parameters = {
        'batch_size': 200,
        'num_epochs': 100,
        'hidden_sizes': [800, 800, 800],
        'learning_rate': 5e-4,
        'output_dim': 3,
        'weight_decay': 0,
        'dropout': 0.00,
        'patience': 20,
        'threshold': 1e-3,
        'model_prefix': 'agg',
        'save_model': False,
        'print_test': True,
        'test_best': False
    }

    # LOAD reads the unfiltered features from the .csv files and writes them
    #   as filtered .npy files under ~/saves
    # TRAIN trains using the given parameters and .npy files
    # EVAL evaluates prefix_.model_number (giving you the test accuracy)

    assert len(sys.argv) >= 2, 'You must specify a command: LOAD, TRAIN, or EVAL'
    assert parameters['model_prefix'], 'You must specify a prefix for the model name'
    if parameters['test_best']:
        assert parameters['save_model'], 'You must save a model to test the best version!'

    command = sys.argv[1].lower()
    if command == 'load':
        X, y = load_features()
        # here, we split 10% of examples into val, and 10% into test
        util.save_matrices_to_disk(X, y, [0.1, 0.1], saves_directory,
                                   parameters['model_prefix'], num_datapoints)
        return

    X_train, y_train, X_val, y_val, X_test, y_test = util.load_matrices_from_disk(
        saves_directory, parameters['model_prefix'], num_datapoints)

    if command == 'train':
        train_dataloader, val_dataloader, test_dataloader = train.load_datasets(
            X_train,
            y_train,
            X_val,
            y_val,
            parameters,
            X_test=X_test,
            y_test=y_test)
        train.train(train_dataloader, val_dataloader, test_dataloader,
                    parameters)

    elif command == 'eval':
        assert len(sys.argv) >= 3
        model_suffix = sys.argv[2]
        evaluate.evaluate(model_suffix, X_test, y_test, parameters)
    else:
        assert False, 'The command must be either LOAD, TRAIN, or EVAL'
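
The patience/threshold parameters above describe PyTorch's ReduceLROnPlateau schedule, with training cut off once the learning rate drops below 1% of its starting value. A minimal sketch of such a loop, with illustrative names (train.train's real internals are not shown in these snippets):

import torch

def fit(model, optimizer, run_train_epoch, compute_val_loss, parameters):
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=0.1,  # drop the learning rate by a factor of 10
        patience=parameters['patience'],
        threshold=parameters['threshold'])
    initial_lr = parameters['learning_rate']
    for epoch in range(parameters['num_epochs']):
        run_train_epoch(model)
        scheduler.step(compute_val_loss(model))  # plateau detection on val loss
        if optimizer.param_groups[0]['lr'] < 0.01 * initial_lr:
            break  # LR fell below 1% of its starting value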
Example #4
import os
import sys


def main():
    # log_dir, saves_directory, models_directory, num_datapoints,
    # dataset_indices and field_indices are module-level globals in the
    # original script
    tasks = [
        None,
        {'outcome_variable_name': 'all_one_trace_type', 'prediction_task': 'two',
         'sampling_mode': 'over', 'pref_id': 1, 'dataset': 'dataset'},
        {'outcome_variable_name': 'all_one_trace_type', 'prediction_task': 'three',
         'sampling_mode': 'over', 'pref_id': 2, 'dataset': 'dataset'},
        {'outcome_variable_name': 'all_one_trace_type', 'prediction_task': 'six',
         'sampling_mode': 'over', 'pref_id': 3, 'dataset': 'dataset'},
        {'outcome_variable_name': 'has_single_src', 'prediction_task': 'two',
         'sampling_mode': 'over', 'pref_id': 4, 'dataset': 'dataset'},
        {'outcome_variable_name': 'num_x_axes', 'prediction_task': 'numeric',
         'sampling_mode': 100, 'pref_id': 5, 'dataset': 'dataset'},  # 10000
        {'outcome_variable_name': 'num_y_axes', 'prediction_task': 'numeric',
         'sampling_mode': 100, 'pref_id': 6, 'dataset': 'dataset'},  # 10000
        {'outcome_variable_name': 'trace_type', 'prediction_task': 'two',
         'sampling_mode': 'over', 'pref_id': 7, 'dataset': 'field'},
        {'outcome_variable_name': 'trace_type', 'prediction_task': 'three',
         'sampling_mode': 'over', 'pref_id': 8, 'dataset': 'field'},
        {'outcome_variable_name': 'trace_type', 'prediction_task': 'six',
         'sampling_mode': 'over', 'pref_id': 9, 'dataset': 'field'},
        {'outcome_variable_name': 'is_single_src', 'prediction_task': 'two',
         'sampling_mode': 'over', 'pref_id': 10, 'dataset': 'field'},
        {'outcome_variable_name': 'is_x_or_y', 'prediction_task': 'two',
         'sampling_mode': 'over', 'pref_id': 11, 'dataset': 'field'},
    ]

    for i in [1]:  # e.g. range(2, 12) or range(7, len(tasks)) to run more tasks
        task = tasks[i]
        model_prefix = 'paper_' + task['dataset'] + '_' + str(task['pref_id'])

        parameters = {
            'batch_size': 200,
            'num_epochs': 100,
            'hidden_sizes': [1000, 1000, 1000],
            'learning_rate': 5e-4,
            'weight_decay': 0,
            'dropout': 0.00,
            'patience': 10,
            'threshold': 1e-3,
            'model_prefix': model_prefix,
            # flags controlling whether to save the model and print test accuracies
            'only_train': False,
            'save_model': True,
            'print_test': True,
            # for constructing learning curves
            'dataset_ratios': [0.01, 0.1, 0.5, 1.0],
            'test_best': True,
            'use_cuda': False
        }

        if parameters['use_cuda']:
            os.environ["CUDA_VISIBLE_DEVICES"] = '0'

        # feature sets: 0 = dimensions, 1 = types, 2 = values, 3 = names
        for feature_set in [3]:  # use [0, 1, 2, 3] to sweep every feature set
            assert len(sys.argv) >= 2, 'You must specify a command: LOAD, TRAIN, or EVAL'
            assert parameters['model_prefix'], 'You must specify a prefix for the model name'
            if 'test_best' in parameters and parameters['test_best']:
                assert parameters['save_model'], 'You must save a model to test the best version!'

            command = sys.argv[1].lower()
            if command == 'load':
                X, y = load_features(task)
                util.save_matrices_to_disk(
                    X, y, [0.2, 0.2], saves_directory, parameters['model_prefix'], num_datapoints)
            else:
                X_train, y_train, X_val, y_val, X_test, y_test = util.load_matrices_from_disk(
                    saves_directory, parameters['model_prefix'], num_datapoints)

                if task['dataset'] == 'dataset':
                    X_train = X_train[:, dataset_indices[feature_set]]
                    X_val = X_val[:, dataset_indices[feature_set]]
                    X_test = X_test[:, dataset_indices[feature_set]]
                else:
                    assert task['dataset'] == 'field'
                    X_train = X_train[:, field_indices[feature_set]]
                    X_val = X_val[:, field_indices[feature_set]]
                    X_test = X_test[:, field_indices[feature_set]]
                print('loaded dimensions are', X_train.shape)
                print('task_num and feature_set:(' +
                      str(task['pref_id']) + ',' + str(feature_set) + ')')

                if command == 'train':
                    log_file = log_dir + 'training_task_' + str(task['pref_id']) + '.txt'
                    train_logger = logger(log_file, task)
                    train_dataloader, val_dataloader, test_dataloader = train.load_datasets(
                        X_train, y_train, X_val, y_val, parameters, X_test, y_test, train_logger)
                    # suffix is not defined anywhere in this snippet; deriving it
                    # from the feature set is an assumption made so the call runs
                    suffix = '_feature_set_' + str(feature_set)
                    train.train(
                        train_dataloader,
                        val_dataloader,
                        test_dataloader,
                        parameters,
                        models_directory=models_directory,
                        suffix=suffix,
                        logger=train_logger)

                elif command == 'eval':
                    assert len(sys.argv) >= 3
                    model_suffix = sys.argv[2]
                    log_file = log_dir + 'testing_task_' + str(task['pref_id']) + '.txt'
                    test_logger = logger(log_file, task)
                    train_dataloader, val_dataloader, test_dataloader = train.load_datasets(
                        X_train, y_train, X_val, y_val, parameters, X_test, y_test, test_logger)
                    evaluate.evaluate(
                        model_suffix, test_dataloader, parameters, models_directory)
                else:
                    assert False, 'The command must be either LOAD, TRAIN, or EVAL'
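
The logger helper used throughout these examples (e.g. logger(log_file, task) followed by .log(...) calls) is also not shown. A minimal sketch consistent with how it is called, assuming it echoes messages to stdout and appends them to the log file:

class logger:
    def __init__(self, log_file, header_dict):
        self.log_file = log_file
        # record the task/mission metadata at the top of the log
        for key, value in header_dict.items():
            self.log(str(key) + ': ' + str(value))

    def log(self, message):
        print(message)
        with open(self.log_file, 'a') as f:
            f.write(message + '\n')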