Example #1
import math
import os

import numpy as np

# ds and tr are this project's dataset-I/O and training-utility modules;
# verify_angle is a project helper assumed to wrap an angle back into its
# valid range (see the sketch after this example).


def augment(dataset, format, shift_size):
    """Augments the training set by rotating each event's angle-valued features.

    Every event is copied once per angular shift; the angle columns
    (every fourth feature, index % 4 == 2) are shifted by a multiple of
    shift_size and re-wrapped by verify_angle.

    Parameters
    ----------
    dataset : name of the dataset (directory) to load
    format : name of the format to load
    shift_size : size of a single angular shift, in degrees

    Returns
    -------
    None. Writes the augmented training data, together with the original
    test data, to "augmented_<format>.npz" in the dataset's directory.
    """
    shift_size *= (math.pi / 180.0)  # degrees -> radians
    num_shifts = int(2 * math.pi / shift_size)
    x_train, y_train, x_test, y_test = ds.load_dataset(dataset, format)
    augmented_x = np.zeros((x_train.shape[0] * num_shifts, x_train.shape[1]))
    augmented_y = np.zeros((y_train.shape[0] * num_shifts, y_train.shape[1]))
    for ix, line in enumerate(x_train):
        if (ix + 1) % 1000 == 0:
            print(ix + 1)
        for s in range(num_shifts):
            shift = s * shift_size
            # Every fourth feature (index % 4 == 2) holds an angle: shift it
            # and wrap it back into the valid range.
            augmented_x[ix * num_shifts + s] = [verify_angle(val + shift) if index % 4 == 2 else val
                                                for index, val in enumerate(line)]
            augmented_y[ix * num_shifts + s] = y_train[ix]
    tr.shuffle_in_unison(augmented_x, augmented_y)

    output_path = os.path.join(ds.get_path_to_dataset(dataset), "augmented_{}.npz".format(format))
    np.savez(output_path, x_train=augmented_x, x_test=x_test, y_train=augmented_y, y_test=y_test)
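
The examples on this page lean on two small project helpers, verify_angle and tr.shuffle_in_unison, that are not shown here. A minimal sketch of what they plausibly do, inferred only from how they are called (the bodies below are assumptions, not the project's actual code):

import numpy as np

def verify_angle(angle):
    # Assumed behavior: wrap an angle in radians back into [-pi, pi).
    return (angle + np.pi) % (2 * np.pi) - np.pi

def shuffle_in_unison(x, y):
    # Assumed behavior: shuffle two arrays with a single permutation so that
    # row i of x stays paired with row i of y. The callers ignore the return
    # value, so the shuffle must happen in place.
    perm = np.random.permutation(x.shape[0])
    x[:] = x[perm]
    y[:] = y[perm]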
Example #2
import os

import numpy as np


def _save_by_jet_num(dataset, num_jets):
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)
    # num_jets is a selector string: "4" keeps events with exactly four jets,
    # "4+" keeps four or more, and "4-" keeps four or fewer.
    if num_jets.endswith("+"):
        val = lambda x: x >= int(num_jets[:-1])
    elif num_jets.endswith("-"):
        val = lambda x: x <= int(num_jets[:-1])
    else:
        val = lambda x: x == int(num_jets)
    all_x = np.concatenate((x_train, x_test), axis=0)
    all_y = np.concatenate((y_train, y_test), axis=0)
    # Each object occupies four consecutive features; an object slot is
    # "null" when none of its four features is positive.
    nulls = np.zeros((all_x.shape[0], all_x.shape[1] // 4), dtype=bool)
    for y in range(all_x.shape[1] // 4):
        # Index over all_x, not x_train, so the test rows of `nulls` are
        # filled in too (indexing x_train here would leave them all False).
        for ix, row in enumerate((all_x > 0)[:, y*4:(y+1)*4]):
            nulls[ix, y] = not row.any()
    # Count non-null objects per event, skipping the last two object slots.
    events_with_x_jets = val(np.array([row[:-2].sum() for row in ~nulls]))

    all_x, all_y = all_x[events_with_x_jets], all_y[events_with_x_jets]
    tr.shuffle_in_unison(all_x, all_y)

    cutoff = int(all_x.shape[0] * 0.8)  # 80% training 20% testing
    train_x = all_x[:cutoff]
    train_y = all_y[:cutoff]
    test_x = all_x[cutoff:]
    test_y = all_y[cutoff:]

    output_path = os.path.join(ds.get_path_to_dataset(data), "{}jets_{}.npz".format(num_jets, format))
    np.savez(output_path, x_train=train_x, x_test=test_x, y_train=train_y, y_test=test_y)
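
The num_jets selector is just a string; a quick demonstration of the three accepted forms (the dataset name "OSUTTBAR/csv" is illustrative, borrowed from the docstrings below, not a guaranteed dataset):

_save_by_jet_num("OSUTTBAR/csv", "4")    # exactly 4 jets
_save_by_jet_num("OSUTTBAR/csv", "4+")   # 4 or more jets
_save_by_jet_num("OSUTTBAR/csv", "4-")   # 4 or fewer jets

Each call writes a "<num_jets>jets_<format>.npz" archive with a fresh 80/20 train/test split into the dataset's directory.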
Example #3
import math
import os

import numpy as np


def save_ratios(dataset, ratios, buffer=1000):
    """
    Divides a dataset into subsets of data with certain ratios of background to signal.
    For a ratio list of length n, the counting index, i, for the background starts from
    index 0, and the counting index, j, for the signal starts at n-1. The ratio for each
    iteration is then i to j (i/j). Generates a temporary file to accomplish this.

    Parameters
    ----------
    dataset <string> : the name of the dataset (/-separated)
    ratios <list> : a list of strings of the form "bkg:sig" that define ratios of
    background to signal.
    buffer <int> : an integer defining the number of data points to load into memory at a time.

    """
    ratios = [ratios] if isinstance(ratios, str) else ratios
    ratios = [[float(part) for part in r.split(':')] for r in ratios]
    data = dataset.split('/')[0]
    format = '/'.join(dataset.split('/')[1:])
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.h5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # index should be same shape and need to reshape the result :/
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size // x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size // y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size // x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size // y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size // x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size // y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size // x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size // y_test.shape[1], y_test.shape[1])))

    # Work out of the temporary archive so that every iteration writes to disk
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(os.path.join(ds.get_path_to_dataset(data), data + ".h5"),
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include this pass (the rat entries are floats,
            # so the kept count must be truncated back to an int)
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - int(rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - int(rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include this pass
            test_bkg_ix = np.random.choice(test_bkg_indices,
                                           test_bkg_indices.size - int(rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices,
                                           test_sig_indices.size - int(rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.h5")
Example #4
import os

import numpy as np

# read_config_file, get_file_len_and_shape, make_one_hot, and add_group_hdf5
# are project helpers (see the sketches elsewhere on this page); ds and tr
# are the project's dataset-I/O and training modules.


def create_archive(dataset_name, format, buffer=1000, train_fraction=0.8):
    """ Converts a series of text files into a single HDF5 archive.
    create_archive takes the name of a dataset and the
    format that the data is in, loads the config file
    from the dataset's directory, loads the right text files, and saves
    the training and testing data as numpy arrays in a single HDF5
    file.

    Parameters
    ----------
    dataset_name <string> : name of the dataset (directory) to look for a
    configuration file in

    format <string> : name of the format that you want to
    build an HDF5 archive for

    Notes
    -----
    The locations of the text files to load should be described in
    the config file.  See the example in the OSUTTBAR dataset
    directory.
    """
    path_dict = read_config_file(dataset_name, format)
    output_path = os.path.join(ds.get_path_to_dataset(dataset_name), "{}.h5".format(dataset_name))

    if "train_path" in path_dict:
        train_len, train_cols = get_file_len_and_shape(path_dict["train_path"])
        test_len, test_cols = get_file_len_and_shape(path_dict["test_path"])
        assert train_cols == test_cols  # Train and test files should have the same data shape
        h_file, h_data = add_group_hdf5(output_path, format, list(zip([train_len]*2 + [test_len]*2, train_cols + test_cols)))
        with open(path_dict["train_path"]) as train_f:
            for l in train_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[0].append(event[1:][None])
                h_data[1].append(make_one_hot(event[0], 0, train_cols[1]-1)[0])
        with open(path_dict["test_path"]) as test_f:
            for l in test_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[2].append(event[1:][None])
                h_data[3].append(make_one_hot(event[0], 0, test_cols[1]-1)[0])

    elif "background_path" in path_dict:
        bkg_len, bkg_cols = get_file_len_and_shape(path_dict["background_path"])
        sig_len, sig_cols = get_file_len_and_shape(path_dict["signal_path"])
        assert sig_cols[0] == bkg_cols[0]  # background and signal must have the same number of features
        n_labels = 1 + bkg_cols[1]

        total_len = bkg_len+sig_len
        bkg_read_amt = int(bkg_len*buffer/total_len)
        sig_read_amt = int(sig_len*buffer/total_len)
        h_file, h_data = add_group_hdf5(output_path,
                                        format,
                                        list(zip([round(total_len*train_fraction)] * 2 + [round(total_len*(1-train_fraction))] * 2,
                                                 [bkg_cols[0], n_labels]*2)))
        # Read in `buffer` lines at a time (allows fraction accuracy to 0.xxx)
        with open(path_dict["background_path"]) as bkg_f:
            with open(path_dict["signal_path"]) as sig_f:
                i = 0
                while i < total_len - 1:
                    x_buffer_array = np.zeros((buffer, bkg_cols[0]))
                    y_buffer_array = np.zeros((buffer, n_labels))
                    ix = 0
                    for _ in range(bkg_read_amt):
                        line = bkg_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    for _ in range(sig_read_amt):
                        line = sig_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    if ix == 0:
                        # Both files are exhausted; stop rather than spin forever.
                        break
                    # Drop the unfilled, all-zero rows at the end of the buffer.
                    indices = np.any(~(x_buffer_array==0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0]*train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    i += ix
    elif "both" in path_dict:

        total_len, total_cols = get_file_len_and_shape(path_dict["both"])
        train_len = round(train_fraction*total_len)
        test_len = round((1-train_fraction)*total_len)
        h_file, h_data = add_group_hdf5(output_path, format,
                                        list(zip([train_len] * 2 + [test_len] * 2, total_cols*2)))
        with open(path_dict["both"]) as data_f:
            x_buffer_array = np.zeros((buffer, total_cols[0]))
            y_buffer_array = np.zeros((buffer, total_cols[1]))
            for i, l in enumerate(data_f):
                event = np.fromstring(l, sep=',', dtype="float32")
                x_buffer_array[i%buffer] = event[1:]
                y_buffer_array[i%buffer] = make_one_hot(event[0], 0, total_cols[1] - 1)[0]
                if i%buffer == buffer - 1:
                    indices = np.any(~(x_buffer_array == 0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0] * train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    x_buffer_array = np.zeros((buffer, total_cols[0]))
                    y_buffer_array = np.zeros((buffer, total_cols[1]))
            # Flush the final, partially filled buffer; without this the
            # last (total_len % buffer) events would never be written.
            indices = np.any(~(x_buffer_array == 0), axis=1)
            x_buffer_array = x_buffer_array[indices]
            y_buffer_array = y_buffer_array[indices]
            tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
            cutoff = int(x_buffer_array.shape[0] * train_fraction)
            for r in x_buffer_array[:cutoff]:
                h_data[0].append(r[None])
            for r in y_buffer_array[:cutoff]:
                h_data[1].append(r[None])
            for r in x_buffer_array[cutoff:]:
                h_data[2].append(r[None])
            for r in y_buffer_array[cutoff:]:
                h_data[3].append(r[None])

    h_file.flush()
    h_file.close()
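
create_archive also leans on make_one_hot and get_file_len_and_shape. Inferring from the call sites (make_one_hot(value, min_label, max_label) is indexed with [0], so it plausibly returns a one-row 2-D array; get_file_len_and_shape returns a line count plus a [n_features, n_classes] pair), a rough sketch under those assumptions:

import numpy as np

def make_one_hot(value, min_label, max_label):
    # Assumed behavior: one-hot encode `value` over min_label..max_label,
    # returned with shape (1, n_labels) so callers can take row [0] or
    # append it to an HDF5 array directly.
    n_labels = int(max_label - min_label) + 1
    row = np.zeros((1, n_labels), dtype="float32")
    row[0, int(value) - int(min_label)] = 1.0
    return row

def get_file_len_and_shape(path):
    # Assumed behavior: one pass over a CSV file whose first column is the
    # label; returns (n_lines, [n_features, n_classes]).
    labels = set()
    n_lines = 0
    n_features = 0
    with open(path) as f:
        for line in f:
            fields = line.split(',')
            labels.add(float(fields[0]))
            n_features = len(fields) - 1
            n_lines += 1
    return n_lines, [n_features, len(labels)]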
Example #5
import math
import os

import numpy as np


def save_ratios(dataset, ratios, buffer=1000):

    ratios = [ratios] if isinstance(ratios, str) else ratios
    ratios = [[float(part) for part in r.split(':')] for r in ratios]
    data, format = dataset.split('/')
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.hdf5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # index should be same shape and need to reshape the result :/
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size // x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size // y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size // x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size // y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size // x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size // y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size // x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size // y_test.shape[1], y_test.shape[1])))

    # Work out of the temporary archive so that every iteration writes to disk
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(os.path.join(ds.get_path_to_dataset(data), data + ".hdf5"),
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include this pass (the rat entries are floats,
            # so the kept count must be truncated back to an int)
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - int(rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - int(rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include this pass
            test_bkg_ix = np.random.choice(test_bkg_indices,
                                           test_bkg_indices.size - int(rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices,
                                           test_sig_indices.size - int(rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.hdf5")
Example #6
import os

import numpy as np


def create_archive(dataset_name, format, buffer=1000, train_fraction=0.8):
    """ Converts a series of text files into a single HDF5 archive.
    create_archive takes the name of a dataset and the
    format that the data is in, loads the config file
    from the dataset's directory, loads the right text files, and saves
    the training and testing data as numpy arrays in a single HDF5
    file. Each buffered chunk of events is shuffled before it is
    written, so the archive is not stored in the original file order.

    Parameters
    ----------
    dataset_name : name of the dataset (directory) to look for a
    configuration file in

    format : name of the format that you want to
    build an HDF5 archive for

    Notes
    -----
    The locations of the text files to load should be described in
    the config file.  See the example in the OSUTTBAR dataset
    directory.
    """
    path_dict = read_config_file(dataset_name, format)
    output_path = os.path.join(ds.get_path_to_dataset(dataset_name), "{}.hdf5".format(dataset_name))

    if "train_path" in path_dict:
        train_len, train_cols = get_file_len_and_shape(path_dict["train_path"])
        test_len, test_cols = get_file_len_and_shape(path_dict["test_path"])
        assert train_cols == test_cols  # Train and test files should have the same data shape
        h_file, h_data = add_group_hdf5(output_path, format, list(zip([train_len]*2 + [test_len]*2, train_cols + test_cols)))
        with open(path_dict["train_path"]) as train_f:
            for l in train_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[0].append(event[1:][None])
                h_data[1].append(make_one_hot(event[0], 0, train_cols[1]-1)[0])
        with open(path_dict["test_path"]) as test_f:
            for l in test_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[2].append(event[1:][None])
                h_data[3].append(make_one_hot(event[0], 0, test_cols[1]-1)[0])

    elif "background_path" in path_dict:
        bkg_len, bkg_cols = get_file_len_and_shape(path_dict["background_path"])
        sig_len, sig_cols = get_file_len_and_shape(path_dict["signal_path"])
        n_labels = 2

        total_len = bkg_len+sig_len
        bkg_read_amt = int(bkg_len*buffer/total_len)
        sig_read_amt = int(sig_len*buffer/total_len)
        assert bkg_cols == sig_cols # Bkg and sig files should have the same data shape
        h_file, h_data = add_group_hdf5(output_path,
                                        format,
                                        list(zip([round(total_len*train_fraction)] * 2 + [round(total_len*(1-train_fraction))] * 2,
                                                 [bkg_cols[0], n_labels]*2)))
        # Read in `buffer` lines at a time (allows fraction accuracy to 0.xxx)
        with open(path_dict["background_path"]) as bkg_f:
            with open(path_dict["signal_path"]) as sig_f:
                i = 0
                while i < total_len - 1:
                    x_buffer_array = np.zeros((buffer, bkg_cols[0]))
                    y_buffer_array = np.zeros((buffer, n_labels))
                    ix = 0
                    for _ in range(bkg_read_amt):
                        line = bkg_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    for _ in range(sig_read_amt):
                        line = sig_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    if ix == 0:
                        # Both files are exhausted; stop rather than spin forever.
                        break
                    # Drop the unfilled, all-zero rows at the end of the buffer.
                    indices = np.any(~(x_buffer_array==0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0]*train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    i += ix
    elif "both" in path_dict:

        total_len, total_cols = get_file_len_and_shape(path_dict["both"])
        train_len = round(train_fraction*total_len)
        test_len = round((1-train_fraction)*total_len)
        h_file, h_data = add_group_hdf5(output_path, format,
                                        list(zip([train_len] * 2 + [test_len] * 2, total_cols*2)))
        with open(path_dict["both"]) as data_f:
            x_buffer_array = np.zeros((buffer, total_cols[0]))
            y_buffer_array = np.zeros((buffer, total_cols[1]))
            for i, l in enumerate(data_f):
                event = np.fromstring(l, sep=',', dtype="float32")
                x_buffer_array[i%buffer] = event[1:]
                y_buffer_array[i%buffer] = make_one_hot(event[0], 0, total_cols[1] - 1)[0]
                if i%buffer == buffer - 1:
                    indices = np.any(~(x_buffer_array == 0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0] * train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    x_buffer_array = np.zeros((buffer, total_cols[0]))
                    y_buffer_array = np.zeros((buffer, total_cols[1]))
            # Flush the final, partially filled buffer; without this the
            # last (total_len % buffer) events would never be written.
            indices = np.any(~(x_buffer_array == 0), axis=1)
            x_buffer_array = x_buffer_array[indices]
            y_buffer_array = y_buffer_array[indices]
            tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
            cutoff = int(x_buffer_array.shape[0] * train_fraction)
            for r in x_buffer_array[:cutoff]:
                h_data[0].append(r[None])
            for r in y_buffer_array[:cutoff]:
                h_data[1].append(r[None])
            for r in x_buffer_array[cutoff:]:
                h_data[2].append(r[None])
            for r in y_buffer_array[cutoff:]:
                h_data[3].append(r[None])

    h_file.flush()
    h_file.close()
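
A hypothetical end-to-end use of this archive builder, followed by a read-back with PyTables (the dataset name is illustrative, the path is shown without the dataset directory for brevity, and the array names inside the group depend on what add_group_hdf5 chooses):

import tables

# Build the archive from the text files named in the dataset's config file.
create_archive("OSUTTBAR", "csv", buffer=1000, train_fraction=0.8)

# Arrays are stored under a group named after the format.
with tables.open_file("OSUTTBAR.hdf5", mode="r") as f:
    group = f.get_node("/csv")
    print(group)   # lists the train/test x and y arrays in the group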