Example #1
import numpy as np
# `ut` is the project's utility module (import path not shown in the source);
# count_filter is defined in Example #2.

def num_of_each_cell(model, data, cutoff=0.5):
    # Builds a 2x2 confusion matrix (background vs. signal) from test predictions.
    x_train, y_train, x_test, y_test = data
    # Per class: entries whose true label is 1 and whose prediction clears the
    # cutoff (elementwise true positives). The lambda reduces to that because
    # p - y > cutoff - 1 means p > cutoff when y == 1, and is always true for
    # probability outputs when y == 0.
    num_correct = count_filter(model, lambda p, y: (y > 0) == (p - y > cutoff - 1), (x_test, y_test))
    # Per class: every prediction above the cutoff.
    num_predicted = count_filter(model, lambda p, y: p > cutoff, (x_test, y_test))
    test_totals = ut.sum_cols(y_test)
    ss = num_correct[1]          # signal predicted as signal
    bs = num_predicted[1] - ss   # background predicted as signal
    sb = test_totals[1] - ss     # signal predicted as background
    bb = test_totals[0] - bs     # background predicted as background
    return np.array([[bb, bs],
                     [sb, ss]])
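
For context, ut.sum_cols never appears in these examples. A minimal sketch of what it plausibly does, assuming 2-D arrays of one-hot or boolean rows (the name and the ignored second argument are inferred from the call sites, not taken from its real source):

import numpy as np

def sum_cols(arr, length=None):
    # Column-wise totals; for one-hot labels this yields the number of
    # examples per class, e.g. [n_background, n_signal]. The second
    # argument seen at some call sites is unused in this sketch.
    return np.asarray(arr).sum(axis=0)

For example, sum_cols(np.array([[1, 0], [0, 1], [1, 0]])) returns array([2, 1]).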
Example #2
import numpy as np
# `ut` is the project's utility module (import path not shown in the source).

# Confusion-matrix template: {0} is per-line left padding and {1}-{4} fill
# the four cells row by row (rows = actual class, columns = predicted class).
MATRIX = """
{0}      Predicted
{0}   ==================
{0}R  || {1:02.1f} || {2:02.1f} || TTBar
{0}e  ==================
{0}a  || {3:02.1f} || {4:02.1f} || TTHiggs
{0}l  ==================
{0}     TTBar   TTHiggs
"""

def count_filter(model, criteria, (x_test, y_test), batch_size=64, **kwargs):
    # For each class (column), count the test elements for which `criteria`
    # holds, predicting in batches; a trailing partial batch is ignored.
    rval = []
    for i in xrange(int(y_test.shape[0]/batch_size)):
        predictions = model.predict([x_test[i*batch_size:(i+1)*batch_size]], **kwargs)
        bArray = criteria(predictions, y_test[i*batch_size:(i+1)*batch_size])
        rval.append(ut.sum_cols(bArray, batch_size))
    return tuple([c.sum() for c in np.array(rval).T])

def num_of_each_cell(model, data, cutoff=0.5):
    x_train, y_train, x_test, y_test = data
    num_correct = count_filter(model, lambda p, y: (y > 0) == (p - y > cutoff - 1), (x_test, y_test))
    num_predicted = count_filter(model, lambda p, y: p > cutoff, (x_test, y_test))
    test_totals = ut.sum_cols(y_test)
    ss = num_correct[1]
    bs = num_predicted[1] - ss
    sb = test_totals[1]-ss
    bb = test_totals[0]-bs
    return np.array([[bb, bs],
                     [sb, ss]])

# Need to generalize for more categories
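
A possible way to render the template from the matrix above (hypothetical usage; model and data as in the example, padding string chosen arbitrarily):

cells = num_of_each_cell(model, data)              # 2x2 array [[bb, bs], [sb, ss]]
print(MATRIX.format("      ", *cells.flatten()))   # pad each row, then fill the four cells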
Example #3
import math
import os

import numpy as np

# ds (dataset I/O), tr (training utilities), sum_cols, and add_group_hdf5 are
# project-specific helpers assumed to be importable from the surrounding package.

def save_ratios(dataset, ratios, buffer=1000):
    # Split a dataset into groups with fixed background-to-signal ratios.
    ratios = [ratios] if isinstance(ratios, str) else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data, format = dataset.split('/')
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.hdf5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    # Promote to float before ceil: with two Python 2 ints, / floors first,
    # which would silently skip the trailing partial chunk.
    for i in xrange(int(math.ceil(float(x_train.shape[0]) / buffer))):
        # Boolean masks must match the arrays' shapes; the masked (flattened)
        # selections are reshaped back to 2-D below.
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Work against the HDF5 archive so that results are written to disk on every iteration
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".hdf5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include (the sample size must be an int; the
            # ratio entries are floats, so truncate explicitly)
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            int(train_bkg_indices.size - rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            int(train_sig_indices.size - rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include (sample sizes truncated to int as above)
            test_bkg_ix = np.random.choice(test_bkg_indices,
                                           int(test_bkg_indices.size - rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices,
                                           int(test_sig_indices.size - rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.hdf5")
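
A side note on the chunk count fixed above: in Python 2, / on two ints floors before math.ceil ever runs, so the original expression dropped the trailing partial chunk. A minimal sketch of two equivalent fixes:

import math

n, size = 2500, 1000
assert int(math.ceil(float(n) / size)) == 3   # promote to float first
assert -(-n // size) == 3                     # pure-integer ceiling division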
Example #4
import math
import os

import numpy as np

# ds (dataset I/O), tr (training utilities), sum_cols, and add_group_hdf5 are
# project-specific helpers assumed to be importable from the surrounding package.

def save_ratios(dataset, ratios, buffer=1000):
    """
    Divides a dataset into subsets with given ratios of background to signal.
    Each ratio is a 'bkg:sig' string, e.g. '2:1' keeps two background events
    per signal event. Generates a temporary file to accomplish this.

    Parameters
    ----------
    dataset <string> : the name of the dataset (/-separated)
    ratios <list> : a list of 'bkg:sig' strings defining background-to-signal ratios.
    buffer <int> : the number of data points to load into memory at a time.

    """
    ratios = [ratios] if isinstance(ratios, str) else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data = dataset.split('/')[0]
    format = '/'.join(dataset.split('/')[1:])
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.h5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    # Promote to float before ceil: with two Python 2 ints, / floors first,
    # which would silently skip the trailing partial chunk.
    for i in xrange(int(math.ceil(float(x_train.shape[0]) / buffer))):
        # Boolean masks must match the arrays' shapes; the masked (flattened)
        # selections are reshaped back to 2-D below.
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Work against the HDF5 archive so that results are written to disk on every iteration
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".h5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include (the sample size must be an int; the
            # ratio entries are floats, so truncate explicitly)
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            int(train_bkg_indices.size - rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            int(train_sig_indices.size - rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include (sample sizes truncated to int as above)
            test_bkg_ix = np.random.choice(test_bkg_indices,
                                           int(test_bkg_indices.size - rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices,
                                           int(test_sig_indices.size - rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.h5")
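
tr.shuffle_in_unison is another helper that never appears in these examples. A minimal sketch of the usual implementation of the idea (the in-place contract is assumed from the way the return value is ignored above):

import numpy as np

def shuffle_in_unison(a, b):
    # Apply one random permutation to both arrays in place, so that each
    # x row stays paired with its y row.
    assert a.shape[0] == b.shape[0]
    perm = np.random.permutation(a.shape[0])
    a[:] = a[perm]
    b[:] = b[perm]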
Example #5
import numpy as np
from math import ceil

# Variant of count_filter from Example #2 that also accepts an optional
# boolean row mask (`index`). The signature and the `rval = []` line are
# reconstructed from Example #2; the original snippet begins at the for-loop.
def count_filter(model, criteria, (x_test, y_test), batch_size=64,
                 index=None, **kwargs):
    rval = []
    # Promote to float so Python 2 integer division does not drop the
    # trailing partial batch before ceil runs.
    for i in xrange(int(ceil(float(y_test.shape[0]) / batch_size))):
        predictions = model.predict(
            [x_test[i * batch_size:(i + 1) * batch_size]], **kwargs)
        if type(index) is np.ndarray:
            if index[i * batch_size:(i + 1) * batch_size].any():
                predictions = predictions[index[i * batch_size:(i + 1) *
                                                batch_size]]
                y_temp = y_test[i * batch_size:(i + 1) *
                                batch_size][index[i * batch_size:(i + 1) *
                                                  batch_size]]
            else:
                continue
        else:
            y_temp = y_test[i * batch_size:(i + 1) * batch_size]
        bArray = criteria(predictions, y_temp)
        rval.append(ut.sum_cols(bArray, batch_size))
    return tuple([c.sum() for c in np.array(rval).T])


def num_of_each_cell(model, data):
    x_train, y_train, x_test, y_test = data
    matrix = []
    for i in xrange(y_test.shape[1]):
        # Rows whose true class is i.
        index = (y_test[:, i] > 0).flatten()
        # The criterion marks, in each prediction row, the column holding the
        # maximum, i.e. the predicted class as a boolean row (an argmax mask).
        num_predicted = count_filter(
            model,
            lambda p, y: np.vstack(map(lambda x: x == max(x), p)),
            (x_test, y_test),
            index=index)
        matrix.append(num_predicted)
    return np.array(matrix)
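
Row i of the returned matrix counts, among test examples whose true class is i, how many were assigned to each predicted class, so the diagonal holds the correct predictions. A hypothetical usage sketch (model and data as in the examples above):

cm = num_of_each_cell(model, (x_train, y_train, x_test, y_test))
for i, row in enumerate(cm):
    print("class %d: %d of %d predicted correctly" % (i, row[i], row.sum()))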