Beispiel #1
0
def prep_data_for_sklearn(**kwargs):
    """Prepare a train/test split plus variable lists for sklearn-style use.

    All keyword arguments are forwarded to ``get_bdt_data`` with ``sklearn``
    forced to ``True``. A truthy ``comb_data`` keyword selects
    ``gcm().comb_bkg_bdt_vars`` as the feature variables; otherwise
    ``gcm().rand_spi_bdt_vars`` is used.

    Returns:
        A 3-tuple ``((train, test, train_labels, test_labels), features,
        spectators)``. The label entries are the ``'labels'`` column of each
        split cast to ``bool``; ``features`` and ``spectators`` are lists of
        ``f.functor(f.particle)`` values.
    """
    if kwargs.get('comb_data', False):
        feature_vars = gcm().comb_bkg_bdt_vars
    else:
        feature_vars = gcm().rand_spi_bdt_vars
    features = [f.functor(f.particle) for f in feature_vars]
    spectators = [f.functor(f.particle) for f in gcm().spectator_vars]

    # ``kwargs`` is this call's own dict, so forcing the flag here does not
    # leak back to the caller.
    kwargs['sklearn'] = True
    data = get_bdt_data(**kwargs)

    # Fixed seed keeps the split reproducible between calls.
    train, test = train_test_split(data, random_state=43)

    # The ``np.bool`` alias was removed in NumPy 1.24; the builtin ``bool``
    # is what it pointed to and works on every NumPy version.
    train_labels = train['labels'].astype(bool)
    test_labels = test['labels'].astype(bool)
    return (train, test, train_labels, test_labels), features, spectators
def test_splitting():
    """Check that ``commonutils.train_test_split`` keeps rows and labels paired.

    Signal rows are all ones (label 1) and background rows are all zeros
    (label 0), so after splitting every value in a row must equal its label.
    """
    ones_frame = pandas.DataFrame(numpy.ones([10, 10]))
    zeros_frame = pandas.DataFrame(numpy.zeros([10, 10]))

    all_rows = pandas.concat([ones_frame, zeros_frame], ignore_index=True)
    ones_labels = numpy.ones(len(ones_frame))
    zeros_labels = numpy.zeros(len(zeros_frame))
    all_labels = numpy.concatenate([ones_labels, zeros_labels])

    split = commonutils.train_test_split(all_rows, all_labels)
    train_rows, test_rows, train_labels, test_labels = split

    # Same check for both halves of the split, train first and then test.
    for rows, labels in ((train_rows, train_labels), (test_rows, test_labels)):
        for (_, row), label in zip(rows.iterrows(), labels):
            assert numpy.all(row == label), 'wrong data partition'
Beispiel #3
0
def test_splitting():
    """Verify that splitting does not break the row/label correspondence.

    The input is ten all-ones rows labelled 1 followed by ten all-zeros rows
    labelled 0; each row returned by ``commonutils.train_test_split`` must
    still consist solely of its label value.
    """

    def check_partition(frame, labels):
        # Every cell of a row has to match the label paired with that row.
        for (_, row), label in zip(frame.iterrows(), labels):
            assert numpy.all(row == label), 'wrong data partition'

    sig = pandas.DataFrame(numpy.ones([10, 10]))
    bkg = pandas.DataFrame(numpy.zeros([10, 10]))

    features = pandas.concat([sig, bkg], ignore_index=True)
    targets = numpy.concatenate([numpy.ones(len(sig)), numpy.zeros(len(bkg))])

    train_x, test_x, train_y, test_y = commonutils.train_test_split(
        features, targets)

    check_partition(train_x, train_y)
    check_partition(test_x, test_y)
def test_feature_splitter(size=2000):
    """Smoke-test ``FeatureSplitter`` and print scores of several classifiers.

    A sample of ``size`` rows is generated with ``commonutils.generate_sample``,
    ``'column0'`` is truncated to integers and clipped to ``[-2, 2]`` so it
    can serve as the splitting feature, and the following models are fitted
    on the train part and scored on the test part (scores are printed, not
    asserted): ``FeatureSplitter``, a plain ``RandomForestClassifier``,
    ``DumbSplitter``, ``ChainClassifiers`` (QDA -> LDA -> RF) and ``LDA``.

    Args:
        size: first argument passed to ``commonutils.generate_sample``.
    """
    X, y = commonutils.generate_sample(size, 10, distance=0.5)
    # ``numpy.int`` was removed in NumPy 1.24; the builtin ``int`` is what
    # that alias referred to, so the cast is unchanged.
    X['column0'] = numpy.clip(numpy.array(X['column0']).astype(int), -2, 2)
    trainX, testX, trainY, testY = commonutils.train_test_split(X, y)

    base_estimators = {'rf': RandomForestClassifier()}
    splitter = FeatureSplitter('column0', base_estimators=base_estimators,
                               final_estimator=RandomForestClassifier())
    splitter.fit(trainX, trainY)
    print(splitter.score(testX, testY))

    # Reference scores from simpler models on the same split.
    print(RandomForestClassifier().fit(trainX, trainY).score(testX, testY))
    dumb_splitter = DumbSplitter('column0', base_estimator=RandomForestClassifier())
    print(dumb_splitter.fit(trainX, trainY).score(testX, testY))

    # OrderedDict fixes the order in which the chained classifiers are given.
    chain = OrderedDict()
    chain['QDA'] = QDA()
    chain['LDA'] = LDA()
    chain['RF'] = RandomForestClassifier()
    print(ChainClassifiers(chain).fit(trainX, trainY).score(testX, testY))
    print(LDA().fit(trainX, trainY).score(testX, testY))
def test_splitting(n_rows=10, n_columns=8):
    """Check that ``commonutils.train_test_split`` preserves data and columns.

    Builds ``n_rows`` all-ones rows (label 1) and ``n_rows`` all-zeros rows
    (label 0) with columns ``col0 .. col<n_columns-1>``, splits them, and
    asserts that each row still matches its label, that the column names
    are unchanged, and that no rows were lost or added.
    """
    column_names = ['col{}'.format(i) for i in range(n_columns)]
    shape = [n_rows, n_columns]
    ones_frame = pandas.DataFrame(numpy.ones(shape), columns=column_names)
    zeros_frame = pandas.DataFrame(numpy.zeros(shape), columns=column_names)

    all_rows = pandas.concat([ones_frame, zeros_frame], ignore_index=True)
    all_labels = numpy.concatenate(
        [numpy.ones(len(ones_frame)), numpy.zeros(len(zeros_frame))])

    split = commonutils.train_test_split(all_rows, all_labels)
    train_rows, test_rows, train_labels, test_labels = split

    # Row contents must agree with the label they were split with.
    for rows, labels in ((train_rows, train_labels), (test_rows, test_labels)):
        for (_, row), label in zip(rows.iterrows(), labels):
            assert numpy.all(row == label), 'wrong data partition'

    # The split must hand back the same columns it was given ...
    for rows in (train_rows, test_rows):
        assert (rows.columns == column_names).all(), 'new column names!'
    # ... and the two parts together must account for every input row.
    total_rows = len(train_rows) + len(test_rows)
    assert total_rows == len(all_rows), 'new size is strange'
Beispiel #6
0
def test_feature_splitter(size=2000):
    """Fit ``FeatureSplitter`` and comparison models, printing test scores.

    The sample comes from ``commonutils.generate_sample``; its ``'column0'``
    is cast to integers and clipped to ``[-2, 2]`` before being used as the
    splitting feature. Nothing is asserted: the function only prints the
    score of each fitted model on the held-out part.

    Args:
        size: first argument passed to ``commonutils.generate_sample``.
    """
    X, y = commonutils.generate_sample(size, 10, distance=0.5)
    # Use the builtin ``int``: the ``numpy.int`` alias for it was removed in
    # NumPy 1.24 and raises AttributeError there.
    X['column0'] = numpy.clip(
        numpy.array(X['column0']).astype(int), -2, 2)
    trainX, testX, trainY, testY = commonutils.train_test_split(X, y)
    base_estimators = {'rf': RandomForestClassifier()}
    splitter = FeatureSplitter('column0',
                               base_estimators=base_estimators,
                               final_estimator=RandomForestClassifier())
    splitter.fit(trainX, trainY)

    print(splitter.score(testX, testY))
    # Baselines trained on the same split for comparison.
    print(RandomForestClassifier().fit(trainX, trainY).score(testX, testY))
    print(
        DumbSplitter('column0', base_estimator=RandomForestClassifier()).fit(
            trainX, trainY).score(testX, testY))
    # OrderedDict keeps the classifiers in the order QDA, LDA, RF.
    chain = OrderedDict()
    chain['QDA'] = QDA()
    chain['LDA'] = LDA()
    chain['RF'] = RandomForestClassifier()
    print(ChainClassifiers(chain).fit(trainX, trainY).score(testX, testY))
    print(LDA().fit(trainX, trainY).score(testX, testY))
def test_splitting(n_rows=10, n_columns=8):
    """Verify row/label pairing, column names and size after splitting.

    Signal rows are filled with ones and labelled 1, background rows are
    filled with zeros and labelled 0. After ``commonutils.train_test_split``
    each row must equal its label, both parts must keep the original column
    names, and their lengths must sum to the input length.
    """

    def check_partition(frame, labels):
        # A row is correct when all of its cells equal the paired label.
        for (_, row), label in zip(frame.iterrows(), labels):
            assert numpy.all(row == label), 'wrong data partition'

    column_names = ['col' + str(idx) for idx in range(n_columns)]
    sig = pandas.DataFrame(numpy.ones([n_rows, n_columns]),
                           columns=column_names)
    bkg = pandas.DataFrame(numpy.zeros([n_rows, n_columns]),
                           columns=column_names)

    features = pandas.concat([sig, bkg], ignore_index=True)
    targets = numpy.concatenate([numpy.ones(len(sig)), numpy.zeros(len(bkg))])

    train_x, test_x, train_y, test_y = commonutils.train_test_split(
        features, targets)

    check_partition(train_x, train_y)
    check_partition(test_x, test_y)

    train_columns_kept = (train_x.columns == column_names).all()
    assert train_columns_kept, 'new column names!'
    test_columns_kept = (test_x.columns == column_names).all()
    assert test_columns_kept, 'new column names!'
    assert len(train_x) + len(test_x) == len(features), 'new size is strange'
Beispiel #8
0
def just_the_labels(sw=False, comb_data=False):
    """Return the ``'labels'`` and ``'weights'`` columns of the train/test split.

    The data is fetched via ``get_bdt_data`` with ``sklearn=True`` and split
    with ``train_test_split`` using ``random_state=43``.

    Args:
        sw: forwarded to ``get_bdt_data`` as ``sw``.
        comb_data: forwarded to ``get_bdt_data`` as ``comb_data``.

    Returns:
        A ``(train, test)`` pair, each restricted to ``['labels', 'weights']``.
    """
    full_sample = get_bdt_data(sw=sw, sklearn=True, comb_data=comb_data)
    train_part, test_part = train_test_split(full_sample, random_state=43)

    train_subset = train_part[['labels', 'weights']]
    test_subset = test_part[['labels', 'weights']]
    return train_subset, test_subset