Example #1
0
def choice_sampling(df, sigma):
    """Return the rows of ``df`` whose every feature lies close to its mean.

    A row is kept only if, for each column of
    ``preprocessing.get_features(df)``, its value ``v`` satisfies
    ``mean - sigma * sd <= v < mean + sigma * sd``, where ``mean`` and ``sd``
    are ``np.mean`` and ``np.std`` of that column computed over all of ``df``.

    Args:
        df: DataFrame holding the feature columns (extra columns are kept in
            the result but do not take part in the selection).
        sigma: Half-width of the accepted band, in standard deviations.

    Returns:
        The selected rows of ``df``, in their original order. If there are no
        feature columns, every row is returned.
    """
    feature_cols = preprocessing.get_features(df).columns

    # Build a positional boolean mask rather than sets of index labels:
    # ``df.ix`` and set-difference via ``Index - Index`` no longer exist in
    # current pandas, and a mask also keeps row order and behaves correctly
    # when index labels are duplicated.
    keep = np.ones(len(df), dtype=bool)
    for col in feature_cols:
        values = df[col]
        sd = np.std(values)
        mean = np.mean(values)
        # Lower bound inclusive, upper bound exclusive. NaN values fail both
        # comparisons, so rows with a missing feature are dropped.
        in_band = (values >= mean - sigma * sd) & (values < mean + sigma * sd)
        keep &= in_band.values

    return df.loc[keep]
Example #2
0
                        help="If True, use a reduced subset of the data.")

    args = parser.parse_args()

    if args.weights is None:
        weights_backup = get_weight_path(args.model, args.input_size,
                                         args.output_size, args.maxlen)
    else:
        weights_backup = args.weights

    if args.debug:
        df_train = joblib.load('data/small_df_train.pkl.gz')
        df_test = joblib.load('data/small_df_test.pkl.gz')
    else:
        from preprocessing import get_features
        df_train, df_test = get_features(split=True)

    X_train, y_train = reformat(df_train,
                                input_size=args.input_size,
                                output_size=args.output_size,
                                maxlen=args.maxlen,
                                step_days=args.step_days,
                                max_sequences=args.N_train)
    del df_train
    X_test, y_test = reformat(df_test,
                              input_size=args.input_size,
                              output_size=args.output_size,
                              maxlen=args.maxlen,
                              step_days=args.step_days,
                              max_sequences=args.N_test)
    del df_test
Example #3
0
                color='black')
    plt.xlabel('Threshold (% Rejected)')
    plt.ylabel('AMS Score')
    plt.legend(loc=3)
    plt.title('AMS Curve ' + label)
    plt.savefig('Graphs/AMS_curve_' + label + '.png', bbox_inches='tight')


if __name__ == "__main__":

    # Script entry point: load and clean the dataset, derive the feature
    # frame, and draw the feature correlation heat map.

    # Loading the dataset
    # NOTE(review): the data directory is a hard-coded absolute path; the
    # script only runs as-is on a machine that has this location.
    df = preprocessing.load_data(
        path='/home/raid/vr308/workspace/Python/higgsDT/Data/')
    df = preprocessing.drop_missing_values(df)
    # Only the first element of normalize()'s return value is kept here.
    # df_normed is not used in the statements visible below.
    df_normed = preprocessing.normalize(df)[0]
    # Features are taken from the cleaned, un-normalized frame `df`.
    df_features = preprocessing.get_features(df)

    # Get background/signal samples
    # NOTE(review): `b` selects Label == 1 and `s` selects Label == 0;
    # confirm this matches the label encoding produced by preprocessing.
    b = df[df.Label == 1]
    s = df[df.Label == 0]

    # Names of the feature columns.
    features = df_features.columns

    f1 = 'DER_mass_MMC'  # Sample feature 1
    f2 = 'PRI_tau_eta'  # Sample feature 2

    # Feature Heat Map
    title = 'Feature Correlation Heatmap : Primary & Derived Features'
    # Passed as the third argument to plot_feature_corr_matrix; presumably
    # controls whether the figure is written to disk -- TODO confirm.
    figsave = True

    plot_feature_corr_matrix(df_features, title, figsave)
Example #4
0
import numpy as np
import sys
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pylab as plt
import itertools

import preprocessing
import cross_validation
import discovery_significance

# Load the dataset from a hard-coded local directory and clean it.
# NOTE(review): the path is machine-specific.
df = preprocessing.load_data(path='/local/data/public/vr308/')
df = preprocessing.drop_missing_values(df)
# Only the first element of normalize()'s return value is kept.
df_normed = preprocessing.normalize(df)[0]
# Features are taken from the cleaned, un-normalized frame `df`.
df_features = preprocessing.get_features(df)

# Get background/signal samples
# NOTE(review): `b` selects Label == 1 and `s` selects Label == 0; confirm
# this matches the label encoding produced by preprocessing.
b = df[df.Label == 1]
s = df[df.Label == 0]

# Two-column input array and matching labels, restricted to the first 5000
# rows of `df`.
# NOTE(review): 'A' is requested as a column name -- confirm such a column
# exists in the frame; it may be a placeholder for a real feature name.
X = np.asarray(df[['DER_mass_MMC', 'A']])[0:5000]
Y = np.asarray(df.Label[0:5000])

# Values passed to cross_validation.fit_svm together with the 'rbf' kernel
# name.
C = 100
gamma = 0.005

clf = cross_validation.fit_svm(X, Y, 'rbf', C, gamma)

# Not used in the statements visible here; presumably a grid step for a
# later plot -- TODO confirm.
h = 1
Example #5
0
    print 'Step 2 : Scaling features, drawing choice sample (+/- 1.6*sd) around mean for each feature '
    train_choice = sampling.get_training_sample(train,
                                                sample_type='choice',
                                                normalize=True)

    print 'Step 3 : Preparing training and test data'

    if (train_sample_type == 'choice_sample'):
        train_sample = train_choice
    else:
        train_sample = train_uniform

    train_sample = preprocessing.generate_balanced_weights(train_sample)

    X_train = preprocessing.get_features(train_sample)
    Y_train = train_sample['Label']
    W_train = train_sample['Weight']
    W_train_balanced = X_train.pop('W_balanced')

    test = preprocessing.generate_balanced_weights(test)

    X_test = preprocessing.normalize_fit_test(
        preprocessing.normalize(train)[1],
        test.drop(labels=['W_balanced'], axis=1))
    X_test = preprocessing.get_features(X_test)
    Y_test = test['Label']
    W_test = test['Weight']
    W_test_balanced = test.pop('W_balanced')

    elapsed = timer(start_timer_sampling)