def choice_sampling(df, sigma):
    # Keep only the rows whose value lies within mean +/- sigma * sd
    # for every feature column.
    df_features = preprocessing.get_features(df)
    index = []
    for col in df_features.columns:
        sd = np.std(df[col])
        mean = np.mean(df[col])
        below_upper = df.index[df[col] < (mean + sigma * sd)]
        below_lower = df.index[df[col] < (mean - sigma * sd)]
        # Rows below the upper bound but not below the lower bound,
        # i.e. inside the central band for this feature.
        index.append(set(below_upper.difference(below_lower)))
    s = set.intersection(*index)
    df_sample = df.loc[sorted(s)]  # .ix is deprecated; use .loc
    return df_sample
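# A minimal usage sketch, assuming the preprocessing module's load_data /
# drop_missing_values helpers as used elsewhere in this repo; the data
# path is a placeholder, and sigma=1.6 matches the choice-sample band
# quoted in the training script.
if __name__ == '__main__':
    df_demo = preprocessing.drop_missing_values(
        preprocessing.load_data(path='Data/'))
    df_demo_sample = choice_sampling(df_demo, sigma=1.6)
    print('Choice sample kept %d of %d rows' % (len(df_demo_sample), len(df_demo)))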
help="If True, use a reduced subset of the data.") args = parser.parse_args() if args.weights is None: weights_backup = get_weight_path(args.model, args.input_size, args.output_size, args.maxlen) else: weights_backup = args.weights if args.debug: df_train = joblib.load('data/small_df_train.pkl.gz') df_test = joblib.load('data/small_df_test.pkl.gz') else: from preprocessing import get_features df_train, df_test = get_features(split=True) X_train, y_train = reformat(df_train, input_size=args.input_size, output_size=args.output_size, maxlen=args.maxlen, step_days=args.step_days, max_sequences=args.N_train) del df_train X_test, y_test = reformat(df_test, input_size=args.input_size, output_size=args.output_size, maxlen=args.maxlen, step_days=args.step_days, max_sequences=args.N_test) del df_test
             color='black')
    plt.xlabel('Threshold (% Rejected)')
    plt.ylabel('AMS Score')
    plt.legend(loc=3)
    plt.title('AMS Curve ' + label)
    plt.savefig('Graphs/AMS_curve_' + label + '.png', bbox_inches='tight')


if __name__ == "__main__":

    # Loading the dataset
    df = preprocessing.load_data(
        path='/home/raid/vr308/workspace/Python/higgsDT/Data/')
    df = preprocessing.drop_missing_values(df)
    df_normed = preprocessing.normalize(df)[0]
    df_features = preprocessing.get_features(df)

    # Get background/signal samples
    b = df[df.Label == 1]
    s = df[df.Label == 0]

    features = df_features.columns
    f1 = 'DER_mass_MMC'  # Sample feature 1
    f2 = 'PRI_tau_eta'   # Sample feature 2

    # Feature heat map
    title = 'Feature Correlation Heatmap : Primary & Derived Features'
    figsave = True
    plot_feature_corr_matrix(df_features, title, figsave)
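# plot_feature_corr_matrix is defined elsewhere in this script; a minimal
# equivalent along these lines is shown for reference (an assumed sketch,
# not necessarily the project's implementation):
def _plot_feature_corr_matrix_sketch(df_features, title, figsave=False):
    corr = df_features.corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(corr.values, cmap='coolwarm', vmin=-1, vmax=1)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90, fontsize=6)
    ax.set_yticks(range(len(corr.columns)))
    ax.set_yticklabels(corr.columns, fontsize=6)
    fig.colorbar(im, label='Pearson correlation')
    ax.set_title(title)
    if figsave:
        fig.savefig('Graphs/feature_corr_matrix.png', bbox_inches='tight')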
import numpy as np
import sys
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pylab as plt
import itertools

import preprocessing
import cross_validation
import discovery_significance

df = preprocessing.load_data(path='/local/data/public/vr308/')
df = preprocessing.drop_missing_values(df)
df_normed = preprocessing.normalize(df)[0]
df_features = preprocessing.get_features(df)

# Get background/signal samples
b = df[df.Label == 1]
s = df[df.Label == 0]

X = np.asarray(df[['DER_mass_MMC', 'A']])[0:5000]
Y = np.asarray(df.Label[0:5000])

C = 100
gamma = 0.005
clf = cross_validation.fit_svm(X, Y, 'rbf', C, gamma)
h = 1
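# h is presumably the mesh step for visualising the decision surface; a
# standard sketch following the usual scikit-learn recipe (assumed usage,
# the project's own plotting code may differ):
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, s=8, edgecolors='none')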
print('Step 2 : Scaling features, drawing choice sample (+/- 1.6*sd) '
      'around the mean for each feature')
train_choice = sampling.get_training_sample(train, sample_type='choice',
                                            normalize=True)

print('Step 3 : Preparing training and test data')

if train_sample_type == 'choice_sample':
    train_sample = train_choice
else:
    train_sample = train_uniform

train_sample = preprocessing.generate_balanced_weights(train_sample)
X_train = preprocessing.get_features(train_sample)
Y_train = train_sample['Label']
W_train = train_sample['Weight']
W_train_balanced = X_train.pop('W_balanced')

test = preprocessing.generate_balanced_weights(test)
X_test = preprocessing.normalize_fit_test(
    preprocessing.normalize(train)[1],
    test.drop(labels=['W_balanced'], axis=1))
X_test = preprocessing.get_features(X_test)
Y_test = test['Label']
W_test = test['Weight']
W_test_balanced = test.pop('W_balanced')

elapsed = timer(start_timer_sampling)
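# generate_balanced_weights is a project helper; a plausible sketch of the
# balancing step (an assumption, not the actual implementation): rescale
# event weights so signal and background each carry half the total weight,
# countering the class imbalance during training.
def _balanced_weights_sketch(df):
    df = df.copy()
    class_totals = df.groupby('Label')['Weight'].transform('sum')
    df['W_balanced'] = df['Weight'] * df['Weight'].sum() / (2.0 * class_totals)
    return df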