def get_training_data(training_file, test_file):
    """Load the training CSV and split it into features, labels and weights.

    The frame is read with ``pd.read_csv``, the ``Label`` column is mapped
    from ``'b'``/``'s'`` to ``0``/``1``, extra columns are derived through
    ``add_features`` and the black-listed columns are dropped.

    Args:
        training_file: Path or buffer handed to ``pd.read_csv``.
        test_file: Unused here; kept so existing callers keep working.

    Returns:
        A tuple ``(X_new, labels, weights)`` of numpy arrays: the values of
        the selected feature columns, of the ``Label`` column and of the
        ``Weight`` column.

    Raises:
        KeyError: If the ``Label`` or ``Weight`` column is missing.
    """
    df = pd.read_csv(training_file)

    # Map y values to integers. Any label other than 'b'/'s' becomes NaN,
    # because Series.map yields NaN for keys missing from the dict.
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

    # Put the label first. It is selected by name rather than by "last
    # column" so a different CSV column order cannot leak it into X.
    cols = ['Label'] + [c for c in df.columns.tolist() if c != 'Label']
    df = df[cols]
    print('original features')
    print(df.columns)

    df_new = add_features(df)

    # Move the weight to the very end by name. The previous hard-coded
    # position (32) silently picked the wrong column whenever the input
    # layout or the set of columns returned by add_features changed.
    cols_new = [c for c in df_new.columns.tolist() if c != 'Weight']
    cols_new.append('Weight')

    # Only remove the phi columns and the tau/lep eta columns: the raw phi
    # values turned out to be really noisy, as expected.
    black_list = {
        'PRI_met_phi',
        'PRI_lep_phi',
        'PRI_tau_phi',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_phi',
        'PRI_tau_eta',
        'PRI_lep_eta',
    }
    cols_new = [c for c in cols_new if c not in black_list]
    df_new = df_new[cols_new]
    print('newly added features')
    print(df_new.columns)

    # Column 0 is Label and the last column is Weight; column 1 is skipped
    # as well (presumably the event id -- confirm against the CSV header).
    feature_cols = cols_new[2:-1]
    # Two spaces on purpose: reproduces the old ``print 'a ', b`` output.
    print('select X features  {}'.format(feature_cols))

    X_new = df_new[feature_cols].values
    labels = df_new['Label'].values
    weights = df_new['Weight'].values
    return X_new, labels, weights
def get_training_data(training_file, test_file):
    """Load the training CSV and split it into features, labels and weights.

    The frame is read with ``pd.read_csv``, the ``Label`` column is mapped
    from ``'b'``/``'s'`` to ``0``/``1``, extra columns are derived through
    ``add_features`` and the black-listed columns are dropped.

    Args:
        training_file: Path or buffer handed to ``pd.read_csv``.
        test_file: Unused here; kept so existing callers keep working.

    Returns:
        A tuple ``(X_new, labels, weights)`` of numpy arrays: the values of
        the selected feature columns, of the ``Label`` column and of the
        ``Weight`` column.

    Raises:
        KeyError: If the ``Label`` or ``Weight`` column is missing.
    """
    df = pd.read_csv(training_file)

    # Map y values to integers. Any label other than 'b'/'s' becomes NaN,
    # because Series.map yields NaN for keys missing from the dict.
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

    # Rearrange columns for convenience: label first. It is picked by name
    # so a CSV whose last column is not the label cannot leak it into X.
    original_cols = df.columns.tolist()
    df = df[['Label'] + [c for c in original_cols if c != 'Label']]
    print('original features')
    print(df.columns)

    df_new = add_features(df)

    # Make the weight the last column. This is done by name: the previous
    # fixed index (32) only worked for one exact column layout.
    cols_new = [c for c in df_new.columns.tolist() if c != 'Weight']
    cols_new.append('Weight')

    # Only remove the phi columns and the tau/lep eta columns: the raw phi
    # values turned out to be really noisy, as expected.
    black_list = {
        'PRI_met_phi',
        'PRI_lep_phi',
        'PRI_tau_phi',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_phi',
        'PRI_tau_eta',
        'PRI_lep_eta',
    }
    cols_new = [c for c in cols_new if c not in black_list]
    df_new = df_new[cols_new]
    print('newly added features')
    print(df_new.columns)

    # Column 0 is Label and the last column is Weight; column 1 is skipped
    # as well (presumably the event id -- confirm against the CSV header).
    feature_cols = cols_new[2:-1]
    # Two spaces on purpose: reproduces the old ``print 'a ', b`` output.
    print('select X features  {}'.format(feature_cols))

    X_new = df_new[feature_cols].values
    labels = df_new['Label'].values
    weights = df_new['Weight'].values
    return X_new, labels, weights
# rearrange columns for convenience cols = df.columns.tolist() cols = [cols[-1]] + cols[:-1] df = df[cols] df_new = add_features(df) cols_new = df_new.columns.tolist() cols_new = cols_new[:32]+cols_new[33:]+[cols_new[32]] #make the weight to the last df_new=df_new[cols_new] X_new = df_new[cols_new[2:-1]].values #idx = dtest[:,0] ''' df_test = pd.read_csv(dpath+'/test.csv') #df_test.replace(-999.0,0.) df_test_data = add_features(df_test) cols_new = df_test_data.columns.tolist() black_list = ['PRI_met_phi', 'PRI_lep_phi', 'PRI_tau_phi', 'PRI_jet_leading_phi','PRI_jet_subleading_phi', 'PRI_tau_eta','PRI_lep_eta'] cols_new = [c for c in cols_new if c not in black_list] df_test_data=df_test_data[cols_new] data = df_test_data.values[:,1:] #scaler = StandardScaler().fit(np.vstack((X_new, data))) #normalize with training scale #scaler = MinMaxScaler(feature_range=(-10,10)).fit(np.vstack((X_new, data))) #data = scaler.transform(data) idx = df_test_data.values[:,0] print ('finish loading from csv ') xgmat = xgb.DMatrix( data, missing = -999.0 ) bst = xgb.Booster({'nthread':16}) bst.load_model( modelfile )
def get_training_data(training_file, test_file):
    """Load the training CSV and split it into features, labels and weights.

    The frame is read with ``pd.read_csv``, the ``Label`` column is mapped
    from ``'b'``/``'s'`` to ``0``/``1``, extra columns are derived through
    ``add_features`` and the black-listed raw columns are dropped.

    Args:
        training_file: Path or buffer handed to ``pd.read_csv``.
        test_file: Unused here; kept so existing callers keep working.

    Returns:
        A tuple ``(X_new, labels, weights)`` of numpy arrays: the values of
        the selected feature columns, of the ``Label`` column and of the
        ``Weight`` column.

    Raises:
        KeyError: If the ``Label`` or ``Weight`` column is missing.
    """
    df = pd.read_csv(training_file)

    # Map y values to integers. Any label other than 'b'/'s' becomes NaN,
    # because Series.map yields NaN for keys missing from the dict.
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

    # Put the label first. It is selected by name rather than by "last
    # column" so a different CSV column order cannot leak it into X.
    cols = ['Label'] + [c for c in df.columns.tolist() if c != 'Label']
    df = df[cols]

    df_new = add_features(df)

    # Move the weight to the very end by name. The previous hard-coded
    # position (32) silently picked the wrong column whenever the input
    # layout or the set of columns returned by add_features changed.
    cols_new = [c for c in df_new.columns.tolist() if c != 'Weight']
    cols_new.append('Weight')

    # Raw columns excluded from the feature matrix.
    black_list = {
        # the raw phi values are really noisy, as expected
        'PRI_met_phi',
        'PRI_lep_phi',
        'PRI_tau_phi',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_phi',
        'PRI_tau_eta',
        'PRI_lep_eta',
        # replace with abs values
        'PRI_jet_leading_eta',
        'PRI_jet_subleading_eta',
        # these raw values are noisy
        'PRI_lep_px',
        'PRI_lep_py',
        'PRI_lep_pz',
        'PRI_lep_px_abs',
        'PRI_lep_py_abs',
        'PRI_tau_px',
        'PRI_tau_py',
        'PRI_tau_pz',
        'PRI_tau_pz_abs',
        # leading pxyz has separation but abs
        'PRI_jet_leading_px',
        'PRI_jet_leading_py',
        'PRI_jet_leading_pz',
        'PRI_jet_subleading_px',
        'PRI_jet_subleading_py',
        'PRI_jet_subleading_pz',
    }
    cols_new = [c for c in cols_new if c not in black_list]
    df_new = df_new[cols_new]

    # Column 0 is Label and the last column is Weight; column 1 is skipped
    # as well (presumably the event id -- confirm against the CSV header).
    X_new = df_new[cols_new[2:-1]].values
    labels = df_new['Label'].values
    weights = df_new['Weight'].values

    # NOTE: earlier experiments that were left disabled here: replacing
    # -999.0 with 0., tree-based feature selection (ExtraTrees / AdaBoost)
    # and fitting a Standard/MinMax/Binarizer scaler on train + test data.
    # The features are therefore returned unscaled.
    return X_new, labels, weights
def get_training_data(training_file, test_file):
    """Load the training CSV and split it into features, labels and weights.

    The frame is read with ``pd.read_csv``, the ``Label`` column is mapped
    from ``'b'``/``'s'`` to ``0``/``1``, extra columns are derived through
    ``add_features`` and the black-listed raw columns are dropped.

    Args:
        training_file: Path or buffer handed to ``pd.read_csv``.
        test_file: Unused here; kept so existing callers keep working.

    Returns:
        A tuple ``(X_new, labels, weights)`` of numpy arrays: the values of
        the selected feature columns, of the ``Label`` column and of the
        ``Weight`` column.

    Raises:
        KeyError: If the ``Label`` or ``Weight`` column is missing.
    """
    df = pd.read_csv(training_file)

    # Map y values to integers. Any label other than 'b'/'s' becomes NaN,
    # because Series.map yields NaN for keys missing from the dict.
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

    # Rearrange columns for convenience: label first. It is picked by name
    # so a CSV whose last column is not the label cannot leak it into X.
    original_cols = df.columns.tolist()
    df = df[['Label'] + [c for c in original_cols if c != 'Label']]

    df_new = add_features(df)

    # Make the weight the last column. This is done by name: the previous
    # fixed index (32) only worked for one exact column layout.
    cols_new = [c for c in df_new.columns.tolist() if c != 'Weight']
    cols_new.append('Weight')

    # Raw columns excluded from the feature matrix.
    black_list = {
        # the raw phi values are really noisy, as expected
        'PRI_met_phi',
        'PRI_lep_phi',
        'PRI_tau_phi',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_phi',
        'PRI_tau_eta',
        'PRI_lep_eta',
        # replace with abs values
        'PRI_jet_leading_eta',
        'PRI_jet_subleading_eta',
        # these raw values are noisy
        'PRI_lep_px',
        'PRI_lep_py',
        'PRI_lep_pz',
        'PRI_lep_px_abs',
        'PRI_lep_py_abs',
        'PRI_tau_px',
        'PRI_tau_py',
        'PRI_tau_pz',
        'PRI_tau_pz_abs',
        # leading pxyz has separation but abs
        'PRI_jet_leading_px',
        'PRI_jet_leading_py',
        'PRI_jet_leading_pz',
        'PRI_jet_subleading_px',
        'PRI_jet_subleading_py',
        'PRI_jet_subleading_pz',
    }
    cols_new = [c for c in cols_new if c not in black_list]
    df_new = df_new[cols_new]

    # Column 0 is Label and the last column is Weight; column 1 is skipped
    # as well (presumably the event id -- confirm against the CSV header).
    X_new = df_new[cols_new[2:-1]].values
    labels = df_new['Label'].values
    weights = df_new['Weight'].values

    # NOTE: earlier experiments that were left disabled here: replacing
    # -999.0 with 0., tree-based feature selection (ExtraTrees / AdaBoost)
    # and fitting a Standard/MinMax/Binarizer scaler on train + test data.
    # The features are therefore returned unscaled.
    return X_new, labels, weights
# Pipeline steps implemented in sibling modules.
from add_features import add_features
from make_msci_regions import add_msci_regions
from calculate_returns2 import add_returns

# Stage 1: rebuild the processed data set, timing the whole run.
# ``if True:`` works as a manual on/off switch for this stage.
if True:
    start = time.time()
    print("Started data processing...")
    # NOTE(review): refresh_data is not defined in the visible part of the
    # file -- presumably defined or imported further up; confirm.
    refresh_data()
    print("Adding MSCI regions...")
    add_msci_regions()
    print("Checking dates and prices...")
    # NOTE(review): check_data_and_prices is likewise defined outside the
    # visible part of the file.
    check_data_and_prices()
    print("Calculating returns...")
    add_returns()
    print("Calculating volatility, beta, ratios, market cap...")
    add_features()
    end = time.time()
    print("Done! Task took {} seconds.\n\n".format(end - start))
    print(
        "WARN: RUN data_process_msci FROM ml_advisor TO ENCODE regions AND sectors"
    )
    # Drop the timing variables so they do not linger at module level.
    del start, end

# Stage 2: load the 2016 and 2017 fundamentals frames from their HDF5
# files (key "dataset1/x").
if True:
    fundamentals_2016 = pd.read_hdf(
        "../sources/fundamentals_2016_with_feat_msci_regions.hdf5",
        "dataset1/x")
    fundamentals_2017 = pd.read_hdf(
        "../sources/fundamentals_2017_with_feat_msci_regions.hdf5",
        "dataset1/x")