Exemple #1
0
def get_training_data(training_file, test_file):
    '''
    Loads the training data and splits it into features, labels and weights.

    Args:
        training_file: location of the training CSV, handed to pd.read_csv.
        test_file: not used by this function; kept so that existing callers
            passing two arguments keep working.

    Returns:
        A tuple (X_new, labels, weights):
          * X_new   - values of the selected feature columns,
          * labels  - values of the 'Label' column ('b' -> 0, 's' -> 1),
          * weights - values of the 'Weight' column.

    Progress is reported with print(). Every call passes exactly one
    parenthesised argument, so the output is identical under Python 2 (where
    print is a statement) and Python 3 (where it is a function).
    '''
    # load training data
    df = pd.read_csv(training_file)

    # map y values to integers
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

    # rearrange columns for convenience: the last column moves to the front
    cols = df.columns.tolist()
    cols = [cols[-1]] + cols[:-1]
    df = df[cols]

    print('original features')
    print(df.columns)

    df_new = add_features(df)
    cols_new = df_new.columns.tolist()
    # move the column at position 32 to the very end; this is expected to be
    # 'Weight', so that the [2:-1] slice below leaves it out of the features
    cols_new = cols_new[:32] + cols_new[33:] + [cols_new[32]]

    # only remove phi and tau/lep eta: the raw phi values turned out to be
    # really noisy, as expected
    black_list = [
        'PRI_met_phi',
        'PRI_lep_phi',
        'PRI_tau_phi',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_phi',
        'PRI_tau_eta',
        'PRI_lep_eta',
        # further candidates, currently kept as features:
        #'PRI_jet_leading_eta','PRI_jet_subleading_eta',#replace with abs values
        #'PRI_lep_px','PRI_lep_py','PRI_lep_pz', 'PRI_lep_px_abs','PRI_lep_py_abs',#these raw values are noisy
        #'PRI_tau_px','PRI_tau_py','PRI_tau_pz', 'PRI_tau_pz_abs',
        #'PRI_jet_leading_px','PRI_jet_leading_py','PRI_jet_leading_pz', #leading pxyz has separation but abs
        #'PRI_jet_subleading_px','PRI_jet_subleading_py','PRI_jet_subleading_pz',
    ]
    cols_new = [c for c in cols_new if c not in black_list]
    df_new = df_new[cols_new]
    print('newly added features')
    print(df_new.columns)

    # features are everything except the first two columns and the last one
    feature_cols = cols_new[2:-1]
    # two spaces on purpose: the Python 2 statement `print 'select X features ', x`
    # emitted the literal's trailing space plus the item separator
    print('select X features  ' + str(feature_cols))

    # convert into numpy arrays
    X_new = df_new[feature_cols].values
    labels = df_new['Label'].values
    weights = df_new['Weight'].values

    return X_new, labels, weights
def get_training_data(training_file, test_file):
    '''
    Loads the training data and splits it into features, labels and weights.

    Args:
        training_file: location of the training CSV, handed to pd.read_csv.
        test_file: not used by this function; kept so that existing callers
            passing two arguments keep working.

    Returns:
        A tuple (X_new, labels, weights):
          * X_new   - values of the selected feature columns,
          * labels  - values of the 'Label' column ('b' -> 0, 's' -> 1),
          * weights - values of the 'Weight' column.

    Progress is reported with print(). Every call passes exactly one
    parenthesised argument, so the output is identical under Python 2 (where
    print is a statement) and Python 3 (where it is a function).
    '''
    # load training data
    df = pd.read_csv(training_file)

    # map y values to integers
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

    # rearrange columns for convenience: the last column moves to the front
    cols = df.columns.tolist()
    cols = [cols[-1]] + cols[:-1]
    df = df[cols]

    print('original features')
    print(df.columns)

    df_new = add_features(df)
    cols_new = df_new.columns.tolist()
    # move the column at position 32 to the very end; this is expected to be
    # 'Weight', so that the [2:-1] slice below leaves it out of the features
    cols_new = cols_new[:32] + cols_new[33:] + [cols_new[32]]

    # only remove phi and tau/lep eta: the raw phi values turned out to be
    # really noisy, as expected
    black_list = ['PRI_met_phi', 'PRI_lep_phi', 'PRI_tau_phi',
                  'PRI_jet_leading_phi', 'PRI_jet_subleading_phi',
                  'PRI_tau_eta', 'PRI_lep_eta',
                  # further candidates, currently kept as features:
                  #'PRI_jet_leading_eta','PRI_jet_subleading_eta',#replace with abs values
                  #'PRI_lep_px','PRI_lep_py','PRI_lep_pz', 'PRI_lep_px_abs','PRI_lep_py_abs',#these raw values are noisy
                  #'PRI_tau_px','PRI_tau_py','PRI_tau_pz', 'PRI_tau_pz_abs',
                  #'PRI_jet_leading_px','PRI_jet_leading_py','PRI_jet_leading_pz', #leading pxyz has separation but abs
                  #'PRI_jet_subleading_px','PRI_jet_subleading_py','PRI_jet_subleading_pz',
                  ]
    cols_new = [c for c in cols_new if c not in black_list]
    df_new = df_new[cols_new]
    print('newly added features')
    print(df_new.columns)

    # features are everything except the first two columns and the last one
    feature_cols = cols_new[2:-1]
    # two spaces on purpose: the Python 2 statement `print 'select X features ', x`
    # emitted the literal's trailing space plus the item separator
    print('select X features  ' + str(feature_cols))

    # convert into numpy arrays
    X_new = df_new[feature_cols].values
    labels = df_new['Label'].values
    weights = df_new['Weight'].values

    return X_new, labels, weights
# Script fragment: reorders the columns of an existing training frame `df`,
# adds the derived features and extracts the feature matrix X_new.
# NOTE(review): `df` is not defined in the visible lines (nor are `dpath` and
# `modelfile`, used further down) - presumably they are set earlier in the
# original script; confirm before running this on its own.
# rearrange columns for convenience
cols = df.columns.tolist()
cols = [cols[-1]] + cols[:-1]
df = df[cols]

df_new = add_features(df)
cols_new = df_new.columns.tolist()
cols_new = cols_new[:32]+cols_new[33:]+[cols_new[32]] #make the weight to the last
df_new=df_new[cols_new]
X_new = df_new[cols_new[2:-1]].values
#idx = dtest[:,0]
# NOTE(review): the triple-quote delimiter on the next line has no partner
# inside this fragment. Check the full script to see whether it closes a
# disabled region that started earlier or opens a new one.
'''
df_test = pd.read_csv(dpath+'/test.csv')
#df_test.replace(-999.0,0.)
df_test_data = add_features(df_test)
cols_new = df_test_data.columns.tolist()
black_list = ['PRI_met_phi', 'PRI_lep_phi', 'PRI_tau_phi', 'PRI_jet_leading_phi','PRI_jet_subleading_phi',
              'PRI_tau_eta','PRI_lep_eta'] 
cols_new = [c for c in cols_new if c not in black_list]
df_test_data=df_test_data[cols_new]
data = df_test_data.values[:,1:]
#scaler = StandardScaler().fit(np.vstack((X_new, data))) #normalize with training scale
#scaler = MinMaxScaler(feature_range=(-10,10)).fit(np.vstack((X_new, data)))
#data = scaler.transform(data)
idx = df_test_data.values[:,0]

print ('finish loading from csv ')
xgmat = xgb.DMatrix( data, missing = -999.0 )
bst = xgb.Booster({'nthread':16})
bst.load_model( modelfile )
def get_training_data(training_file, test_file):
    '''
    Reads the training CSV and returns the inputs needed for training.

    training_file is handed to pd.read_csv. test_file is accepted but not
    used by the active code; it was only needed by disabled experiments that
    fitted a scaler (StandardScaler / MinMaxScaler / Binarizer) on the
    training and test features together.

    Returns the tuple (X_new, labels, weights), each obtained through
    .values from the prepared frame.
    '''
    frame = pd.read_csv(training_file)

    # encode the class label as an integer: 'b' -> 0, 's' -> 1
    frame['Label'] = frame['Label'].map({'b': 0, 's': 1})

    # rotate the last column to the front
    ordered = frame.columns.tolist()
    ordered.insert(0, ordered.pop())
    frame = frame[ordered]

    enriched = add_features(frame)

    # send the column sitting at position 32 to the end
    # (the weight column, according to the original note)
    names = enriched.columns.tolist()
    names.append(names.pop(32))

    # Columns withheld from the model: the phi angles and tau/lep eta (phi by
    # itself proved really noisy), the jet eta columns (replaced with abs
    # values) and the raw / abs momentum components (noisy raw values).
    dropped = frozenset([
        'PRI_met_phi', 'PRI_lep_phi', 'PRI_tau_phi',
        'PRI_jet_leading_phi', 'PRI_jet_subleading_phi',
        'PRI_tau_eta', 'PRI_lep_eta',
        'PRI_jet_leading_eta', 'PRI_jet_subleading_eta',
        'PRI_lep_px', 'PRI_lep_py', 'PRI_lep_pz',
        'PRI_lep_px_abs', 'PRI_lep_py_abs',
        'PRI_tau_px', 'PRI_tau_py', 'PRI_tau_pz', 'PRI_tau_pz_abs',
        'PRI_jet_leading_px', 'PRI_jet_leading_py', 'PRI_jet_leading_pz',
        'PRI_jet_subleading_px', 'PRI_jet_subleading_py',
        'PRI_jet_subleading_pz',
    ])
    kept = [name for name in names if name not in dropped]
    enriched = enriched[kept]

    # features: every kept column except the first two and the last one
    X_new = enriched[kept[2:-1]].values
    labels = enriched['Label'].values
    weights = enriched['Weight'].values

    # Tree-based feature selection (ExtraTreesClassifier / AdaBoostClassifier)
    # and feature scaling were tried here as well and are currently disabled.
    return X_new, labels, weights
def get_training_data(training_file, test_file):
    '''
    Builds the training arrays from the training CSV.

    training_file goes straight to pd.read_csv. test_file is part of the
    signature but the active code never reads it (only the disabled
    joint train/test scaling experiments referred to it).

    Returns (X_new, labels, weights): the selected feature values, the
    integer-encoded 'Label' values and the 'Weight' values.
    '''
    raw = pd.read_csv(training_file)

    # 'b' becomes 0 and 's' becomes 1
    label_codes = {'b': 0, 's': 1}
    raw['Label'] = raw['Label'].map(label_codes)

    # bring the last column to the front
    col_order = raw.columns.tolist()
    tail = col_order.pop()
    col_order.insert(0, tail)
    raw = raw[col_order]

    with_feats = add_features(raw)

    # push the column at index 32 to the end
    # (the weight column, according to the original note)
    all_cols = with_feats.columns.tolist()
    moved = all_cols.pop(32)
    all_cols.append(moved)

    # Excluded from the model inputs. phi by itself was found to be really
    # noisy; the jet eta columns are replaced with abs values; the raw
    # momentum components are noisy.
    excluded = frozenset((
        'PRI_met_phi',
        'PRI_lep_phi',
        'PRI_tau_phi',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_phi',
        'PRI_tau_eta',
        'PRI_lep_eta',
        'PRI_jet_leading_eta',
        'PRI_jet_subleading_eta',
        'PRI_lep_px',
        'PRI_lep_py',
        'PRI_lep_pz',
        'PRI_lep_px_abs',
        'PRI_lep_py_abs',
        'PRI_tau_px',
        'PRI_tau_py',
        'PRI_tau_pz',
        'PRI_tau_pz_abs',
        'PRI_jet_leading_px',
        'PRI_jet_leading_py',
        'PRI_jet_leading_pz',
        'PRI_jet_subleading_px',
        'PRI_jet_subleading_py',
        'PRI_jet_subleading_pz',
    ))
    selected = [col for col in all_cols if col not in excluded]
    with_feats = with_feats[selected]

    # the first two columns and the final one are not features
    feature_names = selected[2:-1]
    X_new = with_feats[feature_names].values
    labels = with_feats['Label'].values
    weights = with_feats['Weight'].values

    # Disabled experiments that used to follow: scaling with StandardScaler /
    # MinMaxScaler / Binarizer fitted on train and test together, and
    # tree-based feature selection (ExtraTreesClassifier, AdaBoostClassifier).
    return X_new, labels, weights
Exemple #6
0
from add_features import add_features
from make_msci_regions import add_msci_regions
from calculate_returns2 import add_returns

# Data-processing pipeline: runs each preparation step in order, announcing
# every step on stdout, and reports the elapsed wall-clock time at the end.
# `if True:` looks like a manual on/off switch for this stage (presumably
# flipped by hand - confirm).
# NOTE(review): `time`, `refresh_data` and `check_data_and_prices` are not
# imported in the visible lines; presumably they are imported elsewhere.
if True:
    start = time.time()
    print("Started data processing...")
    refresh_data()
    print("Adding MSCI regions...")
    add_msci_regions()
    print("Checking dates and prices...")
    check_data_and_prices()
    print("Calculating returns...")
    add_returns()
    print("Calculating volatility, beta, ratios, market cap...")
    add_features()
    end = time.time()
    print("Done! Task took {} seconds.\n\n".format(end - start))

    # reminder of a follow-up step that this script does not perform itself
    print(
        "WARN: RUN data_process_msci FROM ml_advisor TO ENCODE regions AND sectors"
    )

    # drop the timing names so they do not linger in the module namespace
    del start, end
# Reads the 2016 and 2017 fundamentals tables from their HDF5 files, both
# stored under the key "dataset1/x".
# NOTE(review): the paths are relative ("../sources/..."), so the result
# depends on the working directory the script is started from.
if True:
    fundamentals_2016 = pd.read_hdf(
        "../sources/fundamentals_2016_with_feat_msci_regions.hdf5",
        "dataset1/x")
    fundamentals_2017 = pd.read_hdf(
        "../sources/fundamentals_2017_with_feat_msci_regions.hdf5",
        "dataset1/x")