Exemple #1
0
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import load_data as ld


## load the training data
training_file = "../data/census_income_learn.csv"
metadata_file = "../data/census_income_metadata.txt"
df = ld.prepare_dataframe(training_file, metadata_file=metadata_file)


## pairwise scatter plots of the dataframe columns
# 64x48 inches saved at 150 dpi gives a 9600x7200 pixel image.
dpi = 150
figsize = (64, 48)

# scatter_matrix is exposed as pandas.plotting.scatter_matrix since
# pandas 0.20; fall back to the legacy pandas.tools.plotting location so
# that older installations keep working.
try:
    scatter_matrix = pd.plotting.scatter_matrix
except AttributeError:
    scatter_matrix = pd.tools.plotting.scatter_matrix

# scatter_matrix builds its own figure when no `ax` is passed, so no
# plt.figure() call is made beforehand (it would only allocate an empty,
# unused canvas). The new figure becomes pyplot's current figure, which is
# what tight_layout() and savefig() below operate on.
axes = scatter_matrix(df, alpha=0.02, figsize=figsize)
plt.tight_layout()
plt.savefig('./figures/pairwise.png', dpi=dpi)



## list numerical vs categorical variables
colnames = df.columns.values
# A pandas Series has no is_numeric() method (that one belongs to Index),
# so each column is classified from its dtype instead. dtype=bool keeps the
# mask usable for boolean indexing even when there are no columns at all.
is_numerical = np.array(
    [pd.api.types.is_numeric_dtype(df[c]) for c in colnames], dtype=bool)
is_categorical = np.logical_not(is_numerical)
numerical_variables = list(colnames[is_numerical])
# Drop the two "recode" columns from the numerical list.
# NOTE(review): presumably these are integer-coded categories rather than
# true quantities -- confirm against the metadata file. list.remove raises
# ValueError if a name is missing from the list.
numerical_variables.remove("detailed industry recode")
numerical_variables.remove("detailed occupation recode")

Exemple #2
0
    print "Score summary: ", round(float(np.trace(confusion_score))/len(y_valid), 3)*100., "%"
    # read confusion matrix as follows:
    # true = earn 50000+
    # (expected false, predicted false) (expected false, predicted true)
    # (expected true, predicted false) (expected true, predicted true)







## LOAD DATA
print("loading data...")
### read the training split, then the validation split, with shared metadata
train, valid = (
    ld.prepare_dataframe(csv_path, metadata_file=METADATA_FILE)
    for csv_path in (TRAINING_FILE, VALIDATION_FILE)
)
### pre-process both frames together
train, valid = feat.engineer(train, valid)

### shortcuts: every column except the prediction column, per split ...
features_train, features_valid = (
    frame.drop(PREDICTION_COLNAME, axis=1) for frame in (train, valid)
)
### ... and the prediction column on its own, per split
target_train, target_valid = (
    frame[PREDICTION_COLNAME] for frame in (train, valid)
)



## SELECT FEATURES
### Meant to be the first step of an sklearn pipeline; done by hand here
### because, per the original note, the pipeline bugs with python-2.7.
print("selecting features...")
features_train, features_valid = selector.reduce_dimension(
    features_train,
    features_valid,
)