Example #1
0
        lb = LabelBinarizer()
        feature_numeric = lb.fit_transform(df[[feature]])
    else:
        feature_numeric = lb.transform(df[[feature]])
    col_names = map(lambda x: feature + "_" + str(x).strip(),
                    list(lb.classes_))
    if lb.classes_.shape[0] == 2:
        col_names = col_names[:1]
    feature_df = pd.DataFrame(feature_numeric,
                              columns=col_names,
                              index=df.index)
    df = df.join(feature_df)
    return [lb, df]


# Load the census data (first CSV column becomes the index) and split it
# into train and test frames with a fixed seed.
orig = pd.read_csv_sync(DATA_PATH + 'adult_with_colnames.csv', index_col=0)
train, test = cross_validation.train_test_split_sync(
    orig, test_size=0.3, random_state=501)

# oneHotEncoding hands back the binarizer together with the frame joined
# with the generated "<feature>_<class>" indicator columns.
lb, train = oneHotEncoding(None, "workclass", train)
cols = [col for col in train.columns if "workclass_" in col]
lb2, train = oneHotEncoding(None, "sex", train)
cols = [col for col in train.columns if "sex_" in col]

# The raw categorical columns are dropped once their indicators exist.
train = train.drop(["workclass", "sex"], axis=1)
new_cols = [
    col for col in train.columns
    if any(prefix in col for prefix in ("workclass_", "sex_"))
]

# Model inputs: three numeric columns plus every generated indicator.
logreg = linear_model.LogisticRegression(C=10)
features = ['capital-gain', 'capital-loss', 'age'] + new_cols
Example #2
0
from modeldb.sklearn_native import SyncableMetrics

# Relative locations of the server and data directories.
# NOTE(review): ROOT_DIR is not referenced in the visible part of this script.
ROOT_DIR = '../../../../server/'
DATA_PATH = '../../../../data/'

# Metadata handed to the ModelDB Syncer below.
name = "test1"
author = "author"
description = "kaggle-iris-script"
# Creating a new project
syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                    NewOrExistingExperiment("expName", "expDesc"),
                    NewExperimentRun("iris test"))
"""
Cleaning up data first.
"""
# Read the raw iris CSV; the literal string 'NA' is parsed as a missing value.
iris_data = pd.read_csv_sync(DATA_PATH + 'iris-data.csv', na_values=['NA'])

# Normalise two inconsistent class labels so every row uses the
# 'Iris-<species>' spelling.
iris_data.loc[iris_data['class'] == 'versicolor', 'class'] = 'Iris-versicolor'
iris_data.loc[iris_data['class'] == 'Iris-setossa', 'class'] = 'Iris-setosa'

# This line drops any 'Iris-setosa' rows with a sepal width less than 2.5 cm
iris_data = iris_data.loc[(iris_data['class'] != 'Iris-setosa') |
                          (iris_data['sepal_width_cm'] >= 2.5)]
# Histogram of the remaining 'Iris-setosa' sepal widths.
iris_data.loc[iris_data['class'] == 'Iris-setosa', 'sepal_width_cm'].hist()

# NOTE(review): bare expression -- the selected rows are computed and then
# discarded when this runs as a script (it only displays something in an
# interactive session).
iris_data.loc[(iris_data['class'] == 'Iris-versicolor')
              & (iris_data['sepal_length_cm'] < 1.0)]

# Scale 'Iris-versicolor' sepal lengths below 1.0 by 100 in place.
# Presumably these rows were recorded in metres instead of centimetres --
# TODO confirm against the data source.
iris_data.loc[(iris_data['class'] == 'Iris-versicolor') &
              (iris_data['sepal_length_cm'] < 1.0), 'sepal_length_cm'] *= 100.0
Example #3
0
def _sync_log_loss(model, y_true, y_prob, x_eval):
    """Pass one log-loss evaluation of *model* to ``SyncableMetrics``.

    Both evaluations in ``run_otto_workflow`` go through this helper so they
    use identical ``log_loss`` settings (``eps=1e-15``, ``normalize=True``).
    The two empty strings are forwarded unchanged as the fifth and sixth
    positional arguments of ``compute_metrics``.
    """
    SyncableMetrics.compute_metrics(model,
                                    log_loss,
                                    y_true,
                                    y_prob,
                                    x_eval,
                                    "",
                                    "",
                                    eps=1e-15,
                                    normalize=True)


def run_otto_workflow():
    """Run the Otto example: bagged vs. calibrated random forest.

    Loads the Otto training CSV, tags the intermediate frames on the syncer,
    fits a bagged random forest and an isotonic-calibrated random forest on
    the same split, hands the log loss of each to ``SyncableMetrics`` and
    finally calls ``syncer_obj.sync()``.

    Returns:
        A tuple ``(syncer_obj, x_train, x_test)``.
    """
    name = "test1"
    author = "author"
    description = "kaggle-otto-script"
    # Creating a new project
    syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                        NewOrExistingExperiment("expName", "expDesc"),
                        NewExperimentRun("otto test"))

    # Import Data
    # Note: This dataset is not included in the repo because of Kaggle
    # restrictions.
    # It can be downloaded from
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/data
    X = pd.read_csv_sync(DATA_PATH + 'otto-train.csv')
    syncer_obj.add_tag(X, "original otto csv data")
    X = X.drop_sync('id', axis=1)

    syncer_obj.add_tag(X, "dropped id column")
    # Extract the target and encode it as integers so the classifiers can
    # consume it.
    y = X.target.values

    y = LabelEncoder().fit_transform_sync(y)

    # Remove the target from the features, else the task is trivial.
    X = X.drop_sync('target', axis=1)

    syncer_obj.add_tag(X, "data with dropped id and target columns")

    # Split Train / Test
    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.20, random_state=36)

    syncer_obj.add_tag(x_test, "testing data")
    syncer_obj.add_tag(x_train, "training data")

    # First, train and apply a Random Forest WITHOUT calibration.
    # A BaggingClassifier averages 5 models because CalibratedClassifierCV
    # (cv=5) also averages several models behind the scenes; this keeps the
    # comparison fair, so that averaging alone cannot explain a difference
    # between the uncalibrated and the calibrated result.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)

    clfbag = BaggingClassifier(clf, n_estimators=5)
    clfbag.fit_sync(x_train, y_train)

    y_preds = clfbag.predict_proba_sync(x_test)

    # The loss is handed to SyncableMetrics instead of being printed.
    _sync_log_loss(clfbag, y_test, y_preds, x_test)

    # Now train and apply a Random Forest WITH calibration.
    # In this case 'isotonic' worked better than the default 'sigmoid'.
    # That is not always so; depending on the data, both should be tried.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    calibrated_clf.fit_sync(x_train, y_train)
    y_preds = calibrated_clf.predict_proba_sync(x_test)
    _sync_log_loss(calibrated_clf, y_test, y_preds, x_test)

    # NOTE: this conclusion is printed unconditionally; the two losses are
    # not compared here.
    print(" ")
    # Adjacent string literals are concatenated verbatim, so the separating
    # space has to be part of one of them.
    print("Conclusion : in our case, calibration improved "
          "performance a lot ! (reduced loss)")
    syncer_obj.sync()
    return syncer_obj, x_train, x_test
Example #4
0
'''
Source: http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
'''

# NOTE(review): the "# modeldb start" / "# modeldb end" markers appear to
# bracket the statements that exist only because of ModelDB -- confirm.
# modeldb start

# Metadata handed to the ModelDB Syncer.
name = "simple sample"
author = "srinidhi"
description = "simple LR for credit default prediction"
syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                    DefaultExperiment(), NewExperimentRun("credit test"))

# modeldb end

# Read the credit-default CSV, skipping the first line of the file.
# NOTE(review): DATA_PATH is not defined in the visible lines above this
# point; presumably it is set earlier in the original script.
# modeldb start
df = pd.read_csv_sync(DATA_PATH + 'credit-default.csv', skiprows=[0])
# modeldb end

# Label column, taken before df is narrowed to the five feature columns.
target = df['default payment next month']
df = df[["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"]]

# No random_state is passed here.
# NOTE(review): assuming sklearn's train_test_split semantics, the split
# therefore differs between runs -- confirm whether that is intended.
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    df, target, test_size=0.3)

lr = linear_model.LogisticRegression(C=2)

# Fit through the *_sync variant rather than the plain fit().
# modeldb start
lr.fit_sync(x_train, y_train)
# modeldb end

# modeldb start
# Relative locations of the server and data directories.
# NOTE(review): ROOT_DIR is not referenced in the visible part of this script.
ROOT_DIR = '../../../../server/'
DATA_PATH = '../../../../data/'

# Metadata handed to the ModelDB Syncer below.
name = "test1"
author = "author"
description = "kaggle-titanic-script"
# Creating a new project
syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                    NewOrExistingExperiment("expName", "expDesc"),
                    NewExperimentRun("titanic test"))

# Read the training set csv file.
# Note: This dataset is not included in the repo because of Kaggle
# restrictions.
# It can be downloaded from https://www.kaggle.com/c/titanic/data
titanic = pd.read_csv_sync(DATA_PATH + 'titanic-train.csv')

# =====================Preprocessing the data=====================
# Fill the missing value in "Age".
# The median of the non-missing ages is used as the replacement.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Converting the Sex Column to numeric value
# (male -> 0, female -> 1), written in place through .loc.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
# Converting the Embarked Column
# Missing ports are filled with "S" first, then S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

# NOTE: .loc commands don't create a new dataframe id
Example #6
0
from modeldb.sklearn_native.ModelDbSyncer import *
from modeldb.sklearn_native import SyncableMetrics

DATA_PATH = '../../../../data/'
# Pipelining example: a PCA chained with a logistic regression, run on the
# UCI Census Adult dataset.

name = "pipeline census"
author = "srinidhi"
description = "census data"
syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                    DefaultExperiment(),
                    NewExperimentRun("Abc"))

df = pd.read_csv_sync(DATA_PATH + 'adult.data.csv')
new_df = pd.DataFrame()
# Assign explicit names to the 15 columns of the frame.
df.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_level',
]

le = preprocessing.LabelEncoder()

# Strip surrounding whitespace from the income labels, then map
# '<=50K' to 0.0 and '>50K' to 1.0.
df['income_level'] = (
    df['income_level'].str.strip()
    .replace(['<=50K'], [0.0])
    .replace(['>50K'], [1.0])
)

# calling labelEncoder on any columns that are object types
for coltype, colname in zip(df.dtypes, df.columns):