def predict(data):

    # Sample input
    # {"RECENCY": 11,
    #  "HISTORY": 204,
    #  "USED_DISCOUNT": 0,
    #  "USED_BOGO": 1,
    #  "IS_REFERRAL": 1,
    #  "SCORE": 0.766}
    #
    # Sample output
    # {"prediction": 1.0, "conversion_probability": ...}

    df = pd.DataFrame(data, index=[0])

    df.columns = [
        'recency', 'history', 'used_discount', 'used_bogo', 'is_referral',
        'score'
    ]

    df = df.astype('float')

    prediction = float(clf.predict(df)[0])
    probability = float(clf.predict_proba(df)[0][1])

    # Track model metrics
    cdsw.track_metric('prediction', prediction)
    cdsw.track_metric('conversion_probability', probability)

    return {'prediction': prediction, 'conversion_probability': probability}
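# A minimal session test of the function above, assuming `pd`, `cdsw`, and a
# fitted classifier `clf` are already available; the values come from the
# sample-input comment at the top of the function.
# sample = {"RECENCY": 11, "HISTORY": 204, "USED_DISCOUNT": 0,
#           "USED_BOGO": 1, "IS_REFERRAL": 1, "SCORE": 0.766}
# predict(sample)  # -> {'prediction': ..., 'conversion_probability': ...}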
Example #2
def predict_cancelled(args):
    inputs = args["feature"].split(",")
    inputs[3] = int(inputs[3])
    inputs[4] = int(inputs[4])
    inputs[5] = int(inputs[5])
    inputs[6] = int(inputs[6])
    inputs[7] = int(inputs[7])

    input_cols = [
        "OP_CARRIER",
        "ORIGIN",
        "DEST",
        "CRS_DEP_TIME",
        "CRS_ELAPSED_TIME",
        "DISTANCE",
        "WEEK",
        "HOUR",
    ]
    input_df = pd.DataFrame([inputs], columns=input_cols)

    input_transformed = ct.transform(input_df)

    prediction = pipe.predict(input_transformed)
    cdsw.track_metric("input_data", args)
    cdsw.track_metric("prediction", int(prediction[0]))

    return {"prediction": int(prediction[0])}
Example #3
def predict(args):

    cdsw.track_metric("input", args)
    petal_length = float(args.get('petal_length'))

    result = model.predict([[petal_length]])
    cdsw.track_metric("predict_result", result[0][0])
    modified_result = result[0][0] + 1
    cdsw.track_aggregate_metrics({"modified_result": modified_result},
                                 start_timestamp_ms,
                                 end_timestamp_ms,
                                 model_deployment_crn=Deployment_CRN)
    return result[0][0]
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)

    #Do the prediction and provide weights for the reasoning
    probability, explanation = em.explain_dct(data)

    #NEW! Track our inputs
    for key in data:
        if isinstance(data[key], numpy.int64) or isinstance(
                data[key], numpy.float64):
            cdsw.track_metric(key, data[key].item())
        else:
            cdsw.track_metric(key, data[key])

    #NEW! Track our prediction
    cdsw.track_metric('probability', probability)

    #NEW! Track explanation
    cdsw.track_metric('explanation', explanation)

    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
    }
def predict(args):
    # Track the input.
    cdsw.track_metric("input", args)

    # If this model involved engineered features, i.e. transformations of the
    # raw input, they could be tracked as well.
    # cdsw.track_metric("feature_vars", {"a":1,"b":23})

    petal_length = float(args.get('petal_length'))
    result = model.predict([[petal_length]])

    # Track the output.
    cdsw.track_metric("predict_result", result[0][0])
    return result[0][0]
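# Minimal session test of the function above (assumes `model` and `cdsw` are
# already loaded; the petal length value is illustrative only).
# predict({"petal_length": 1.4})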
Example #6
def predict(args):

    data = pd.DataFrame(args, index=[0])
    data = data.astype(float)
    prediction = clf.predict(data)
    probability = clf.predict_proba(data)

    # Track inputs
    #cdsw.track_metric('input_data', data)

    # Track the prediction probability (as a float, so it isn't truncated)
    cdsw.track_metric('probability', float(probability[0][0]))

    # Track the predicted class
    cdsw.track_metric('explanation', int(prediction[0]))

    return {'prediction': int(prediction[0]), 'probability': float(probability[0][0])}
Example #7
def predict_cancelled(args):
  inputs = args['feature'].split(",")
  inputs[1] = int(inputs[1])
  inputs[4] = int(inputs[4])
  inputs[5] = int(inputs[5])
  inputs[6] = int(inputs[6])
  inputs[7] = int(inputs[7])
  inputs[8] = int(inputs[8])

  input_cols = ['OP_CARRIER','OP_CARRIER_FL_NUM','ORIGIN','DEST','CRS_DEP_TIME','CRS_ELAPSED_TIME','DISTANCE','WEEK','HOUR']
  input_df = pd.DataFrame([inputs],columns=input_cols )

  input_transformed = ct.transform(input_df)
  
  prediction = pipe.predict(input_transformed)
  cdsw.track_metric('input_data', args)
  cdsw.track_metric('prediction', int(prediction[0]))
  
  return {
    "prediction" : int(prediction[0])
  }
def predict_cancelled(args):
    inputs = args["feature"].split(",")
    inputs[3] = int(inputs[3])
    inputs[4] = int(inputs[4])


    input_cols = [
        "OP_CARRIER",
        "ORIGIN",
        "DEST",
        "WEEK",
        "HOUR",
    ]
    input_df = pd.DataFrame([inputs], columns=input_cols)

    input_transformed = ct.transform(input_df)

    probas = pipe.predict_proba(input_transformed)
    prediction = np.argmax(probas)
    proba = round(probas[0][prediction], 2)
    
    cdsw.track_metric("input_data", args)
    cdsw.track_metric("prediction", int(prediction))
    cdsw.track_metric("proba", str(proba))
    
    response = {"prediction": int(prediction), "proba": str(proba)}

    return response
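# Illustrative call for this variant (values are made up): the "feature"
# string must contain five comma-separated fields matching input_cols,
# with WEEK and HOUR as integers.
# predict_cancelled({"feature": "AA,ORD,DFW,22,9"})
# -> {"prediction": <argmax class>, "proba": "<rounded probability>"}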
Example #9
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)
    
    #NEW! Track our inputs
#    for key in data:
#      if isinstance(data[key], numpy.int64) or isinstance(data[key], numpy.float64):
#        cdsw.track_metric(key, data[key].item())
#      else:
#        cdsw.track_metric(key, data[key])

    cdsw.track_metric('input_data', data)
    
    #NEW! Track our prediction
    cdsw.track_metric('probability', probability)
    
    #NEW! Track explanation
    cdsw.track_metric('explanation', explanation)
    
    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
        }
Example #10
def predict_cancelled(args):
    inputs = args['feature'].split(",")
    inputs[1] = int(inputs[1])
    inputs[4] = int(inputs[4])
    inputs[5] = int(inputs[5])
    inputs[6] = int(inputs[6])
    inputs[7] = int(inputs[7])
    inputs[8] = int(inputs[8])

    input_cols = [
        'uniquecarrier', 'flightnum', 'origin', 'dest', 'crsdeptime',
        'crselapsedtime', 'distance', 'week', 'hour'
    ]
    input_df = pd.DataFrame([inputs], columns=input_cols)

    input_transformed = ct.transform(input_df)

    prediction = pipe.predict(input_transformed)
    cdsw.track_metric('input_data', args)
    cdsw.track_metric('prediction', int(prediction[0]))

    return {"prediction": int(prediction[0])}
Example #11
def predict(data):

    df = pd.DataFrame(data, index=[0])
    df.columns = [
        'acc_now_delinq', 'acc_open_past_24mths', 'annual_inc', 'avg_cur_bal',
        'funded_amnt'
    ]

    df = df.astype('float')

    tracked_data = df.astype('str').to_dict('records')[0]

    prediction = str(clf.predict(df)[0])

    # Track prediction
    cdsw.track_metric("prediction", str(prediction))

    cdsw.track_metric("data", df.to_json())

    return {'input_data': str(tracked_data), 'prediction': str(prediction)}


# To test this in a session, comment out the `@cdsw.model_metrics` line,
# then uncomment and run the two lines below.
# x={"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}
# explain(x)
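# A hedged example for testing the `predict` function defined above instead:
# the five keys match the columns it expects; the values are illustrative only.
# y = {"acc_now_delinq": 0, "acc_open_past_24mths": 4, "annual_inc": 65000,
#      "avg_cur_bal": 12000, "funded_amnt": 10000}
# predict(y)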

## Wrap up
#
# We've now covered all the steps to **deploying and serving Models**, including the
# requirements, limitations, and how to set up, test, and use them.
# This is a powerful way to get data scientists' work in use by other people quickly.
#
# In the next part of the project we will explore how to launch a **web application**
# served through CML.
# Your team is busy building models to solve problems.
# CML-hosted Applications are a simple way to get these solutions in front of
# stakeholders quickly.
Example #12
def predict(args):

    data = pd.DataFrame(args, index=[0])
    prediction = pipeline.predict(data)
    probability = clf.predict_proba(data)

    #Track individual inputs -- Already available
    cdsw.track_metric('input_data', data)

    # Track probability -- Already available (as a float, so it isn't truncated)
    cdsw.track_metric('probability', float(probability[0][0]))

    # Track prediction -- Already available
    cdsw.track_metric('prediction', int(prediction[0]))

    return {'prediction': int(prediction[0]), 'probability': float(probability[0][0])}
Example #13
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)

    # Track inputs
    cdsw.track_metric("input_data", data)

    # Track our prediction
    cdsw.track_metric("probability", probability)

    # Track explanation
    cdsw.track_metric("explanation", explanation)

    return {
        "data": dict(data),
        "probability": probability,
        "explanation": explanation
    }
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)
    
    # Track inputs
    cdsw.track_metric('input_data', data)
    
    # Track our prediction
    cdsw.track_metric('probability', probability)
    
    # Track explanation
    cdsw.track_metric('explanation', explanation)
    
    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
        }
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)

    # Track inputs
    cdsw.track_metric('input_data', data)

    # Track our prediction
    cdsw.track_metric('probability', probability)

    # Track explanation
    cdsw.track_metric('explanation', explanation)

    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
    }


# To test this in a session, comment out the `@cdsw.model_metrics` line,
# then uncomment and run the two lines below.
#x={"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}
#explain(x)

## Wrap up
#
# We've now covered all the steps to **deploying and serving Models**, including the
# requirements, limitations, and how to set up, test, and use them.
# This is a powerful way to get data scientists' work in use by other people quickly.
#
# In the next part of the project we will explore how to launch a **web application**
# served through CML.
# Your team is busy building models to solve problems.
# CML-hosted Applications are a simple way to get these solutions in front of
# stakeholders quickly.
#Retrieving Parameters from the Best GBT Model
param_BestModel_NumIter = bestBGTModel._java_obj.getMaxIter()
param_BestModel_Depth = bestBGTModel._java_obj.getMaxDepth()

#Feature Importance
impFeatures = gbtmodel.stages[-1].featureImportances
zipFeaturesToImportanceValue = zip(impFeatures, input_cols)
featureRankings = set(zipFeaturesToImportanceValue)
sortedFeatureRankings = sorted(featureRankings, reverse=True)

"Gradient Boosted Tree - Feature Rankings Sorted By Importance Value %s" % (sortedFeatureRankings)
"When summed together, the importance values equal 1.0"

#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numIter",param_BestModel_NumIter)
cdsw.track_metric("maxDepth",param_BestModel_Depth)
cdsw.track_metric("cvFolds",user_gbt_param_numFolds)


from pyspark.mllib.evaluation import BinaryClassificationMetrics
gbt_labelPredictionSet = gbt_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
gbtmetrics = BinaryClassificationMetrics(gbt_labelPredictionSet)

#Save GBT Model to Disk
gbtmodel.write().overwrite().save("models/spark/gbt")
#Retrieving Parameters from the Best RF Model
param_BestModel_NumTrees = bestRFModel._java_obj.getNumTrees()
param_BestModel_Depth = bestRFModel._java_obj.getMaxDepth()
param_BestModel_impurity = bestRFModel._java_obj.getImpurity()

#Feature Importance
impFeatures = rfmodel.stages[-1].featureImportances
zipFeaturesToImportanceValue = zip(impFeatures, input_cols)
featureRankings = set(zipFeaturesToImportanceValue)
sortedFeatureRankings = sorted(featureRankings, reverse=True)

"Random Forest - Feature Rankings Sorted By Importance Value %s" % (sortedFeatureRankings)
"When summed together, the importance values equal 1.0"

#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numTrees",param_BestModel_NumTrees)
cdsw.track_metric("maxDepth",param_BestModel_Depth)
cdsw.track_metric("impurity",param_BestModel_impurity)
cdsw.track_metric("cvFolds",user_rf_param_numFolds)


from pyspark.mllib.evaluation import BinaryClassificationMetrics
labelPredictionSet = rf_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)

#Save RF Model to Disk
Example #18
print("test", test_score)    
print(classification_report(y_test, pipe.predict(X_test)))
data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1]


# Create LIME Explainer
feature_names = list(ce.columns_)
categorical_features = list(ce.cat_columns_ix_.values())
categorical_names = {i: ce.classes_[c]
                     for c, i in ce.cat_columns_ix_.items()}
class_names = ['No ' + labels.name, labels.name]
explainer = LimeTabularExplainer(ce.transform(data),
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 categorical_features=categorical_features,
                                 categorical_names=categorical_names)    


# Create and save the combined Logistic Regression and LIME Explained Model.
explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear',
                                categoricalencoder=ce, pipeline=pipe,
                                explainer=explainer,data_dir=data_dir)
explainedmodel.save()


# If running as an experiment, this will track the metrics and add the model trained in this run to the experiment history.
cdsw.track_metric("train_score",round(train_score,2))
cdsw.track_metric("test_score",round(test_score,2))
cdsw.track_metric("model_path",explainedmodel.model_path)
cdsw.track_file(explainedmodel.model_path)
Example #19
import cdsw
from pyspark.sql import SparkSession
from pyspark.sql.types import *
"""#uncomment for experiments
# # Get parameters for experiments
# Declare parameters 
param_numTrees= int(sys.argv[1])
param_maxDepth=int(sys.argv[2])
param_impurity=sys.argv[3]

#track parameters in experiments
cdsw.track_metric("numTrees",param_numTrees)
cdsw.track_metric("maxDepth",param_maxDepth)
cdsw.track_metric("impurity",param_impurity)
"""

# Comment out when using experiments
param_numTrees = 10
param_maxDepth = 15
param_impurity = "gini"


spark = SparkSession\
  .builder\
  .appName('wine-quality-analysis')\
  .config("spark.executor.memory","2g")\
  .config("spark.executor.cores","2")\
  .config("spark.executor.instances","3")\
  .config("spark.hadoop.fs.s3a.metadatastore.impl","org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore")\
  .config("spark.hadoop.fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")\
  .config("spark.hadoop.fs.s3a.delegation.token.binding","")\
Example #20
results['best_score'] = gs.best_score_
results['best_params'] = [gs.best_params_]
results['n_splits'] = gs.n_splits_
results['scorer'] = gs.scorer_
results['cv_results'] = [gs.cv_results_]
results['grid'] = [grid]

results_df = pd.DataFrame(results, index=[0])

results_df['timestamp'] = results_df['timestamp'].astype(str)
results_df['clf'] = results_df['clf'].astype(str)
results_df['best_score'] = results_df['best_score'].astype(int)
results_df['best_params'] = results_df['best_params'].astype(str)
results_df['n_splits'] = results_df['n_splits'].astype(int)
results_df['scorer'] = results_df['scorer'].astype(str)
results_df['cv_results'] = results_df['cv_results'].astype(str)
results_df['grid'] = results_df['grid'].astype(str)

print("Best Accuracy Score")
print(results)
cdsw.track_metric("Best Accuracy Score", results_df['best_score'].astype(int))

spark.sql("CREATE TABLE IF NOT EXISTS default.experiment_outcomes (TIMESTAMP STRING, CLASSIFIER STRING, \
            BEST_SCORE INTEGER, BEST_PARAMS STRING, N_SPLITS INT, SCORER STRING, CV_RESULTS STRING, GRID STRING)")

experiments_df = spark.createDataFrame(results_df)
  
experiments_df.write.insertInto("default.experiment_outcomes",overwrite = False) 
    

Example #21
filtered, vocab = cutoff(sorted_vocab, tokenized, freqs)

sentiments, messages = balance_classes(sentiments, filtered)

token_ids = convert_to_token_ids(messages, vocab)

print(type(token_ids))
print(len(token_ids) * 0.5)
split_idx = int(len(token_ids) * 0.5)

split_frac = 0.98  # for small data
#split_frac = 0.8 # for big data
train_features, train_labels, tf, tl, vf, vl = split_data(
    token_ids, sentiments, vocab, split_frac=split_frac)

model = create_model(train_features, train_labels, vocab)

acc, loss = train_model(model, train_features, train_labels,
                        print_every=1)  # for small data
#acc, loss = train_model(model, train_features, train_labels) # for big data

import cdsw
model_filename = "model.torch"
vocab_filename = "vocab.pickle"
cdsw.track_file(model_filename)
cdsw.track_file(vocab_filename)

cdsw.track_metric("Accuracy", acc)
cdsw.track_metric("Loss", loss)
                    as_pandas=True,
                    seed=123)

#Plot the importance of each feature
xgb.plot_importance(xg_reg)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

#Test the final error
final_rmse = cv_results["test-rmse-mean"].iat[-1]
print("Root Mean Std. Err : ", final_rmse)

#Show predictions for next week using forecasted car production
data_newforecast = xgb.DMatrix(data=df_forecast)
new_preds = xg_reg.predict(data_newforecast)
print("Predicted weekly production for Part No ", part_no, ": ", new_preds[0])

#Save model as pickle file
picklefile = part_no + '.pickle'
pickle.dump(xg_reg, open(picklefile, 'wb'))

cdsw.track_metric('RMSE', final_rmse)
cdsw.track_metric('Estimated Part Production', new_preds[0])
"""
def predictParts():
    loaded_model = pickle.load(open(picklefile, 'rb'))
    result = loaded_model.predict(data_newforecast)
    print(result)
    return result
    """
Example #23
    ax.set_facecolor('white')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.set_title('Classifier Accuracy')
    fig.set_facecolor('white')

    plt.plot('epoch',
             'acc',
             label="Accuracy",
             data=df_model,
             markersize=12,
             color='skyblue',
             linewidth=1)
    plt.plot('epoch',
             'val_acc',
             label="Validation Accuracy",
             data=df_model,
             markersize=12,
             color='blue',
             linewidth=1)

    legend = plt.legend(loc="lower right", facecolor='white', framealpha=1)

    plt.show()


plot_classifier_acc()

cdsw.track_metric("Accuracy", 0.9)
cdsw.track_metric("AUC", 0.89)
Example #24
X = cc_data[cc_data.Day < 4].iloc[:, 3:len(cc_data.columns) - 1]
y = cc_data[cc_data.Day < 4].iloc[:, len(cc_data.columns) - 1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = sys.argv[3]

randF = RandomForestClassifier(n_jobs=10,
                               n_estimators=param_numTrees,
                               max_depth=param_maxDepth,
                               criterion=param_impurity,
                               random_state=0)

randF.fit(X_train, y_train)

predictions_rand = randF.predict(X_test)
auroc = roc_auc_score(y_test, predictions_rand)
ap = average_precision_score(y_test, predictions_rand)

cdsw.track_metric("auroc", round(auroc, 2))
cdsw.track_metric("ap", round(ap, 2))

pickle.dump(randF, open("cc_model_check.pkl", "wb"))

cdsw.track_file("cc_model_check.pkl")
Example #25
results = dict()
results['timestamp'] = run_time_suffix
results['clf'] = clf
results['best_estimator'] = gs.best_estimator_
results['best_score'] = gs.best_score_
results['best_params'] = [gs.best_params_]
results['n_splits'] = gs.n_splits_
results['scorer'] = gs.scorer_
results['cv_results'] = [gs.cv_results_]
results['grid'] = [grid]

dump(clf, "models/clf_" + run_time_suffix + ".joblib")

# Track the metric; if the value needs to be reused later, it may be better to also write it to a file.
cdsw.track_metric("Best Accuracy", results['best_score'])

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("PythonSQL")\
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region","us-east-1")\
    .config("spark.yarn.access.hadoopFileSystems","s3a://demo-aws-1/")\
    .config("spark.hadoop.yarn.resourcemanager.principal",os.getenv("HADOOP_USER_NAME"))\
    .getOrCreate()

spark.sql(
    "CREATE TABLE IF NOT EXISTS default.experiment_outcomes (BEST_SCORE FLOAT, N_SPLITS INT)"
)
Example #26
    "total_day_calls", "total_day_charge", "total_eve_calls",
    "total_eve_charge", "total_night_calls", "total_night_charge",
    "total_intl_calls", "total_intl_charge", "number_customer_service_calls"
]

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = 'gini'

randF = RandomForestClassifier(n_jobs=10,
                               n_estimators=param_numTrees,
                               max_depth=param_maxDepth,
                               criterion=param_impurity,
                               random_state=0)

cdsw.track_metric("numTrees", param_numTrees)
cdsw.track_metric("maxDepth", param_maxDepth)
cdsw.track_metric("impurity", param_impurity)

randF.fit(pdTrain[features], pdTrain['label'])

predictions = randF.predict(pdTest[features])

#temp = randF.predict_proba(pdTest[features])

pd.crosstab(pdTest['label'],
            predictions,
            rownames=['Actual'],
            colnames=['Prediction'])

list(zip(pdTrain[features], randF.feature_importances_))
Example #27
test_features = data_features[486:]
test_labels = data_labels[486:]

# Train model

model = ElasticNetCV(normalize=True,
                     l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                     random_state=288793)
model.fit(train_features, train_labels)

# # Results

# ## Hyperparameters selected by CV

model.l1_ratio_
cdsw.track_metric("l1_ratio", model.l1_ratio_)

model.alpha_
cdsw.track_metric("alpha", model.alpha_)

# ## Model coefficients
model.intercept_
cdsw.track_metric("intercept", model.intercept_)

zip(feature_cols, model.coef_)
for i in range(0, len(feature_cols)):
    cdsw.track_metric(feature_cols[i], model.coef_[i])

# ## r squared scores
r_train = model.score(train_features, train_labels)
r_train
Example #28
df = pd.get_dummies(df, columns=["channel", "offer"], drop_first=True)
df = df.drop(columns=['zip_code'])

y = df['conversion']
X = df.drop(columns=['conversion'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Using experiment instance set in CICD.py
data_dir = "data"
clf = LogisticRegression()
grid = {'C': [0.1, 1]}

# Simplified grid search. To do: unroll more GridSearchCV metrics and track them
gs = GridSearchCV(clf, grid, scoring='accuracy')
gs.fit(X_train, y_train)

results = gs.best_score_

#Todo - do this more rigorously following best practices. Pull more metrics out.
#Todo - scoring criteria could be passed in dynamically from the calling script, or custom scoring functions could be added.
print("Best Accuracy Score")
print(results)

run_time_suffix = datetime.datetime.now()
run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S")

dump(clf, "models/clf_" + run_time_suffix + ".joblib")

cdsw.track_metric("Best Accuracy", results)
Example #29
results['best_score'] = gs.best_score_
results['best_params'] = [gs.best_params_]
results['n_splits'] = gs.n_splits_
results['scorer'] = gs.scorer_
results['cv_results'] = [gs.cv_results_]
results['grid'] = [grid]

results_df = pd.DataFrame(results, index=[0])

results_df['timestamp'] = results_df['timestamp'].astype(str)
results_df['clf'] = results_df['clf'].astype(str)
results_df['best_score'] = results_df['best_score'].astype(int)
results_df['best_params'] = results_df['best_params'].astype(str)
results_df['n_splits'] = results_df['n_splits'].astype(int)
results_df['scorer'] = results_df['scorer'].astype(str)
results_df['cv_results'] = results_df['cv_results'].astype(str)
results_df['grid'] = results_df['grid'].astype(str)

print("Best Accuracy Score")
print(results)
cdsw.track_metric("Best Accuracy Score", gs.best_score_)

spark.sql(
    "CREATE TABLE IF NOT EXISTS default.experiment_outcomes (TIMESTAMP STRING, CLASSIFIER STRING, \
            BEST_SCORE INTEGER, BEST_PARAMS STRING, N_SPLITS INT, SCORER STRING, CV_RESULTS STRING, GRID STRING)"
)

experiments_df = spark.createDataFrame(results_df)

experiments_df.write.insertInto("default.experiment_outcomes", overwrite=False)
def predict(args):
  split=args["feature"].split(";")
  features=[list(map(float,split[:11]))]
  features_df = spark.createDataFrame(features, schema)#.collect()
  features_list = features_df.collect()

  # Let's track the inputs to the model
  for x in features_list:
    cdsw.track_metric("fixedAcidity", x["fixedAcidity"])
    cdsw.track_metric("volatileAcidity", x["volatileAcidity"])
    cdsw.track_metric("citricAcid", x["citricAcid"])
    cdsw.track_metric("residualSugar", x["residualSugar"])
    cdsw.track_metric("chlorides", x["chlorides"])
    cdsw.track_metric("freeSulfurDioxide", x["freeSulfurDioxide"])
    cdsw.track_metric("totalSulfurDioxide", x["totalSulfurDioxide"])
    cdsw.track_metric("density", x["density"])
    cdsw.track_metric("pH", x["pH"])
    cdsw.track_metric("sulphates", x["sulphates"])
    cdsw.track_metric("Alcohol", x["Alcohol"])

  resultdf=model.transform(features_df).toPandas()["prediction"][0]

  if resultdf == 1.0:
    to_return = {"result": "Poor"}
  else:
    to_return = {"result" : "Excellent"}

  # Let's track the prediction we're making
  cdsw.track_metric("prediction", to_return["result"])

  return to_return
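# Hedged usage sketch for the wine-quality function above: the "feature" value
# is one semicolon-separated string of the 11 numeric inputs, in schema order
# (the numbers below are illustrative only).
# predict({"feature": "7.4;0.7;0.0;1.9;0.076;11.0;34.0;0.9978;3.51;0.56;9.4"})
# -> {"result": "Poor"} or {"result": "Excellent"}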