def predict(data):
    # Sample input
    # {"RECENCY": 11,
    #  "HISTORY": 204,
    #  "USED_DISCOUNT": 0,
    #  "USED_BOGO": 1,
    #  "IS_REFERRAL": 1,
    #  "SCORE": 0.766}
    #
    # Sample output
    # {"prediction": 1.0, "conversion_probability": ...}
    df = pd.DataFrame(data, index=[0])
    df.columns = [
        'recency', 'history', 'used_discount', 'used_bogo', 'is_referral',
        'score'
    ]
    df = df.astype('float')
    prediction = float(clf.predict(df)[0])
    probability = float(clf.predict_proba(df)[0][1])

    # Track model metrics
    cdsw.track_metric('prediction', prediction)
    cdsw.track_metric('conversion_probability', probability)

    return {'prediction': prediction, 'conversion_probability': probability}
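# A quick session test is to pass the sample payload from the comment above
# straight to the function. This is a sketch: it assumes `pd`, `clf`, and
# `cdsw` are already in scope from earlier in the script.
sample = {"RECENCY": 11, "HISTORY": 204, "USED_DISCOUNT": 0,
          "USED_BOGO": 1, "IS_REFERRAL": 1, "SCORE": 0.766}
print(predict(sample))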
def predict_cancelled(args): inputs = args["feature"].split(",") inputs[3] = int(inputs[3]) inputs[4] = int(inputs[4]) inputs[5] = int(inputs[5]) inputs[6] = int(inputs[6]) inputs[7] = int(inputs[7]) input_cols = [ "OP_CARRIER", "ORIGIN", "DEST", "CRS_DEP_TIME", "CRS_ELAPSED_TIME", "DISTANCE", "WEEK", "HOUR", ] input_df = pd.DataFrame([inputs], columns=input_cols) input_transformed = ct.transform(input_df) prediction = pipe.predict(input_transformed) cdsw.track_metric("input_data", args) cdsw.track_metric("prediction", int(prediction[0])) return {"prediction": int(prediction[0])}
def predict(args): cdsw.track_metric("input", args) petal_length = float(args.get('petal_length')) result = model.predict([[petal_length]]) cdsw.track_metric("predict_result", result[0][0]) modified_result = result + 1 cdsw.track_aggregate_metrics({"modified_result": agg_result}, start_timestamp_ms, end_timestamp_ms, model_deployment_crn=Deployment_CRN) return result[0][0]
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)

    # Do the prediction and provide weights for the reasoning
    probability, explanation = em.explain_dct(data)

    # NEW! Track our inputs
    for key in data:
        if isinstance(data[key], numpy.int64) or isinstance(
                data[key], numpy.float64):
            cdsw.track_metric(key, data[key].item())
        else:
            cdsw.track_metric(key, data[key])

    # NEW! Track our prediction
    cdsw.track_metric('probability', probability)

    # NEW! Track explanation
    cdsw.track_metric('explanation', explanation)

    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
    }
def predict(args):
    # Track the input.
    cdsw.track_metric("input", args)

    # If this model involved features, i.e. transformations of the
    # raw input, they could be tracked as well.
    # cdsw.track_metric("feature_vars", {"a": 1, "b": 23})

    petal_length = float(args.get('petal_length'))
    result = model.predict([[petal_length]])

    # Track the output.
    cdsw.track_metric("predict_result", result[0][0])
    return result[0][0]
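# A one-line sanity check in a session (a sketch; the petal length value is
# arbitrary, and `model` and `cdsw` are assumed to be loaded already):
print(predict({"petal_length": "4.5"}))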
def predict(args):
    data = pd.DataFrame(args, index=[0])
    data = data.astype(float)
    prediction = clf.predict(data)
    probability = clf.predict_proba(data)

    # Track inputs
    # cdsw.track_metric('input_data', data)

    # Track the prediction probability (kept as a float; an int cast would
    # truncate any value below 1.0 to 0)
    cdsw.track_metric('probability', float(probability[0][0]))

    # Track the predicted class (stored under the 'explanation' key)
    cdsw.track_metric('explanation', int(prediction[0]))

    return {'prediction': prediction, 'probability': probability}
def predict_cancelled(args):
    inputs = args['feature'].split(",")
    inputs[1] = int(inputs[1])
    inputs[4] = int(inputs[4])
    inputs[5] = int(inputs[5])
    inputs[6] = int(inputs[6])
    inputs[7] = int(inputs[7])
    inputs[8] = int(inputs[8])
    input_cols = [
        'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',
        'CRS_ELAPSED_TIME', 'DISTANCE', 'WEEK', 'HOUR'
    ]
    input_df = pd.DataFrame([inputs], columns=input_cols)
    input_transformed = ct.transform(input_df)
    prediction = pipe.predict(input_transformed)
    cdsw.track_metric('input_data', args)
    cdsw.track_metric('prediction', int(prediction[0]))
    return {"prediction": int(prediction[0])}
def predict_cancelled(args): inputs = args["feature"].split(",") inputs[3] = int(inputs[3]) inputs[4] = int(inputs[4]) input_cols = [ "OP_CARRIER", "ORIGIN", "DEST", "WEEK", "HOUR", ] input_df = pd.DataFrame([inputs], columns=input_cols) input_transformed = ct.transform(input_df) probas = pipe.predict_proba(input_transformed) prediction = np.argmax(probas) proba = round(probas[0][prediction], 2) cdsw.track_metric("input_data", args) cdsw.track_metric("prediction", int(prediction)) cdsw.track_metric("proba", str(proba)) response = {"prediction": int(prediction), "proba": str(proba)} return response
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)

    # NEW! Track our inputs
    # for key in data:
    #     if isinstance(data[key], numpy.int64) or isinstance(data[key], numpy.float64):
    #         cdsw.track_metric(key, data[key].item())
    #     else:
    #         cdsw.track_metric(key, data[key])
    cdsw.track_metric('input_data', data)

    # NEW! Track our prediction
    cdsw.track_metric('probability', probability)

    # NEW! Track explanation
    cdsw.track_metric('explanation', explanation)

    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
    }
def predict_cancelled(args):
    inputs = args['feature'].split(",")
    inputs[1] = int(inputs[1])
    inputs[4] = int(inputs[4])
    inputs[5] = int(inputs[5])
    inputs[6] = int(inputs[6])
    inputs[7] = int(inputs[7])
    inputs[8] = int(inputs[8])
    input_cols = [
        'uniquecarrier', 'flightnum', 'origin', 'dest', 'crsdeptime',
        'crselapsedtime', 'distance', 'week', 'hour'
    ]
    input_df = pd.DataFrame([inputs], columns=input_cols)
    input_transformed = ct.transform(input_df)
    prediction = pipe.predict(input_transformed)
    cdsw.track_metric('input_data', args)
    cdsw.track_metric('prediction', int(prediction[0]))
    return {"prediction": int(prediction[0])}
def predict(data):
    df = pd.DataFrame(data, index=[0])
    df.columns = [
        'acc_now_delinq', 'acc_open_past_24mths', 'annual_inc', 'avg_cur_bal',
        'funded_amnt'
    ]
    df = df.astype('float')
    tracked_data = df.astype('str').to_dict('records')[0]
    prediction = str(clf.predict(df)[0])

    # Track prediction
    cdsw.track_metric("prediction", str(prediction))
    cdsw.track_metric("data", df.to_json())

    return {'input_data': str(tracked_data), 'prediction': str(prediction)}


# To test this in a session, comment out the `@cdsw.model_metrics` line,
# then uncomment and run the two rows below. Note that this sample payload
# belongs to the telco churn `explain` model rather than this function's
# loan columns; a matching example follows after the wrap-up.
# x={"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}
# explain(x)

## Wrap up
#
# We've now covered all the steps to **deploying and serving Models**, including the
# requirements, limitations, and how to set up, test, and use them.
# This is a powerful way to get data scientists' work into use by other people quickly.
#
# In the next part of the project we will explore how to launch a **web application**
# served through CML.
# Your team is busy building models to solve problems.
# CML-hosted Applications are a simple way to get these solutions in front of
# stakeholders quickly.
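# A session test that actually matches this function's five loan columns
# might look like the sketch below. Every value is invented purely to
# exercise the code path; `clf`, `pd`, and `cdsw` are assumed to be loaded.
x = {"acc_now_delinq": 0, "acc_open_past_24mths": 4, "annual_inc": 85000,
     "avg_cur_bal": 12000, "funded_amnt": 15000}
predict(x)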
def predict(args):
    data = pd.DataFrame(args, index=[0])
    prediction = pipeline.predict(data)
    probability = clf.predict_proba(data)

    # Track individual inputs -- already available
    # cdsw.track_metric('input_data', data)

    # Track probability -- already available (kept as a float; an int cast
    # would truncate any value below 1.0 to 0)
    cdsw.track_metric('probability', float(probability[0][0]))

    # Track prediction -- already available
    cdsw.track_metric('prediction', int(prediction[0]))

    return {'prediction': prediction, 'probability': probability}
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)

    # Track inputs
    cdsw.track_metric("input_data", data)

    # Track our prediction
    cdsw.track_metric("probability", probability)

    # Track explanation
    cdsw.track_metric("explanation", explanation)

    return {
        "data": dict(data),
        "probability": probability,
        "explanation": explanation
    }
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)

    # Track inputs
    cdsw.track_metric('input_data', data)

    # Track our prediction
    cdsw.track_metric('probability', probability)

    # Track explanation
    cdsw.track_metric('explanation', explanation)

    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
    }
def explain(args):
    data = dict(ChainMap(args, em.default_data))
    data = em.cast_dct(data)
    probability, explanation = em.explain_dct(data)

    # Track inputs
    cdsw.track_metric('input_data', data)

    # Track our prediction
    cdsw.track_metric('probability', probability)

    # Track explanation
    cdsw.track_metric('explanation', explanation)

    return {
        'data': dict(data),
        'probability': probability,
        'explanation': explanation
    }


# To test this in a session, comment out the `@cdsw.model_metrics` line,
# then uncomment and run the two rows below.
# x={"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}
# explain(x)

## Wrap up
#
# We've now covered all the steps to **deploying and serving Models**, including the
# requirements, limitations, and how to set up, test, and use them.
# This is a powerful way to get data scientists' work into use by other people quickly.
#
# In the next part of the project we will explore how to launch a **web application**
# served through CML.
# Your team is busy building models to solve problems.
# CML-hosted Applications are a simple way to get these solutions in front of
# stakeholders quickly.
# Retrieving parameters from the best GBT model
param_BestModel_NumIter = bestBGTModel._java_obj.getMaxIter()
param_BestModel_Depth = bestBGTModel._java_obj.getMaxDepth()

# Feature importance
impFeatures = gbtmodel.stages[-1].featureImportances
zipFeaturesToImportanceValue = zip(impFeatures, input_cols)
featureRankings = set(zipFeaturesToImportanceValue)
sortedFeatureRankings = sorted(featureRankings, reverse=True)
print("Gradient Boosted Tree - Feature Rankings Sorted By Importance Value %s" % (sortedFeatureRankings))
print("When summed together, the values equal 1.0")

# Return parameters to the CDSW user interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numIter", param_BestModel_NumIter)
cdsw.track_metric("maxDepth", param_BestModel_Depth)
cdsw.track_metric("cvFolds", user_gbt_param_numFolds)

from pyspark.mllib.evaluation import BinaryClassificationMetrics

gbt_labelPredictionSet = gbt_predictions.select('prediction', 'label').rdd.map(
    lambda lp: (lp.prediction, lp.label))
gbtmetrics = BinaryClassificationMetrics(gbt_labelPredictionSet)

# Save the GBT model to disk
gbtmodel.write().overwrite().save("models/spark/gbt")
# Retrieving parameters from the best RF model
param_BestModel_NumTrees = bestRFModel._java_obj.getNumTrees()
param_BestModel_Depth = bestRFModel._java_obj.getMaxDepth()
param_BestModel_impurity = bestRFModel._java_obj.getImpurity()

# Feature importance
impFeatures = rfmodel.stages[-1].featureImportances
zipFeaturesToImportanceValue = zip(impFeatures, input_cols)
featureRankings = set(zipFeaturesToImportanceValue)
sortedFeatureRankings = sorted(featureRankings, reverse=True)
print("Random Forest - Feature Rankings Sorted By Importance Value %s" % (sortedFeatureRankings))
print("When summed together, the values equal 1.0")

# Return parameters to the CDSW user interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numTrees", param_BestModel_NumTrees)
cdsw.track_metric("maxDepth", param_BestModel_Depth)
cdsw.track_metric("impurity", param_BestModel_impurity)
cdsw.track_metric("cvFolds", user_rf_param_numFolds)

from pyspark.mllib.evaluation import BinaryClassificationMetrics

labelPredictionSet = rf_predictions.select('prediction', 'label').rdd.map(
    lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)

# Save the RF model to disk
print("test", test_score) print(classification_report(y_test, pipe.predict(X_test))) data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1] # Create LIME Explainer feature_names = list(ce.columns_) categorical_features = list(ce.cat_columns_ix_.values()) categorical_names = {i: ce.classes_[c] for c, i in ce.cat_columns_ix_.items()} class_names = ['No ' + labels.name, labels.name] explainer = LimeTabularExplainer(ce.transform(data), feature_names=feature_names, class_names=class_names, categorical_features=categorical_features, categorical_names=categorical_names) # Create and save the combined Logistic Regression and LIME Explained Model. explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear', categoricalencoder=ce, pipeline=pipe, explainer=explainer,data_dir=data_dir) explainedmodel.save() # If running as as experiment, this will track the metrics and add the model trained in this training run to the experiment history. cdsw.track_metric("train_score",round(train_score,2)) cdsw.track_metric("test_score",round(test_score,2)) cdsw.track_metric("model_path",explainedmodel.model_path) cdsw.track_file(explainedmodel.model_path)
import cdsw
from pyspark.sql import SparkSession
from pyspark.sql.types import *

"""# Uncomment for experiments
#
# Get parameters for experiments
# Declare parameters
param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = sys.argv[3]

# Track parameters in experiments
cdsw.track_metric("numTrees", param_numTrees)
cdsw.track_metric("maxDepth", param_maxDepth)
cdsw.track_metric("impurity", param_impurity)
"""

# Comment out when using experiments
param_numTrees = 10
param_maxDepth = 15
param_impurity = "gini"

spark = SparkSession\
    .builder\
    .appName('wine-quality-analysis')\
    .config("spark.executor.memory", "2g")\
    .config("spark.executor.cores", "2")\
    .config("spark.executor.instances", "3")\
    .config("spark.hadoop.fs.s3a.metadatastore.impl", "org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore")\
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")\
    .config("spark.hadoop.fs.s3a.delegation.token.binding", "")\
results['best_score'] = gs.best_score_
results['best_params'] = [gs.best_params_]
results['n_splits'] = gs.n_splits_
results['scorer'] = gs.scorer_
results['cv_results'] = [gs.cv_results_]
results['grid'] = [grid]

results_df = pd.DataFrame(results, index=[0])
results_df['timestamp'] = results_df['timestamp'].astype(str)
results_df['clf'] = results_df['clf'].astype(str)
# Note: the INTEGER table column below forces this cast, which truncates
# fractional accuracy scores.
results_df['best_score'] = results_df['best_score'].astype(int)
results_df['best_params'] = results_df['best_params'].astype(str)
results_df['n_splits'] = results_df['n_splits'].astype(int)
results_df['scorer'] = results_df['scorer'].astype(str)
results_df['cv_results'] = results_df['cv_results'].astype(str)
results_df['grid'] = results_df['grid'].astype(str)

print("Best Accuracy Score")
print(results)

# Track the scalar best score rather than a pandas Series.
cdsw.track_metric("Best Accuracy Score", gs.best_score_)

spark.sql("CREATE TABLE IF NOT EXISTS default.experiment_outcomes (TIMESTAMP STRING, CLASSIFIER STRING, \
    BEST_SCORE INTEGER, BEST_PARAMS STRING, N_SPLITS INT, SCORER STRING, CV_RESULTS STRING, GRID STRING)")

experiments_df = spark.createDataFrame(results_df)
experiments_df.write.insertInto("default.experiment_outcomes", overwrite=False)
filtered, vocab = cutoff(sorted_vocab, tokenized, freqs)
sentiments, messages = balance_classes(sentiments, filtered)
token_ids = convert_to_token_ids(messages, vocab)

print(type(token_ids))
print(len(token_ids) * 0.5)
split_idx = int(len(token_ids) * 0.5)

split_frac = 0.98  # for small data
# split_frac = 0.8  # for big data
train_features, train_labels, tf, tl, vf, vl = split_data(
    token_ids, sentiments, vocab, split_frac=split_frac)

model = create_model(train_features, train_labels, vocab)
acc, loss = train_model(model, train_features, train_labels, print_every=1)  # for small data
# acc, loss = train_model(model, train_features, train_labels)  # for big data

import cdsw

model_filename = "model.torch"
vocab_filename = "vocab.pickle"
cdsw.track_file(model_filename)
cdsw.track_file(vocab_filename)
cdsw.track_metric("Accuracy", acc)
cdsw.track_metric("Loss", loss)
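# The two track_file() calls above expect model.torch and vocab.pickle to
# exist on disk. A sketch of the save step, assuming `model` is a
# torch.nn.Module and `vocab` is a plain picklable object:
import pickle
import torch

torch.save(model.state_dict(), model_filename)  # weights only
with open(vocab_filename, "wb") as f:
    pickle.dump(vocab, f)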
                    as_pandas=True,
                    seed=123)

# Plot the importance of each feature
xgb.plot_importance(xg_reg)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

# Report the final cross-validated error
final_rmse = cv_results["test-rmse-mean"].iat[-1]
print("Root Mean Squared Error: ", final_rmse)

# Show predictions for next week using forecasted car production
data_newforecast = xgb.DMatrix(data=df_forecast)
new_preds = xg_reg.predict(data_newforecast)
print("Predicted weekly production for Part No ", part_no, ": ", new_preds[0])

# Save model as pickle file
picklefile = part_no + '.pickle'
pickle.dump(xg_reg, open(picklefile, 'wb'))

cdsw.track_metric('RMSE', final_rmse)
cdsw.track_metric('Estimated Part Production', new_preds[0])

"""
def predictParts():
    loaded_model = pickle.load(open(picklefile, 'rb'))
    result = loaded_model.predict(data_newforecast)
    print(result)
    return result
"""
ax.set_facecolor('white')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Classifier Accuracy')
fig.set_facecolor('white')
plt.plot('epoch',
         'acc',
         label="Accuracy",
         data=df_model,
         markersize=12,
         color='skyblue',
         linewidth=1)
plt.plot('epoch',
         'val_acc',
         label="Validation Accuracy",
         data=df_model,
         markersize=12,
         color='blue',
         linewidth=1)
legend = plt.legend(loc="lower right", facecolor='white', framealpha=1)
plt.show()


plot_classifier_acc()

cdsw.track_metric("Accuracy", 0.9)
cdsw.track_metric("AUC", 0.89)
X = cc_data[cc_data.Day < 4].iloc[:, 3:len(cc_data.columns) - 1]
y = cc_data[cc_data.Day < 4].iloc[:, len(cc_data.columns) - 1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = sys.argv[3]

randF = RandomForestClassifier(n_jobs=10,
                               n_estimators=param_numTrees,
                               max_depth=param_maxDepth,
                               criterion=param_impurity,
                               random_state=0)
randF.fit(X_train, y_train)

predictions_rand = randF.predict(X_test)
auroc = roc_auc_score(y_test, predictions_rand)
ap = average_precision_score(y_test, predictions_rand)
cdsw.track_metric("auroc", round(auroc, 2))
cdsw.track_metric("ap", round(ap, 2))

pickle.dump(randF, open("cc_model_check.pkl", "wb"))
cdsw.track_file("cc_model_check.pkl")
results = dict()
results['timestamp'] = run_time_suffix
results['clf'] = clf
results['best_estimator'] = gs.best_estimator_
results['best_score'] = gs.best_score_
results['best_params'] = [gs.best_params_]
results['n_splits'] = gs.n_splits_
results['scorer'] = gs.scorer_
results['cv_results'] = [gs.cv_results_]
results['grid'] = [grid]

# Persist the fitted best estimator (GridSearchCV fits clones, so `clf`
# itself is never fitted).
dump(gs.best_estimator_, "models/clf_" + run_time_suffix + ".joblib")

# Track the metric here; if it needs to be reused later, consider also
# writing it to a file.
cdsw.track_metric("Best Accuracy", results['best_score'])

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("PythonSQL")\
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")\
    .config("spark.yarn.access.hadoopFileSystems", "s3a://demo-aws-1/")\
    .config("spark.hadoop.yarn.resourcemanager.principal", os.getenv("HADOOP_USER_NAME"))\
    .getOrCreate()

spark.sql(
    "CREATE TABLE IF NOT EXISTS default.experiment_outcomes (BEST_SCORE FLOAT, N_SPLITS INT)"
)
"total_day_calls", "total_day_charge", "total_eve_calls", "total_eve_charge", "total_night_calls", "total_night_charge", "total_intl_calls", "total_intl_charge", "number_customer_service_calls" ] param_numTrees = int(sys.argv[1]) param_maxDepth = int(sys.argv[2]) param_impurity = 'gini' randF = RandomForestClassifier(n_jobs=10, n_estimators=param_numTrees, max_depth=param_maxDepth, criterion=param_impurity, random_state=0) cdsw.track_metric("numTrees", param_numTrees) cdsw.track_metric("maxDepth", param_maxDepth) cdsw.track_metric("impurity", param_impurity) randF.fit(pdTrain[features], pdTrain['label']) predictions = randF.predict(pdTest[features]) #temp = randF.predict_proba(pdTest[features]) pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']) list(zip(pdTrain[features], randF.feature_importances_))
test_features = data_features[486:]
test_labels = data_labels[486:]

# Train model
model = ElasticNetCV(normalize=True,
                     l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                     random_state=288793)
model.fit(train_features, train_labels)

# # Results

# ## Hyperparameters selected by CV
model.l1_ratio_
cdsw.track_metric("l1_ratio", model.l1_ratio_)

model.alpha_
cdsw.track_metric("alpha", model.alpha_)

# ## Model coefficients
model.intercept_
cdsw.track_metric("intercept", model.intercept_)

list(zip(feature_cols, model.coef_))
for i in range(0, len(feature_cols)):
    cdsw.track_metric(feature_cols[i], model.coef_[i])

# ## r squared scores
r_train = model.score(train_features, train_labels)
r_train
df = pd.get_dummies(df, columns=["channel", "offer"], drop_first=True)
df = df.drop(columns=['zip_code'])

y = df['conversion']
X = df.drop(columns=['conversion'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Using experiment instance set in CICD.py
data_dir = "data"

clf = LogisticRegression()
grid = {'C': [0.1, 1]}  # Simplified grid search. TODO: unroll more gs metrics and track them
gs = GridSearchCV(clf, grid, scoring='accuracy')
gs.fit(X_train, y_train)
results = gs.best_score_

# TODO: do this more rigorously, following best practices. Pull more metrics out.
# TODO: could pass more scoring criteria from the calling script, dynamically,
# even creating custom scoring functions.
print("Best Accuracy Score")
print(results)

run_time_suffix = datetime.datetime.now()
run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S")

# Persist the fitted best estimator (GridSearchCV fits clones, so `clf`
# itself is never fitted).
dump(gs.best_estimator_, "models/clf_" + run_time_suffix + ".joblib")
cdsw.track_metric("Best Accuracy", results)
results['best_score'] = gs.best_score_
results['best_params'] = [gs.best_params_]
results['n_splits'] = gs.n_splits_
results['scorer'] = gs.scorer_
results['cv_results'] = [gs.cv_results_]
results['grid'] = [grid]

results_df = pd.DataFrame(results, index=[0])
results_df['timestamp'] = results_df['timestamp'].astype(str)
results_df['clf'] = results_df['clf'].astype(str)
results_df['best_score'] = results_df['best_score'].astype(int)
results_df['best_params'] = results_df['best_params'].astype(str)
results_df['n_splits'] = results_df['n_splits'].astype(int)
results_df['scorer'] = results_df['scorer'].astype(str)
results_df['cv_results'] = results_df['cv_results'].astype(str)
results_df['grid'] = results_df['grid'].astype(str)

print("Best Accuracy Score")
print(results)
cdsw.track_metric("Best Accuracy Score", gs.best_score_)

spark.sql(
    "CREATE TABLE IF NOT EXISTS default.experiment_outcomes (TIMESTAMP STRING, CLASSIFIER STRING, \
    BEST_SCORE INTEGER, BEST_PARAMS STRING, N_SPLITS INT, SCORER STRING, CV_RESULTS STRING, GRID STRING)"
)
experiments_df = spark.createDataFrame(results_df)
experiments_df.write.insertInto("default.experiment_outcomes", overwrite=False)
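# To confirm the insert landed, the table can be read back in the same
# session (a sketch; assumes the `spark` session above is still active):
spark.sql("SELECT * FROM default.experiment_outcomes").show()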
def predict(args): split=args["feature"].split(";") features=[list(map(float,split[:11]))] features_df = spark.createDataFrame(features, schema)#.collect() features_list = features_df.collect() # Let's track the inputs to the model for x in features_list: cdsw.track_metric("fixedAcidity", x["fixedAcidity"]) cdsw.track_metric("volatileAcidity", x["volatileAcidity"]) cdsw.track_metric("citricAcid", x["citricAcid"]) cdsw.track_metric("residualSugar", x["residualSugar"]) cdsw.track_metric("chlorides", x["chlorides"]) cdsw.track_metric("freeSulfurDioxide", x["freeSulfurDioxide"]) cdsw.track_metric("totalSulfurDioxide", x["totalSulfurDioxide"]) cdsw.track_metric("density", x["density"]) cdsw.track_metric("pH", x["pH"]) cdsw.track_metric("sulphates", x["sulphates"]) cdsw.track_metric("Alcohol", x["Alcohol"]) resultdf=model.transform(features_df).toPandas()["prediction"][0] if resultdf == 1.0: to_return = {"result": "Poor"} else: to_return = {"result" : "Excellent"} # Let's track the prediction we're making cdsw.track_metric("prediction", to_return["result"]) return to_return