X = cc_data[cc_data.Day < 4].iloc[:, 3:len(cc_data.columns) - 1]
y = cc_data[cc_data.Day < 4].iloc[:, len(cc_data.columns) - 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = sys.argv[3]

randF = RandomForestClassifier(n_jobs=10,
                               n_estimators=param_numTrees,
                               max_depth=param_maxDepth,
                               criterion=param_impurity,
                               random_state=0)
randF.fit(X_train, y_train)
predictions_rand = randF.predict(X_test)

auroc = roc_auc_score(y_test, predictions_rand)
ap = average_precision_score(y_test, predictions_rand)
cdsw.track_metric("auroc", round(auroc, 2))
cdsw.track_metric("ap", round(ap, 2))

pickle.dump(randF, open("cc_model_check.pkl", "wb"))
cdsw.track_file("cc_model_check.pkl")
# Pull a sample of the dataset into an in-memory
# Pandas dataframe. Use a smaller dataset for a quick demo.
flight_df_local = spark.sql("SELECT * FROM `default`.`flights`").limit(set_size).toPandas()

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Put the data into the array format required by scikit-learn's logistic regression.
# Use one-hot encoding for the categorical variables.
X_onehot = flight_df_local[["UniqueCarrier", "Origin", "Dest"]]
X_num = flight_df_local[["Distance", "CRSDepTime"]]
y = (flight_df_local["DepDelay"] > 0)

enc = OneHotEncoder(sparse=False)
X_transform = enc.fit_transform(X_onehot)

reg = LogisticRegression().fit(np.hstack([X_transform, X_num]), y)
accuracy = reg.score(np.hstack([X_transform, X_num]), y)
cdsw.track_metric("Accuracy", accuracy)

import joblib
with open(model_path + model_name, 'wb') as fid:
    joblib.dump(reg, fid)
cdsw.track_file(model_path + model_name)

with open(model_path + enc_name, 'wb') as fid:
    joblib.dump(enc, fid)
cdsw.track_file(model_path + enc_name)
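# A minimal scoring sketch (not part of the original script), assuming the
# model_path, model_name, and enc_name values used above and a hypothetical
# Pandas dataframe `new_flights_pd` with the same columns as the training data.
import joblib
import numpy as np

reg_loaded = joblib.load(model_path + model_name)
enc_loaded = joblib.load(model_path + enc_name)

new_onehot = enc_loaded.transform(new_flights_pd[["UniqueCarrier", "Origin", "Dest"]])
new_num = new_flights_pd[["Distance", "CRSDepTime"]]
delay_prob = reg_loaded.predict_proba(np.hstack([new_onehot, new_num]))[:, 1]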
def train_model(
    path,
    x_columns,
    y_column,
    output_path="models",
    test_size=0.3,
    random_state=None,
    cross_validation_folds=5,
    verbose=2,
):
    logging.info("Begin `train_model`")
    logging.debug(f"Load Data from {path}")
    logging.debug(f"x_columns: {x_columns}")
    logging.debug(f"y_column: {y_column}")
    df = load_data(path, x_columns + [y_column])

    logging.info("Preprocessing Inputs")
    df, new_X_columns = preprocess_model_inputs(df, x_columns, y_column)
    logging.debug(f"new_X_columns: {new_X_columns}")
    X = df[new_X_columns]
    y = df[y_column]

    logging.info("Splitting data into test and train sets")
    logging.info(f"test_size: {test_size}")
    logging.info(f"random_state: {random_state}")
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    grid_params = get_grid_search_params()

    logging.info("Getting Grid Search CV Pipeline")
    logging.debug(f"cross_validation_folds: {cross_validation_folds}")
    pipeline = model_grid_search_cv(
        new_X_columns,
        get_base_estimator(random_state),
        grid_params,
        verbose=verbose,
        cross_validation_folds=cross_validation_folds)

    try:
        pipeline.fit(X_train, y_train)
    except Exception as e:
        logging.exception("Exception during pipeline fitting")
        raise e

    logging.info("Finished Grid Search CV")
    logging.info(f"Best Score: {pipeline['grid_search_cv'].best_score_}")
    logging.info(f"Best Params: {pipeline['grid_search_cv'].best_params_}")
    write_out_model_params(pipeline["grid_search_cv"].best_params_, output_path)

    eval_predictions("train", pipeline, X_train, y_train, output_path)
    eval_predictions("test", pipeline, X_test, y_test, output_path)

    logging.info(f"Writing out model to {output_path}/trained.model")
    joblib.dump(pipeline, f"{output_path}/trained.model")
    cdsw.track_file(f"{output_path}/trained.model")
    logging.info("End `train_model`")
"Gradient Boost Tree - Feature Rankings Sorted By Importance Value %s" % (sortedFeaturRankings) "When summed together, the values equal 1.0" #Return Paramaters to CDSW User Interface cdsw.track_metric("auroc", auroc) cdsw.track_metric("aupr", aupr) cdsw.track_metric("F1", f1score) cdsw.track_metric("WeightedPrecision", weightedPrecision) cdsw.track_metric("weightedRecall", weightedRecall) cdsw.track_metric("numIter",param_BestModel_NumIter) cdsw.track_metric("maxDepth",param_BestModel_Depth) cdsw.track_metric("cvFolds",user_gbt_param_numFolds) from pyspark.mllib.evaluation import BinaryClassificationMetrics gbt_labelPredictionSet = gbt_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label)) gbtmetrics = BinaryClassificationMetrics(gbt_labelPredictionSet) #Save GBT Model to Disk gbtmodel.write().overwrite().save("models/spark/gbt") !rm -r -f models/spark/gbt !rm -r -f models/spark_gbt.tar !hdfs dfs -get models/spark/gbt !hdfs dfs -get models/ !tar -cvf models/spark_gbt.tar models/spark/gbt cdsw.track_file("models/spark_gbt.tar") spark.stop()
model.l1_ratio_
cdsw.track_metric("l1_ratio", model.l1_ratio_)
model.alpha_
cdsw.track_metric("alpha", model.alpha_)

# ## Model coefficients
model.intercept_
cdsw.track_metric("intercept", model.intercept_)
zip(feature_cols, model.coef_)
for i in range(0, len(feature_cols)):
    cdsw.track_metric(feature_cols[i], model.coef_[i])

# ## r squared scores
r_train = model.score(train_features, train_labels)
r_train
cdsw.track_metric("r_train", r_train)
r_test = model.score(test_features, test_labels)
r_test
cdsw.track_metric("r_test", r_test)

# ## Persist model during experiment
filename = 'bikeshare_model.pkl'
joblib.dump(model, filename)
cdsw.track_file(filename)
#timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
#joblib.dump(model, 'bikeshare_model_' + timestamp + '.pkl')
filtered, vocab = cutoff(sorted_vocab, tokenized, freqs)
sentiments, messages = balance_classes(sentiments, filtered)
token_ids = convert_to_token_ids(messages, vocab)
print(type(token_ids))
print(len(token_ids) * 0.5)
split_idx = int(len(token_ids) * 0.5)

split_frac = 0.98  # for small data
#split_frac = 0.8  # for big data
train_features, train_labels, tf, tl, vf, vl = split_data(
    token_ids, sentiments, vocab, split_frac=split_frac)

model = create_model(train_features, train_labels, vocab)
acc, loss = train_model(model, train_features, train_labels, print_every=1)  # for small data
#acc, loss = train_model(model, train_features, train_labels)  # for big data

import cdsw
model_filename = "model.torch"
vocab_filename = "vocab.pickle"
cdsw.track_file(model_filename)
cdsw.track_file(vocab_filename)
cdsw.track_metric("Accuracy", acc)
cdsw.track_metric("Loss", loss)
print("test", test_score) print(classification_report(y_test, pipe.predict(X_test))) data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1] # Create LIME Explainer feature_names = list(ce.columns_) categorical_features = list(ce.cat_columns_ix_.values()) categorical_names = {i: ce.classes_[c] for c, i in ce.cat_columns_ix_.items()} class_names = ['No ' + labels.name, labels.name] explainer = LimeTabularExplainer(ce.transform(data), feature_names=feature_names, class_names=class_names, categorical_features=categorical_features, categorical_names=categorical_names) # Create and save the combined Logistic Regression and LIME Explained Model. explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear', categoricalencoder=ce, pipeline=pipe, explainer=explainer,data_dir=data_dir) explainedmodel.save() # If running as as experiment, this will track the metrics and add the model trained in this training run to the experiment history. cdsw.track_metric("train_score",round(train_score,2)) cdsw.track_metric("test_score",round(test_score,2)) cdsw.track_metric("model_path",explainedmodel.model_path) cdsw.track_file(explainedmodel.model_path)
# Return parameters to the CDSW user interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
#cdsw.track_metric("Layers", param_BestModel_Layers)
#cdsw.track_metric("maxIter", param_BestModel_Iter)
cdsw.track_metric("cvFolds", user_mlp_param_numFolds)

from pyspark.mllib.evaluation import BinaryClassificationMetrics
labelPredictionSet = mlp_predictions.select('prediction', 'label').rdd.map(lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)
#metrics.areaUnderPR

# Save MLP model to disk
mlpmodel.write().overwrite().save("models/spark/mlp")
!rm -r -f models/spark/mlp
!rm -r -f models/spark_mlp.tar
!hdfs dfs -get models/spark/mlp
!hdfs dfs -get models/
!tar -cvf models/spark_mlp.tar models/spark/mlp
cdsw.track_file("models/spark_mlp.tar")
spark.stop()
# Retrieve parameters from the best SVM model
param_BestModel_Iter = bestSVMModel._java_obj.getMaxIter()

# Return parameters to the CDSW user interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("maxIter", param_BestModel_Iter)
cdsw.track_metric("cvFolds", user_svm_param_numFolds)

from pyspark.mllib.evaluation import BinaryClassificationMetrics
labelPredictionSet = svm_predictions.select('prediction', 'label').rdd.map(lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)

# Save SVM model to disk
svmModel.write().overwrite().save("models/spark/svm")
!rm -r -f models/spark/svm
!rm -r -f models/spark_svm.tar
!hdfs dfs -get models/spark/svm
!hdfs dfs -get models/
!tar -cvf models/spark_svm.tar models/spark/svm
cdsw.track_file("models/spark_svm.tar")
spark.stop()
#results['best_parameters'] = gs.best_params_
results = dict()
results['best_score'] = gs.best_score_
results['n_splits'] = gs.n_splits_
#results['scorer'] = gs.scorer_
#results['best_parameters'] = gs.best_params_
results_df = pd.DataFrame(results, index=[0])

# TODO: do this more rigorously, following best practices, and pull more metrics out.
# TODO: could pass more scoring criteria from the calling script dynamically,
#       even creating custom scoring functions.
print("Best Accuracy Score")
print(results)

run_time_suffix = datetime.datetime.now()
run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S")
dump(clf, "models/clf_" + run_time_suffix + ".joblib")

# Track the metric; if the metric needs to be reused, it is better to write it to a file.
cdsw.track_metric("Best Accuracy", results['best_score'])

# Track the metric in a database
import sqlite3
conn = sqlite3.connect('newdb.db')
results_df.to_sql(name='experiments', if_exists='append', con=conn)
#pd.read_sql('select * from experiment_results', conn)
cdsw.track_file('newdb.db')
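# A minimal sketch (not part of the original script) of reading the tracked
# SQLite file back in a later session; it assumes the 'experiments' table
# written above by results_df.to_sql().
import sqlite3
import pandas as pd

conn = sqlite3.connect('newdb.db')
history_df = pd.read_sql('SELECT * FROM experiments', conn)
print(history_df)
conn.close()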
    numTrees=param_numTrees,
    maxDepth=param_maxDepth,
    maxBins=500,
    predictionCol="prediction")
labelConv = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelInd.labels)

pipeline = Pipeline(stages=[ucInd, oInd, dInd, wInd, labelInd, encoder, assembler, classifier, labelConv])

(train, test) = df.randomSplit([0.7, 0.3])
model = pipeline.fit(train)
predictions = model.transform(test)
predictions.head()

# ## Model Evaluation
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="delayCatInd", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[-2]  # the fitted random forest stage (the last stage is the label converter)
print(rfModel)  # summary only

cdsw.track_metric("accuracy", accuracy)

model.write().overwrite().save("flightdelay/rfcModel")
!rm -r -f /home/cdsw/models/spark/rfcModel
!rm -r -f /home/cdsw/models/spark_rfc.tar
!hdfs dfs -get flightdelay/rfcModel /home/cdsw/models/spark/rfcModel
!tar -cvf /home/cdsw/models/spark_rfc.tar /home/cdsw/models/spark
cdsw.track_file("/home/cdsw/models/spark_rfc.tar")
spark.stop()
ID.build_vocab(train_data)
# TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.twitter.27B.25d",
                 unk_init=torch.Tensor.normal_)
SENTIMENT.build_vocab(train_data)
AIRLINE.build_vocab(train_data)
print(TEXT.vocab.freqs.most_common(20))

# Save the vocabulary index - needed for model prediction
outfile = open(model_dir + 'vocab_index.pkl', 'wb')
pickle.dump(TEXT.vocab.stoi, outfile, -1)
outfile.close()
cdsw.track_file(model_dir + 'vocab_index.pkl')

# Check labels: 0 is negative, 1 is positive
print(SENTIMENT.vocab.stoi)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: x.text,  # sort by text
    batch_size=BATCH_SIZE,
    device=device)
# Read and prepare data:
flights_pd = pd.read_csv('data/flights.csv')
flights_clean_pd = flights_pd \
    .filter(['dep_delay', 'arr_delay']) \
    .dropna() \
    .loc[flights_pd.dep_delay < 400, :]
features = flights_clean_pd.filter(['dep_delay'])
targets = flights_clean_pd.filter(['arr_delay'])
train_x, test_x, train_y, test_y = train_test_split(features, targets, test_size=0.2)

# Specify the model and train it using the training
# sample:
model = LinearRegression(fit_intercept=fit_intercept)
model.fit(train_x, train_y)

# Evaluate the model using the test sample. Track the
# value of R-squared (rounded to four digits after the
# decimal) to compare experiment results:
r2 = model.score(test_x, test_y)
cdsw.track_metric('R_squared', round(r2, 4))

# Save the model for future use:
dump(model, 'model.joblib')
cdsw.track_file('model.joblib')
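# A minimal reuse sketch (not part of the original script), assuming the
# 'model.joblib' file saved above; it loads the model and predicts arrival
# delay for a hypothetical 15-minute departure delay.
import pandas as pd
from joblib import load

loaded_model = load('model.joblib')
example = pd.DataFrame({'dep_delay': [15]})
print(loaded_model.predict(example))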
# This code will train a model based on arguments passed to the Experiments feature of CML.
# args for experiment: wine_hiveS3 gb s3a://ml-field/demo/wine/ us-west-2
import os
import sys

from churnexplainer import train
from churnexplainer.data import dataset, load_dataset
import cdsw

#os.environ['MODEL_TYPE'] = 'gb'
os.environ['MODEL_TYPE'] = sys.argv[1]
#os.environ['DATASET'] = 'wine_hiveS3'
os.environ['DATASET'] = sys.argv[2]
#os.environ['S3_BUCKET'] = 's3a://ml-field/demo/wine/'
os.environ['S3_BUCKET'] = sys.argv[3]
#os.environ['S3_BUCKET_REGION'] = 'us-west-2'
os.environ['S3_BUCKET_REGION'] = sys.argv[4]

train_score, test_score, model_path = train.experiment_and_save()

cdsw.track_metric("train_score", round(train_score, 2))
cdsw.track_metric("test_score", round(test_score, 2))
cdsw.track_metric("model_path", model_path)
cdsw.track_file(model_path)
import sys
import cdsw

args = len(sys.argv) - 1
sum = 0
x = 1
while (args >= x):
    print("Argument %i:%s" % (x, sys.argv[x]))
    sum = sum + int(sys.argv[x])
    x = x + 1
print("Sum of the numbers is: %i." % sum)

# Track metric
cdsw.track_metric("sum", sum)

# Track file
file = open('add_result.txt', 'w')
file.write("Sum of the numbers is: %i." % sum)
file.close()
cdsw.track_file('add_result.txt')
cdsw.track_metric("numTrees", param_numTrees) cdsw.track_metric("maxDepth", param_maxDepth) cdsw.track_metric("impurity", param_impurity) randF.fit(pdTrain[features], pdTrain['label']) predictions = randF.predict(pdTest[features]) #temp = randF.predict_proba(pdTest[features]) pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']) list(zip(pdTrain[features], randF.feature_importances_)) y_true = pdTest['label'] y_scores = predictions auroc = roc_auc_score(y_true, y_scores) ap = average_precision_score(y_true, y_scores) print(auroc, ap) cdsw.track_metric("auroc", auroc) cdsw.track_metric("ap", ap) pickle.dump(randF, open("models/sklearn_rf.pkl", "wb")) cdsw.track_file("models/sklearn_rf.pkl")
auroc = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "weightedRecall"})
"The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s" % (f1score, weightedPrecision, weightedRecall)

# Return parameters to the CDSW user interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("cvFolds", user_mlp_param_numFolds)

# Save MLP model to disk
mlpmodel.write().overwrite().save("models/spark/stackedmlp")
!rm -r -f models/spark/stackedmlp
!rm -r -f models/spark_stackedmlp.tar
!hdfs dfs -get models/spark/stackedmlp
!hdfs dfs -get models/
!tar -cvf models/spark_stackedmlp.tar models/spark/stackedmlp
cdsw.track_file("models/spark_stackedmlp.tar")
spark.stop()
# # Step 4: Evaluate Model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Predict on test data
predictions = model.transform(testData)

# Evaluate
evaluator = BinaryClassificationEvaluator()
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("The AUROC is {:f} and the AUPR is {:f}".format(auroc, aupr))

# Track metric values in CDSW
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)

# # Save model for deployment
#model.write().overwrite().save("models/spark")

# Bring the model back into the project and tar it
#!rm -rf models/
#!mkdir models
#!hdfs dfs -get ./models/spark models/
#!tar -cvf models/spark_rf.tar models/spark
#!rm -r -f models/spark
#!mv models/spark_rf.tar spark_rf.tar

cdsw.track_file("spark_rf.tar")
spark.stop()
sortedFeaturRankings = sorted(FeautureRankings, reverse=True)
"Random Forest - Feature Rankings Sorted By Importance Value %s" % (sortedFeaturRankings)
"When summed together, the values equal 1.0"

# Return parameters to the CDSW user interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numTrees", param_BestModel_NumTrees)

from pyspark.mllib.evaluation import BinaryClassificationMetrics
rf_labelPredictionSet = rf_predictions.select('prediction', 'label').rdd.map(lambda lp: (lp.prediction, lp.label))
rfmetrics = BinaryClassificationMetrics(rf_labelPredictionSet)

# Save the vanilla RF model to disk
rf_model.write().overwrite().save("models/spark/vanilla")
!rm -r -f models/spark
!rm -r -f models/spark_rf_vanilla.tar
!hdfs dfs -get models/spark
!hdfs dfs -get models/
!tar -cvf models/spark_rf_vanilla.tar models/spark/vanilla
cdsw.track_file("models/spark_rf_vanilla.tar")
spark.stop()
"When summed together, the values equal 1.0" #Return Paramaters to CDSW User Interface cdsw.track_metric("auroc", auroc) cdsw.track_metric("aupr", aupr) cdsw.track_metric("F1", f1score) cdsw.track_metric("WeightedPrecision", weightedPrecision) cdsw.track_metric("weightedRecall", weightedRecall) cdsw.track_metric("numTrees",param_BestModel_NumTrees) cdsw.track_metric("maxDepth",param_BestModel_Depth) cdsw.track_metric("impurity",param_BestModel_impurity) cdsw.track_metric("cvFolds",user_rf_param_numFolds) from pyspark.mllib.evaluation import BinaryClassificationMetrics labelPredictionSet = rf_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label)) metrics = BinaryClassificationMetrics(labelPredictionSet) #Save RF Model to Disk rfmodel.write().overwrite().save("models/spark/rf") !rm -r -f models/spark/rf !rm -r -f models/spark_rf.tar !hdfs dfs -get models/spark/rf !hdfs dfs -get models/ !tar -cvf models/spark_rf.tar models/spark/rf cdsw.track_file("models/spark_rf.tar") spark.stop()
cdsw.track_metric("numTrees",param_numTrees) cdsw.track_metric("maxDepth",param_maxDepth) cdsw.track_metric("impurity",param_impurity) # Fit and Predict randF.fit(pdTrain[features], pdTrain['label']) predictions=randF.predict(pdTest[features]) #temp = randF.predict_proba(pdTest[features]) pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']) list(zip(pdTrain[features], randF.feature_importances_)) y_true = pdTest['label'] y_scores = predictions auroc = roc_auc_score(y_true, y_scores) ap = average_precision_score (y_true, y_scores) print(auroc, ap) cdsw.track_metric("auroc", auroc) cdsw.track_metric("ap", ap) pickle.dump(randF, open("iot_model.pkl","wb")) cdsw.track_file("iot_model.pkl") time.sleep(15) print("Slept for 15 seconds.")
cdsw.track_metric("maxDepth", param_maxDepth) cdsw.track_metric("impurity", param_impurity) randF.fit(pdTrain[features], pdTrain['label']) predictions = randF.predict(pdTest[features]) #temp = randF.predict_proba(pdTest[features]) pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']) list(zip(pdTrain[features], randF.feature_importances_)) y_true = pdTest['label'] y_scores = predictions auroc = roc_auc_score(y_true, y_scores) ap = average_precision_score(y_true, y_scores) print(auroc, ap) cdsw.track_metric("auroc", auroc) cdsw.track_metric("ap", ap) pickle.dump(randF, open("sklearn_rf.pkl", "wb")) cdsw.track_file("sklearn_rf.pkl") time.sleep(20) print("Slept for 20 seconds.")
(start, end) = (start, end) if start < end else (end, start)
split_point = find_split_point(start, end, precision_rate(start), precision_rate(end))
print('\nSplit point:', split_point)

# Update the deployed model's split point
import subprocess
subprocess.call([
    "sed", "-i",
    's/split_point=.*/split_point=' + str(round(split_point, 3)) + "/ ",
    "/home/cdsw/4_model_deploy.py"
])

# Model precision rates
precision1 = (loss1 < split_point).sum().item() / float(len(loss1))
precision2 = (loss2 > split_point).sum().item() / float(len(loss2))
print('Precision rate for normal cases:', precision1)
print('Precision rate for fraud cases:', precision2)
print('Overall precision:', (precision1 + precision2) / 2)

torch.save(model.state_dict(), 'model/creditcard-fraud.model')

# Track experiment metrics.
# If running as an experiment, this will track the metrics and add the model
# trained in this training run to the experiment history.
cdsw.track_metric("split_point", round(split_point, 2))
cdsw.track_metric("precision", round(((precision1 + precision2) / 2), 2))
cdsw.track_file('model/creditcard-fraud.model')
cdsw.track_file('model/cc_scaler.pkl')
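# A minimal restore sketch (not part of the original script). `AutoEncoder` is
# a hypothetical name for the network class trained above; substitute the real
# class so the architecture matches the saved state dict.
import torch

restored = AutoEncoder()  # hypothetical class name; must match the trained architecture
restored.load_state_dict(torch.load('model/creditcard-fraud.model'))
restored.eval()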
# __________________________________________________________________________________________________
# # 11. MODEL EVALUATION

# Predict with the model
pred_rf_test = rf.predict(X_test)  # => make predictions on the test data
# __________________________________________________________________________________________________
#cdsw.track_metric("accuracy", accuracy_score(y_test, pred_rf_test))
print(accuracy_score(y_test, pred_rf_test))
# => Evaluate the model by checking the resulting accuracy score
# __________________________________________________________________________________________________
probs = rf.predict_proba(X_test)
probs = probs[:, 1]
#cdsw.track_metric("auc", roc_auc_score(y_test, probs))
print(roc_auc_score(y_test, probs))
# => Evaluate the model by looking at the Area Under the Curve (AUC)
# __________________________________________________________________________________________________
pickle.dump(rf, open("sklearn_rf_large.pkl", "wb"))
cdsw.track_file("sklearn_rf_large.pkl")
# => Save the model
# __________________________________________________________________________________________________
features = [
    "intl_plan_indexed", "account_length", "number_vmail_messages",
    "total_day_calls", "total_day_charge", "total_eve_calls",
    "total_eve_charge", "total_night_calls", "total_night_charge",
    "total_intl_calls", "total_intl_charge", "number_customer_service_calls"
]

randF = RandomForestClassifier(n_jobs=10, n_estimators=args.trees)
randF.fit(pdTrain[features], pdTrain['label'])
predictions = randF.predict(pdTest[features])

## Feature Importance
list(zip(pdTrain[features], randF.feature_importances_))

## AUROC
y_true = pdTest['label']
y_scores = predictions
auroc = roc_auc_score(y_true, y_scores)
ap = average_precision_score(y_true, y_scores)
print(auroc, ap)
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("ap", ap)

## Serialize and track model
pickle.dump(randF, open("model.pkl", "wb"))
cdsw.track_file("model.pkl")

## Stop Spark
spark.stop()
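# A minimal scoring sketch (not part of the original script), assuming the
# model.pkl file saved above and a hypothetical Pandas dataframe
# `new_customers_pd` that contains the same feature columns.
import pickle

with open("model.pkl", "rb") as f:
    loaded_rf = pickle.load(f)

churn_scores = loaded_rf.predict_proba(new_customers_pd[features])[:, 1]
print(churn_scores[:5])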