def train_and_predict(self, x_train, x_test, y_train, y_test, clf, drop_cols):
    """
    Output is a dictionary
    y_prob             => array of probability values for the prediction
    feature_importance => features ranked by their importance
    feature_weight     => weight of features
    """
    labelEncoder = preprocessing.LabelEncoder()
    labelEncoder.fit(y_train)
    if len(drop_cols) > 0:
        x_train = drop_columns(x_train, drop_cols)
        x_test = drop_columns(x_test, drop_cols)
    y_train = labelEncoder.transform(y_train)
    classes = labelEncoder.classes_
    transformed = labelEncoder.transform(classes)
    labelMapping = dict(zip(transformed, classes))
    clf = clf.fit(x_train, y_train)
    y_score = clf.predict(x_test)
    y_score = labelEncoder.inverse_transform(y_score)
    try:
        y_prob = clf.predict_proba(x_test)
    except AttributeError:
        # fall back to a placeholder when the classifier cannot produce probabilities
        y_prob = [0] * len(y_score)
    feature_importance = dict(sorted(zip(x_train.columns, clf.feature_importances_),
                                     key=lambda x: x[1], reverse=True))
    for k, v in feature_importance.items():
        feature_importance[k] = CommonUtils.round_sig(v)
    return {"trained_model": clf, "actual": y_test, "predicted": y_score,
            "probability": y_prob, "feature_importance": feature_importance,
            "featureList": list(x_train.columns), "labelMapping": labelMapping}
def train_and_predict(self, x_train, x_test, y_train, y_test, clf, plot_flag, print_flag, drop_cols):
    """
    Output is a dictionary
    y_prob             => array of probability values for the prediction
    results            => array of predicted classes
    feature_importance => features ranked by their importance
    feature_weight     => weight of features
    """
    labelEncoder = preprocessing.LabelEncoder()
    labelEncoder.fit(y_train)
    if len(drop_cols) > 0:
        x_train = drop_columns(x_train, drop_cols)
        x_test = drop_columns(x_test, drop_cols)
    y_train = labelEncoder.transform(y_train)
    classes = labelEncoder.classes_
    transformed = labelEncoder.transform(classes)
    labelMapping = dict(zip(transformed, classes))
    clf = clf.fit(x_train, y_train)
    y_score = clf.predict(x_test)
    y_score = labelEncoder.inverse_transform(y_score)
    y_prob = clf.predict_proba(x_test)
    results = pd.DataFrame({"actual": y_test, "predicted": y_score, "prob": list(y_prob)})
    feature_importance = dict(sorted(zip(x_train.columns, clf.feature_importances_),
                                     key=lambda x: x[1], reverse=True))
    for k, v in feature_importance.items():
        feature_importance[k] = CommonUtils.round_sig(v)
    # if print_flag:
    #     print("Classification Table")
    #     print(pd.crosstab(results.actual, results.predicted, rownames=['actual'], colnames=['preds']))
    #
    # fpr = dict()
    # tpr = dict()
    # roc_auc = dict()
    # fpr["response"], tpr["response"], _ = roc_curve(y_test, y_score)
    # roc_auc["response"] = auc(fpr["response"], tpr["response"])
    # if plot_flag == True:
    #     plt.figure()
    #     lw = 2
    #     plt.plot(fpr['response'], tpr['response'], color='darkorange',
    #              lw=lw, label='ROC curve (area = %0.2f)' % roc_auc['response'])
    #     plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    #     plt.xlim([0.0, 1.0])
    #     plt.ylim([0.0, 1.05])
    #     plt.xlabel('False Positive Rate')
    #     plt.ylabel('True Positive Rate')
    #     plt.title('ROC Curve')
    #     plt.legend(loc="lower right")
    #     plt.show()
    # return {"y_prob": y_prob, "results": results, "feature_importance": feature_importance,
    #         "feature_weight": importances, "auc": roc_auc["response"], "trained_model": clf}
    return {"trained_model": clf, "actual": y_test, "predicted": y_score,
            "probability": y_prob, "feature_importance": feature_importance,
            "featureList": list(x_train.columns), "labelMapping": labelMapping}
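# A minimal standalone sketch of the flow inside train_and_predict above, using
# scikit-learn's bundled iris data. The estimator choice, data, and variable
# names are illustrative assumptions, not the repository's actual call site.
# import pandas as pd
# from sklearn import preprocessing
# from sklearn.datasets import load_iris
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
#
# data = load_iris(as_frame=True)
# x_train, x_test, y_train, y_test = train_test_split(
#     data.data, data.target_names[data.target], test_size=0.25, random_state=42)
#
# labelEncoder = preprocessing.LabelEncoder()
# labelEncoder.fit(y_train)
# clf = RandomForestClassifier(n_estimators=100).fit(x_train, labelEncoder.transform(y_train))
# y_score = labelEncoder.inverse_transform(clf.predict(x_test))      # predicted class labels
# y_prob = clf.predict_proba(x_test)                                 # class probabilities
# feature_importance = dict(sorted(zip(x_train.columns, clf.feature_importances_),
#                                  key=lambda x: x[1], reverse=True))
# print(feature_importance)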
def __init__(self, left_hinge_value=0.0, q1_value=0.0, median=0.0, q3_value=0.0,
             right_hinge_value=0.0, num_left_outliers=0, num_right_outliers=0,
             q1_freq=0, q2_freq=0, q3_freq=0, q4_freq=0):
    self.splits = {
        FivePointSummary.LEFT_HINGE: utils.round_sig(left_hinge_value, 3),
        FivePointSummary.Q1: utils.round_sig(q1_value, 3),
        FivePointSummary.Q2: utils.round_sig(median, 3),
        FivePointSummary.MEDIAN: utils.round_sig(median, 3),
        FivePointSummary.Q3: utils.round_sig(q3_value, 3),
        FivePointSummary.RIGHT_HINGE: utils.round_sig(right_hinge_value, 3)
    }
    self.outliers = {
        FivePointSummary.LEFT_OUTLIERS: num_left_outliers,
        FivePointSummary.RIGHT_OUTLIERS: num_right_outliers
    }
    self.freq = {
        FivePointSummary.Q1: q1_freq,
        FivePointSummary.Q2: q2_freq,
        FivePointSummary.Q3: q3_freq,
        FivePointSummary.Q4: q4_freq
    }
def get_top_articles(self, pandasDf):
    relevantDf = pandasDf[["time", "source", "title", "overallSentiment"]]
    relevantDf["sentimentPerChange"] = relevantDf["overallSentiment"].pct_change()
    relevantDf = relevantDf.fillna(0)
    relevantDf = relevantDf.sort_values(by=["overallSentiment"], ascending=False)
    topIncrease = relevantDf.iloc[0:3]    # top 3 articles by positive sentiment
    relevantDf = relevantDf.sort_values(by=["overallSentiment"], ascending=True)
    topDecrease = relevantDf.iloc[0:3]    # top 3 articles by negative sentiment
    outDf = pd.concat([topIncrease, topDecrease])
    outDf["time"] = outDf["time"].apply(self.change_date_format)
    output = [["Date", "Source", "Title", "Sentiment", "% Increase/Decrease"]]
    for idx, dfRow in outDf.iterrows():
        row = [dfRow["time"], dfRow["source"], dfRow["title"],
               CommonUtils.round_sig(dfRow["overallSentiment"], sig=2),
               str(CommonUtils.round_sig(dfRow["sentimentPerChange"], sig=2)) + "%"]
        output.append(row)
    return output
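# A quick illustration of the pct_change call that produces the
# "% Increase/Decrease" column above; the sentiment values are made up.
# import pandas as pd
#
# sentiment = pd.Series([0.20, 0.30, 0.15])
# print(sentiment.pct_change().fillna(0))
# # 0    0.00   -> first row has no previous value, filled with 0
# # 1    0.50   -> (0.30 - 0.20) / 0.20
# # 2   -0.50   -> (0.15 - 0.30) / 0.30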
def set_summary_stats(self, num_values=0, min_value=0.0, max_value=0.0, total=0.0,
                      mean=0.0, variance=0.0, std_dev=0.0, skew=0.0, kurtosis=0.0):
    self.n = num_values
    self.min = utils.round_sig(min_value, 3)
    self.max = utils.round_sig(max_value, 3)
    self.total = utils.round_sig(total, 3)
    self.mean = utils.round_sig(mean, 3)
    self.var = utils.round_sig(variance, 3)
    self.std_dev = utils.round_sig(std_dev, 3)
    self.skew = utils.round_sig(skew, 3)
    self.kurtosis = utils.round_sig(kurtosis, 3)
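# A hypothetical feeder for set_summary_stats: compute the statistics from any
# numeric pandas Series. The Series values are illustrative and `summary` stands
# in for an instance of the class above; neither comes from the repository.
# import pandas as pd
# from scipy import stats
#
# values = pd.Series([12.5, 7.1, 9.8, 15.2, 11.0, 8.4])   # illustrative data
# stats_kwargs = dict(
#     num_values=int(values.count()),
#     min_value=float(values.min()),
#     max_value=float(values.max()),
#     total=float(values.sum()),
#     mean=float(values.mean()),
#     variance=float(values.var()),
#     std_dev=float(values.std()),
#     skew=float(stats.skew(values)),
#     kurtosis=float(stats.kurtosis(values)),
# )
# # summary.set_summary_stats(**stats_kwargs)
# print(stats_kwargs)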
def get_key_days_and_impactful_articles(self, pandasDf, stockPriceData):
    relevantDf1 = stockPriceData[["date", "closePerChange"]]
    relevantDf1.columns = ["time", "closePerChange"]
    relevantDf2 = pandasDf[["time", "source", "title", "overallSentiment"]]
    merged = pd.merge(relevantDf2, relevantDf1, on="time", how="inner")
    merged = merged.sort_values(by=["closePerChange"], ascending=False)
    topIncrease = merged.iloc[0:2]    # top 2 days by price increase
    merged = merged.sort_values(by=["closePerChange"], ascending=True)
    topDecrease = merged.iloc[0:2]    # top 2 days by price decrease
    outDf = pd.concat([topIncrease, topDecrease])
    outDf["time"] = outDf["time"].apply(self.change_date_format)
    output = [["Date", "% Increase/Decrease in Stock Price", "Source", "Title", "Sentiment"]]
    for idx, dfRow in outDf.iterrows():
        row = [dfRow["time"],
               str(CommonUtils.round_sig(dfRow["closePerChange"], sig=2)) + "%",
               dfRow["source"], dfRow["title"],
               CommonUtils.round_sig(dfRow["overallSentiment"], sig=2)]
        output.append(row)
    return output
def Train(self):
    st_global = time.time()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun
                   if x["algorithmSlug"] == GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"]][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]
    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns,
                                                  result_column, algoType="regression")
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    # map each index of the assembled feature vector back to its original column name
    featureMapping = sorted((attr["idx"], attr["name"]) for attr in
                            chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values()))
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
    glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol="features",
                                        predictionCol="prediction")
    if validationDict["name"] == "kFold":
        defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
        numFold = int(validationDict["value"])
        if numFold == 0:
            numFold = 3
        trainingData, validationData = indexed.randomSplit([defaultSplit, 1 - defaultSplit], seed=12345)
        paramGrid = ParamGridBuilder() \
            .addGrid(glinr.regParam, [0.1, 0.01]) \
            .addGrid(glinr.fitIntercept, [False, True]) \
            .build()
        crossval = CrossValidator(estimator=glinr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=RegressionEvaluator(predictionCol="prediction",
                                                                labelCol=result_column),
                                  numFolds=numFold)
        st = time.time()
        cvModel = crossval.fit(indexed)
        trainingTime = time.time() - st
        print("cvModel training takes", trainingTime)
        bestModel = cvModel.bestModel
    elif validationDict["name"] == "trainAndtest":
        trainingData, validationData = indexed.randomSplit(
            [float(validationDict["value"]), 1 - float(validationDict["value"])], seed=12345)
        st = time.time()
        fit = glinr.fit(trainingData)
        trainingTime = time.time() - st
        print("time to train", trainingTime)
        bestModel = fit
    print(bestModel.explainParams())
    print(bestModel.extractParamMap())
    print(bestModel.params)
    print("Best Param (regParam): ", bestModel._java_obj.getRegParam())
    print("Best Param (MaxIter): ", bestModel._java_obj.getMaxIter())
    # modelPmmlPipeline = PMMLPipeline([
    #     ("pretrained-estimator", objs["trained_model"])
    # ])
    # try:
    #     modelPmmlPipeline.target_field = result_column
    #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
    #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
    #     pmmlfile = open(pmml_filepath, "r")
    #     pmmlText = pmmlfile.read()
    #     pmmlfile.close()
    #     self._result_setter.update_pmml_object({self._slug: pmmlText})
    # except:
    #     pass
    coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
    MLUtils.save_pipeline_or_model(bestModel, model_filepath)
    transformed = bestModel.transform(validationData)
    transformed = transformed.withColumn(result_column, transformed[result_column].cast(DoubleType()))
    transformed = transformed.select([result_column, "prediction",
                                      transformed[result_column] - transformed["prediction"]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
    transformed = transformed.select([result_column, "prediction", "difference",
                                      FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")
    sampleData = None
    nrows = transformed.count()
    if nrows > 100:
        sampleData = transformed.sample(False, float(100) / nrows, seed=420)
    else:
        sampleData = transformed
    sampleData.show()
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
    metrics = {}
    metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
    metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
    metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
    metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
    runtime = round((time.time() - st_global), 2)
    mapeDf = transformed.select("mape")
    mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
    mapeStatsArr = sorted(mapeStats.items(), key=lambda x: int(x[0]))
    quantileDf = transformed.select("prediction")
    quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
    quantileSummaryArr = sorted(quantileSummaryDict.items(), key=lambda x: int(x[0]))
    self._model_summary.set_model_type("regression")
    self._model_summary.set_algorithm_name("Generalized Linear Regression")
    self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_training_time(runtime)
    # overrides the overall runtime with the fit-only training time
    self._model_summary.set_training_time(trainingTime)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_validation_method(validationDict["displayName"])
    self._model_summary.set_model_evaluation_metrics(metrics)
    # `bestEstimator` was undefined here; report the fitted Spark model's params instead
    self._model_summary.set_model_params({param.name: value for param, value in
                                          bestModel.extractParamMap().items()})
    self._model_summary.set_quantile_summary(quantileSummaryArr)
    self._model_summary.set_mape_stats(mapeStatsArr)
    self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
    self._model_summary.set_coefficinets_array(coefficientsArray)
    # `x_train` does not exist in the Spark path; use the assembled feature names
    self._model_summary.set_feature_list([name for idx, name in featureMapping])
    modelSummaryJson = {
        "dropdown": {
            "name": self._model_summary.get_algorithm_name(),
            "accuracy": CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
            "slug": self._model_summary.get_slug()
        },
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict()
    }
    glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                  for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
    for card in glinrCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({
        "generalizedlinearregression": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))
    })
    self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
    self._result_setter.set_glinr_cards(glinrCards)
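# A minimal, self-contained sketch of the k-fold tuning step used in Train()
# above, on toy data; the numbers, column names, and local Spark session are
# illustrative assumptions, not the pipeline's actual inputs.
# from pyspark.sql import SparkSession
# from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.regression import GeneralizedLinearRegression
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
#
# spark = SparkSession.builder.master("local[1]").getOrCreate()
# df = spark.createDataFrame(
#     [(1.0, 2.0, 5.1), (2.0, 1.0, 6.9), (3.0, 4.0, 12.2), (4.0, 3.0, 13.8),
#      (5.0, 6.0, 19.0), (6.0, 5.0, 21.1)],
#     ["f1", "f2", "target"])
# indexed = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(df)
#
# glinr = GeneralizedLinearRegression(labelCol="target", featuresCol="features",
#                                     predictionCol="prediction")
# paramGrid = (ParamGridBuilder()
#              .addGrid(glinr.regParam, [0.1, 0.01])
#              .addGrid(glinr.fitIntercept, [False, True])
#              .build())
# crossval = CrossValidator(estimator=glinr, estimatorParamMaps=paramGrid,
#                           evaluator=RegressionEvaluator(predictionCol="prediction",
#                                                         labelCol="target"),
#                           numFolds=3)
# cvModel = crossval.fit(indexed)          # selects the grid point with the best RMSE
# print(cvModel.bestModel.coefficients)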
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
        "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame
    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        pmml_filepath = str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml"
        x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
        x_train = MLUtils.create_dummy_columns(
            x_train, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.create_dummy_columns(
            x_test, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
            "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        st = time.time()
        levels = df[result_column].unique()
        clf = SVC(kernel="linear", probability=True)
        labelEncoder = preprocessing.LabelEncoder()
        labelEncoder.fit(np.concatenate([y_train, y_test]))
        y_train = pd.Series(labelEncoder.transform(y_train))
        y_test = labelEncoder.transform(y_test)
        classes = labelEncoder.classes_
        transformed = labelEncoder.transform(classes)
        labelMapping = dict(list(zip(transformed, classes)))
        inverseLabelMapping = dict(list(zip(classes, transformed)))
        posLabel = inverseLabelMapping[self._targetLevel]
        appType = self._dataframe_context.get_app_type()
        print(appType, labelMapping, inverseLabelMapping, posLabel, self._targetLevel)
        if algoSetting.is_hyperparameter_tuning_enabled():
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
            params_grid = algoSetting.get_params_dict_hyperparameter()
            params_grid = {k: v for k, v in list(params_grid.items()) if k in clf.get_params()}
            print(params_grid)
            if hyperParamAlgoName == "gridsearchcv":
                clfGrid = GridSearchCV(clf, params_grid)
                gridParams = clfGrid.get_params()
                hyperParamInitParam = {k: v for k, v in list(hyperParamInitParam.items())
                                       if k in gridParams}
                clfGrid.set_params(**hyperParamInitParam)
                # clfGrid.fit(x_train, y_train)
                grid_param = {}
                grid_param["params"] = ParameterGrid(params_grid)
                # bestEstimator = clfGrid.best_estimator_
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                sklearnHyperParameterResultObj = SklearnGridSearchResult(
                    grid_param, clf, x_train, x_test, y_train, y_test, appType,
                    modelFilepath, levels, posLabel, evaluationMetricDict)
                resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                    "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(),
                    "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(),
                    "metricColName": sklearnHyperParameterResultObj.get_comparison_metric_colname(),
                    "columnOrder": sklearnHyperParameterResultObj.get_keep_columns()
                })
            elif hyperParamAlgoName == "randomsearchcv":
                clfRand = RandomizedSearchCV(clf, params_grid)
                clfRand.set_params(**hyperParamInitParam)
                bestEstimator = None
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            algoParams = algoSetting.get_params_dict()
            algoParams = {k: v for k, v in list(algoParams.items())
                          if k in list(clf.get_params().keys())}
            clf.set_params(**algoParams)
            print("!" * 50)
            print(clf.get_params())
            print("!" * 50)
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                kFoldClass = SkleanrKFoldResult(
                    numFold, clf, x_train, x_test, y_train, y_test, appType, levels, posLabel,
                    evaluationMetricDict=evaluationMetricDict)
                kFoldClass.train_and_save_result()
                kFoldOutput = kFoldClass.get_kfold_result()
                bestEstimator = kFoldClass.get_best_estimator()
            elif validationDict["name"] == "trainAndtest":
                clf.fit(x_train, y_train)
                bestEstimator = clf
        # clf.fit(x_train, y_train)
        # bestEstimator = clf
        trainingTime = time.time() - st
        y_score = bestEstimator.predict(x_test)
        try:
            y_prob = bestEstimator.predict_proba(x_test)
        except:
            y_prob = [0] * len(y_score)
        # overall_precision_recall = MLUtils.calculate_overall_precision_recall(
        #     y_test, y_score, targetLevel=self._targetLevel)
        accuracy = metrics.accuracy_score(y_test, y_score)
        if len(levels) <= 2:
            precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="binary")
            recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="binary")
            auc = metrics.roc_auc_score(y_test, y_score)
        elif len(levels) > 2:
            precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="macro")
            recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="macro")
            # auc = metrics.roc_auc_score(y_test, y_score, average="weighted")
            auc = None
        y_score = labelEncoder.inverse_transform(y_score)
        y_test = labelEncoder.inverse_transform(y_test)
        featureImportance = {}
        # SVC does not expose feature_importances_; fall back to the empty dict
        # above instead of failing when the attribute is missing.
        try:
            feature_importance = dict(sorted(zip(x_train.columns, bestEstimator.feature_importances_),
                                             key=lambda x: x[1], reverse=True))
            for k, v in feature_importance.items():
                feature_importance[k] = CommonUtils.round_sig(v)
        except AttributeError:
            feature_importance = featureImportance
        objs = {
            "trained_model": bestEstimator,
            "actual": y_test,
            "predicted": y_score,
            "probability": y_prob,
            "feature_importance": feature_importance,
            "featureList": list(x_train.columns),
            "labelMapping": labelMapping
        }
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".pkl")
            joblib.dump(objs["trained_model"], "/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)
        try:
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except:
            pass
        cat_cols = list(set(categorical_columns) - {result_column})
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"], targetLevel=self._targetLevel)
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Svm")
        self._model_summary.set_algorithm_display_name("Support Vector Machine")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(
            MLUtils.calculate_confusion_matrix(objs["actual"], objs["predicted"]))
        self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(
            round(metrics.accuracy_score(objs["actual"], objs["predicted"]), 2))
        # overrides the overall runtime with the fit-only training time
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats(overall_precision_recall["classwise_stats"])
        self._model_summary.set_model_precision(overall_precision_recall["precision"])
        self._model_summary.set_model_recall(overall_precision_recall["recall"])
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(overall_precision_recall["prediction_split"])
        self._model_summary.set_validation_method("Train and Test")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        # self._model_summary.set_model_features(list(set(x_train.columns) - set([result_column])))
        self._model_summary.set_model_features(
            [col for col in x_train.columns if col != result_column])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(list(set(categorical_columns))))
        self._model_summary.set_num_trees(100)
        self._model_summary.set_num_rules(300)
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": self._model_summary.get_model_accuracy(),
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": resultArray[0]["Accuracy"],
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        svmCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                    for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
        for card in svmCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({
            "svm": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_svm_model_summary(modelSummaryJson)
        self._result_setter.set_rf_cards(svmCards)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
            "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
def train_and_save_models(self):
    tableOutput = []
    evaluationMetric = self.evaluationMetricDict["name"]
    for idx, paramsObj in enumerate(self.resultDf["params"]):
        st = time.time()
        estimator = self.estimator.set_params(**paramsObj)
        estimator.fit(self.x_train, self.y_train)
        y_score = estimator.predict(self.x_test)
        modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - len(str(idx + 1))) + str(idx + 1)
        print("#" * 100)
        print("Feature Importance ", modelName)
        try:
            print(estimator.feature_importances_)
        except AttributeError:
            print("Feature Importance Not Defined")
        print("#" * 100)
        slug = self.modelFilepath.split("/")[-1]
        algoName = GLOBALSETTINGS.SLUG_MODEL_DISPLAY_NAME_MAPPING[slug]
        joblib.dump(estimator, self.modelFilepath + "/" + modelName + ".pkl")
        row = {
            "Model Id": modelName,
            "Slug": slug,
            "Selected": "False",
            "alwaysSelected": "False",
            "Run Time(Secs)": CommonUtils.round_sig(time.time() - st),
            "comparisonMetricUsed": None,
            "algorithmName": algoName
        }
        # row = {"Model Id": modelName, "Slug": slug, "Selected": "False",
        #        "Run Time(Secs)": str(CommonUtils.round_sig(time.time() - st))}
        algoEvaluationMetrics = {}
        if self.appType == "REGRESSION":
            algoEvaluationMetrics["R-Squared"] = metrics.r2_score(self.y_test, y_score)
            algoEvaluationMetrics["MSE"] = metrics.mean_squared_error(self.y_test, y_score)
            algoEvaluationMetrics["MAE"] = metrics.mean_absolute_error(self.y_test, y_score)
            algoEvaluationMetrics["RMSE"] = sqrt(algoEvaluationMetrics["MSE"])
            row["comparisonMetricUsed"] = self.evaluationMetricDict["displayName"]
        elif self.appType == "CLASSIFICATION":
            algoEvaluationMetrics["Accuracy"] = metrics.accuracy_score(self.y_test, y_score)
            row["comparisonMetricUsed"] = self.evaluationMetricDict["displayName"]
            if len(self.levels) <= 2:
                algoEvaluationMetrics["Precision"] = metrics.precision_score(
                    self.y_test, y_score, pos_label=self.posLabel, average="binary")
                algoEvaluationMetrics["Recall"] = metrics.recall_score(
                    self.y_test, y_score, pos_label=self.posLabel, average="binary")
                algoEvaluationMetrics["ROC-AUC"] = metrics.roc_auc_score(self.y_test, y_score)
            elif len(self.levels) > 2:
                algoEvaluationMetrics["Precision"] = metrics.precision_score(
                    self.y_test, y_score, pos_label=self.posLabel, average="macro")
                algoEvaluationMetrics["Recall"] = metrics.recall_score(
                    self.y_test, y_score, pos_label=self.posLabel, average="macro")
                algoEvaluationMetrics["ROC-AUC"] = "NA"
        algoEvaluationMetrics = {k: CommonUtils.round_sig(v)
                                 for k, v in algoEvaluationMetrics.items()}
        # algoEvaluationMetrics = {k: str(CommonUtils.round_sig(v))
        #                          for k, v in algoEvaluationMetrics.items()}
        row.update(algoEvaluationMetrics)
        paramsObj = dict([(k, str(v)) if (v is None) | (v in [True, False]) else (k, v)
                          for k, v in paramsObj.items()])
        row.update(paramsObj)
        tableOutput.append(row)
    if self.appType == "REGRESSION":
        if self.evaluationMetricDict["name"] == "r2":
            tableOutput = sorted(tableOutput,
                                 key=lambda x: float(x[tableOutput[0]["comparisonMetricUsed"]]),
                                 reverse=True)
        else:
            tableOutput = sorted(tableOutput,
                                 key=lambda x: float(x[tableOutput[0]["comparisonMetricUsed"]]),
                                 reverse=False)
    elif self.appType == "CLASSIFICATION":
        if (len(self.levels) > 2) & (self.evaluationMetricDict["name"] == "roc_auc"):
            defaultComparisonMetric = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC]
            tableOutput = sorted(tableOutput,
                                 key=lambda x: float(x[defaultComparisonMetric]),
                                 reverse=True)
        else:
            tableOutput = sorted(tableOutput,
                                 key=lambda x: float(x[tableOutput[0]["comparisonMetricUsed"]]),
                                 reverse=True)
    if self.appType == "REGRESSION":
        self.keepColumns += ["RMSE", "MAE", "MSE", "R-Squared"]
    elif self.appType == "CLASSIFICATION":
        self.keepColumns += ["Accuracy", "Precision", "Recall", "ROC-AUC"]
    self.keepColumns += list(paramsObj.keys())
    self.keepColumns.append("Selected")
    bestMod = tableOutput[0]
    bestMod["Selected"] = "True"
    bestMod["alwaysSelected"] = "True"
    tableOutput[0] = bestMod
    return tableOutput
        except Exception as e:
            stockDict.pop(current_stock, None)
            print("Failed for : ", current_stock, " with error : ", str(e))
    self._stockNameList = working_stock_list
    number_stocks = len(self._stockNameList)
    if number_stocks == 0:
        return {}
    stockPriceTrendArray = []
    dateList = stockPriceTrendDict[self._stockNameList[0]].keys()
    stockPriceTrendArray = stockPriceTrendDict[self._stockNameList[0]].items()
    capNameList = [self.get_capitalized_name(x) for x in self._stockNameList]
    capNameDict = dict(zip(self._stockNameList, capNameList))
    stockPriceTrendArray = [{"date": obj[0], capNameList[0]: CommonUtils.round_sig(obj[1], sig=2)}
                            for obj in stockPriceTrendArray]
    for obj in stockPriceTrendArray:
        for stockName in self._stockNameList[1:]:
            stock_price_dates = stockPriceTrendDict[stockName].keys()
            if obj["date"] not in stock_price_dates:
                # fill missing dates with the stock's average price change (0.0 if no data)
                if len(stock_price_dates) > 0:
                    stockPriceTrendDict[stockName][obj["date"]] = sum(
                        [stockPriceTrendDict[stockName][key] for key in stock_price_dates]
                    ) / len(stock_price_dates)
                else:
                    stockPriceTrendDict[stockName][obj["date"]] = 0.0
            obj.update({capNameDict[stockName]:
                        CommonUtils.round_sig(stockPriceTrendDict[stockName][obj["date"]], sig=2)})