def build_sentiment(classifier, name, with_proba=True):
    """Fit a TF-IDF text pipeline around ``classifier`` and store its outputs.

    The pipeline is fitted on the module-level ``sentiment_X`` /
    ``sentiment_y`` data, passed to ``store_pkl`` as ``name + ".pkl"``, and
    its predictions on the same data are passed to ``store_csv`` as
    ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability(0)`` and
            ``probability(1)``.
    """
    # Random forests get float32 TF-IDF features; every other classifier
    # gets float64.
    if isinstance(classifier, RandomForestClassifier):
        tfidf_dtype = numpy.float32
    else:
        tfidf_dtype = numpy.float64

    vectorizer = TfidfVectorizer(
        analyzer="word",
        preprocessor=None,
        strip_accents=None,
        lowercase=True,
        token_pattern=None,
        tokenizer=Splitter(),
        stop_words="english",
        ngram_range=(1, 2),
        norm=None,
        dtype=tfidf_dtype,
    )
    selector = SelectorProxy(SelectPercentile(chi2, percentile=10))

    pipeline = PMMLPipeline([
        ("tf-idf", vectorizer),
        ("selector", selector),
        ("classifier", classifier),
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")

    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:
        score_proba = DataFrame(
            pipeline.predict_proba(sentiment_X),
            columns=["probability(0)", "probability(1)"],
        )
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
def build_audit(classifier, name, with_proba=True):
    """Fit a ``DataFrameMapper`` + ``classifier`` pipeline on the audit data.

    The pipeline is fitted on the module-level ``audit_X`` / ``audit_y``
    data, passed to ``store_pkl`` as ``name + ".pkl"``, and its predictions
    on the same data are passed to ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability_0`` and
            ``probability_1``.
    """
    # Employment uses SelectFromModel directly, without a SelectorProxy
    # wrapper; Education wraps it in SelectorProxy.
    employment_steps = [
        LabelBinarizer(),
        SelectFromModel(
            EstimatorProxy(DecisionTreeClassifier(random_state=13)),
            threshold="1.25 * mean",
        ),
    ]
    education_steps = [
        LabelBinarizer(),
        SelectorProxy(
            SelectFromModel(
                EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)
                ),
                threshold="median",
            )
        ),
    ]

    # The order of the entries defines the order of the mapped features.
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", employment_steps),
        ("Education", education_steps),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain()),
    ])

    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability_0", "probability_1"],
        )
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_iris(classifier, name, with_proba=True):
    """Fit a feature-union / scaler / PCA pipeline around ``classifier``.

    All columns of the module-level ``iris_X`` are fed through two mappers
    (one applying ``ContinuousDomain``, one applying ``numpy.log10``) whose
    outputs are combined by a ``FeatureUnion``, then scaled with
    ``RobustScaler`` and reduced with ``IncrementalPCA``. The fitted pipeline
    is passed to ``store_pkl`` as ``name + ".pkl"`` and its predictions on
    ``iris_X`` to ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the three ``predict_proba`` columns are
            appended as ``probability(setosa)``, ``probability(versicolor)``
            and ``probability(virginica)``.
    """
    normal_scale = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    log_scale = DataFrameMapper([
        (iris_X.columns.values, FunctionTransformer(numpy.log10)),
    ])
    union = FeatureUnion([
        ("normal_scale", normal_scale),
        ("log_scale", log_scale),
    ])

    pipeline = PMMLPipeline([
        ("union", union),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier),
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")

    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(
            pipeline.predict_proba(iris_X),
            columns=[
                "probability(setosa)",
                "probability(versicolor)",
                "probability(virginica)",
            ],
        )
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit a continuous + categorical feature-union pipeline on the audit data.

    Continuous columns go through ``ContinuousDomain``; categorical columns
    go through their own mapper followed by ``PolynomialFeatures``. After
    fitting on the module-level ``audit_X`` / ``audit_y`` data,
    ``customize(classifier, **kwargs)`` is called, the pipeline is passed to
    ``store_pkl`` as ``name + ".pkl"`` and its predictions on ``audit_X`` to
    ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability(0)`` and
            ``probability(1)``.
        **kwargs: Forwarded unchanged to ``customize`` after fitting.
    """
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain()),
    ])

    employment_selector = SelectorProxy(
        SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state=13)))
    )
    education_selector = SelectorProxy(
        SelectFromModel(
            EstimatorProxy(
                RandomForestClassifier(random_state=13, n_estimators=3)
            ),
            threshold="1.25 * mean",
        )
    )
    categorical_mapper = DataFrameMapper([
        ("Employment", [CategoricalDomain(), LabelBinarizer(), employment_selector]),
        ("Education", [CategoricalDomain(), LabelBinarizer(), education_selector]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3),
        ]),
        ("Occupation", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectorProxy(SelectKBest(k=3)),
        ]),
        ("Gender", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-3, pos_label=3),
        ]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])

    categorical_branch = Pipeline([
        ("mapper", categorical_mapper),
        ("polynomial", PolynomialFeatures()),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", categorical_branch),
        ])),
        ("classifier", classifier),
    ])
    pipeline.fit(audit_X, audit_y)
    # customize() runs on the already-fitted classifier, before storing.
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"],
        )
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_audit_dict(classifier, name, with_proba=True):
    """Fit a ``DictVectorizer`` + ``classifier`` pipeline on the audit dicts.

    The pipeline is fitted on the module-level ``audit_dict_X`` / ``audit_y``
    data, passed to ``store_pkl`` as ``name + ".pkl"``, and its predictions
    on ``audit_dict_X`` are passed to ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability(0)`` and
            ``probability(1)``.
    """
    pipeline = PMMLPipeline([
        ("dict-transformer", DictVectorizer()),
        ("classifier", classifier),
    ])
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_dict_X),
            columns=["probability(0)", "probability(1)"],
        )
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
class XgbModel(object):
    """Train, evaluate and export an ``XGBClassifier`` wrapped in a ``PMMLPipeline``.

    Typical call order: ``train()``, ``predict()``, then any of
    ``evaluate()``, ``evaluate_output()``, ``feature_importances()`` or
    ``xgbmodel_output()``.

    Args:
        train: Training features passed to ``fit``.
        train_label: Training labels passed to ``fit``.
        test: Test features passed to ``predict`` / ``predict_proba``.
        test_label: Test labels; ``evaluate_output`` calls ``.reshape`` on
            them, so they must support it (e.g. a NumPy array).
    """

    def __init__(self, train, train_label, test, test_label):
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        # Populated by train() / predict() / the reporting methods.
        self.__bst = None
        self.__feat_imp = None
        self.__test_preds = None
        self.__test_predictions = None
        self.__output = None

    def train(self):
        """Fit a single-step ``PMMLPipeline`` holding a binary-logistic XGBoost model."""
        estimator = XGBClassifier(objective="binary:logistic")
        self.__bst = PMMLPipeline([("estimator", estimator)])
        # "estimator__" routes the fit parameter to the pipeline step of that name.
        self.__bst.fit(
            self.__train, self.__train_label, estimator__eval_metric="auc"
        )

    def predict(self):
        """Store the second ``predict_proba`` column and the class predictions for the test set."""
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self, feature_names=None):
        """Plot the fitted model's feature importances as a bar chart.

        Args:
            feature_names: Index labels for the importances. Defaults to
                ``["gbc", "rf", "ab", "lr"]``.
        """
        if feature_names is None:
            feature_names = ["gbc", "rf", "ab", "lr"]
        # A Pipeline does not expose ``feature_importances_`` itself, so read
        # it from the fitted final step.
        importances = self.__bst.named_steps["estimator"].feature_importances_
        self.__feat_imp = pd.Series(importances, feature_names).sort_values(
            ascending=False
        )
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        """Print ROC AUC and accuracy for the stored test-set predictions."""
        print("auc : %.4f" % roc_auc_score(self.__test_label, self.__test_preds))
        print(
            "accuracy score : %.4f"
            % accuracy_score(self.__test_label, self.__test_predictions)
        )

    def evaluate_output(self, path="C:\\Users\\Dell\\Desktop\\output.csv"):
        """Write test features, labels and predicted probabilities to a CSV file.

        Args:
            path: Destination CSV path. Defaults to the previously
                hard-coded location.
        """
        # Column layout: test features, then the label, then the probability.
        self.__output = np.hstack((
            self.__test,
            self.__test_label.reshape((-1, 1)),
            self.__test_preds.reshape((-1, 1)),
        ))
        pd.DataFrame(self.__output).to_csv(path)

    def xgbmodel_output(self, path="C:\\Users\\Dell\\Desktop\\bstML.pkl.z"):
        """Dump the fitted pipeline with ``joblib`` (compressed).

        Args:
            path: Destination file path. Defaults to the previously
                hard-coded location.
        """
        joblib.dump(self.__bst, path, compress=True)
def build_versicolor(classifier, name, with_proba=True):
    """Fit a scaled polynomial-feature pipeline on the versicolor data.

    Every column in ``versicolor_columns`` except the last is mapped through
    ``ContinuousDomain`` and ``RobustScaler``, expanded with degree-3
    ``PolynomialFeatures`` and passed through ``SelectKBest(k="all")``. The
    pipeline is fitted on the module-level ``versicolor_X`` /
    ``versicolor_y`` data, passed to ``store_pkl`` as ``name + ".pkl"``, and
    its predictions on ``versicolor_X`` to ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability_0`` and
            ``probability_1``.
    """
    mapper = DataFrameMapper([
        (versicolor_columns[:-1], [ContinuousDomain(), RobustScaler()]),
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer", PolynomialFeatures(degree=3)),
        ("selector", SelectKBest(k="all")),
        ("classifier", classifier),
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    store_pkl(pipeline, name + ".pkl")

    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(
            pipeline.predict_proba(versicolor_X),
            columns=["probability_0", "probability_1"],
        )
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    """Fit an imputing mapper + ``classifier`` pipeline on the audit-NA data.

    Continuous columns go through ``ContinuousDomain`` and ``Imputer``;
    categorical columns go through ``CategoricalDomain``,
    ``CategoricalImputer`` and ``PMMLLabelBinarizer``. The pipeline is fitted
    on the module-level ``audit_na_X`` / ``audit_na_y`` data, passed to
    ``store_pkl`` as ``name + ".pkl"``, and its predictions on ``audit_na_X``
    to ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability(0)`` and
            ``probability(1)``.
    """
    continuous_columns = ["Age", "Income", "Hours"]
    categorical_columns = [
        "Employment", "Education", "Marital", "Occupation", "Gender",
    ]

    # Each column gets its own freshly constructed transformer chain.
    continuous_features = [
        ([column], [ContinuousDomain(missing_values=None), Imputer()])
        for column in continuous_columns
    ]
    categorical_features = [
        ([column], [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            PMMLLabelBinarizer(),
        ])
        for column in categorical_columns
    ]
    mapper = DataFrameMapper(continuous_features + categorical_features)

    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"],
        )
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_iris(classifier, name, with_proba=True):
    """Fit a mapper / scaler / PCA pipeline around ``classifier`` on the iris data.

    All columns of the module-level ``iris_X`` are mapped through
    ``ContinuousDomain``, scaled with ``RobustScaler`` and reduced with
    ``IncrementalPCA``. The fitted pipeline is passed to ``store_pkl`` as
    ``name + ".pkl"`` and its predictions on ``iris_X`` to ``store_csv`` as
    ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the three ``predict_proba`` columns are
            appended as ``probability_setosa``, ``probability_versicolor``
            and ``probability_virginica``.
    """
    mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier),
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")

    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(
            pipeline.predict_proba(iris_X),
            columns=[
                "probability_setosa",
                "probability_versicolor",
                "probability_virginica",
            ],
        )
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    """Fit an imputing + lookup mapper pipeline on the audit-NA data.

    Continuous columns are imputed; ``Employment`` is imputed, mapped through
    a lookup table (default ``"Other"``) and binarized; ``Education``,
    ``Marital`` and ``Occupation`` are imputed and binarized; ``Gender`` is
    imputed and mapped to 0/1 through a lookup table (default ``None``). The
    pipeline is fitted on the module-level ``audit_na_X`` / ``audit_na_y``
    data, passed to ``store_pkl`` as ``name + ".pkl"``, and its predictions
    on ``audit_na_X`` to ``store_csv`` as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step.
        name: Base name (without extension) for the stored artifacts.
        with_proba: When truthy, the two ``predict_proba`` columns are
            appended to the stored predictions as ``probability(0)`` and
            ``probability(1)``.
    """
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private",
    }
    gender_mapping = {"Female": 0, "Male": 1}

    continuous_features = [
        ([column], [ContinuousDomain(missing_values=None), Imputer()])
        for column in ["Age", "Income", "Hours"]
    ]
    employment_feature = [
        ("Employment", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(employment_mapping, "Other"),
            PMMLLabelBinarizer(),
        ]),
    ]
    binarized_features = [
        ([column], [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            PMMLLabelBinarizer(),
        ])
        for column in ["Education", "Marital", "Occupation"]
    ]
    gender_feature = [
        ("Gender", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(gender_mapping, None),
        ]),
    ]
    # The concatenation order defines the order of the mapped features.
    mapper = DataFrameMapper(
        continuous_features
        + employment_feature
        + binarized_features
        + gender_feature
    )

    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"],
        )
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def PMML_creation(train_path, test_path, pmml_predictions, pmml_path, path, acct_id):
    """Refit an account's saved model inside a PMML pipeline and export it.

    Steps, in order:

    1. Read the training CSV (``train_path``) and the test CSV (``test_path``).
    2. Load the estimator stored at
       ``path + '/account_' + acct_id + '/trained_model/model.pkl'`` and
       build a new estimator of the same type with the same parameters.
    3. Fit a ``PMMLPipeline`` and a plain ``Pipeline`` on the training
       features and the ``output`` column.
    4. Write the test data plus predictions and class probabilities to
       ``pmml_predictions``.
    5. Export ``pmml_path + '_PIPELINED.pmml'`` and
       ``pmml_path + '_PIPELINED.pkl'`` and log the PMML file size.

    Args:
        train_path: Path of the training CSV; must contain the feature
            columns and an ``output`` column.
        test_path: Path of the test CSV; must contain the feature columns.
        pmml_predictions: Destination CSV path for the test predictions.
        pmml_path: Path prefix for the exported ``.pmml`` and ``.pkl`` files.
        path: Base directory that holds the ``account_<acct_id>`` folders.
        acct_id: Account identifier; concatenated into the model path, so it
            must be a string.
    """
    logging.info('PMML creation Started.')
    data = pd.read_csv(train_path)
    data2 = pd.read_csv(test_path)

    features = [
        'avg_delay_categorical',
        'variance_categorical',
        'LMH_cumulative',
        'avg_of_invoices_closed',
        'avg_of_all_delays',
        'payment_count_quarter_q1',
        'payment_count_quarter_q2',
        'payment_count_quarter_q3',
        'payment_count_quarter_q4',
        'invoice_count_quarter_q1',
        'invoice_count_quarter_q2',
        'invoice_count_quarter_q3',
        'invoice_count_quarter_q4',
        'number_invoices_closed',
    ]

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code; only load model files from a trusted location.
    model = joblib.load(path + '/account_' + acct_id + '/trained_model/model.pkl')

    # Recreate an unfitted estimator of the same class with the same
    # parameters as the saved model.
    params = model.get_params()
    classifier = type(model)()
    rf = classifier.set_params(**params)

    print("-" * 100)
    print(rf)
    print((rf.get_params()))
    print("-" * 100)
    print(model.get_params())

    # Every feature is passed through untransformed (transformer None).
    mapper = DataFrameMapper([(feature, None) for feature in features])

    labels = data.loc[:, 'output']
    labels.name = 'output'
    data = data[features].astype('double')
    print(data.dtypes)

    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", rf)])
    pickle_pipeline = Pipeline([("mapper", mapper), ("model", rf)])
    # Both pipelines share the same mapper and estimator objects, so the
    # second fit re-fits those same instances.
    pipeline.fit(data, labels)
    pickle_pipeline.fit(data, labels)

    predictions = pipeline.predict(data2[features])
    predictions_prob = pipeline.predict_proba(data2[features])
    data2['PMML_predictions'] = predictions
    # Assign whole probability columns at once instead of cell by cell.
    data2['PMML_pred_proba_0'] = predictions_prob[:, 0]
    data2['PMML_pred_proba_1'] = predictions_prob[:, 1]
    data2.to_csv(pmml_predictions, index=False)

    pmml_file = pmml_path + '_PIPELINED' + ".pmml"
    sklearn2pmml(pipeline, pmml_file, debug=True)
    joblib.dump(pickle_pipeline, pmml_path + "_PIPELINED.pkl")
    # Report the size of the file that was actually written above (the
    # previous code looked at pmml_path + ".pmml").
    logging.info('PMML created of size ' + str(file_size(pmml_file)))
'FIN_KUNNR': 'payer', 'FIN_PAID_AMT': 'paid_amount' }) # test['ship_to'] = test['ship_to'].astype('str').str.split('.').str[0] # # test_transformations = pd.DataFrame(mapper.fit_transform(test),columns=['create_minus_claim_date', 'category_history', 'cal_cust_history', 'ZZ_CLAIMDATE_SIMP_DT_month', 'ship_to_history', 'original_with_avg_dispute', 'rank_xref_in_kunnr', 'b_value', 'rank_kunwe_in_kunnr']) # # test_transformations.to_csv('test_transformations.csv') test_result = pd.DataFrame() test_result['output'] = pipeline.predict(test) # test_result['predict_proba1'] = pipeline.predict_proba(test)[:, 0] test_result['predict_proba2'] = pipeline.predict_proba(test)[:, 1] test_result['actual_result'] = test['labels'] from sklearn.metrics import classification_report print( classification_report(test_result['actual_result'], test_result['output'])) from sklearn2pmml import sklearn2pmml # # #sklearn2pmml(pipeline, "only_b_value.pmml", user_classpath=[r"D:\jesus\sap\sklearn2pmml-plugin-1.0-SNAPSHOT.jar"],debug=True) # sklearn2pmml( pipeline,
# Fit the pipeline on the training frame, using its 'labels' column as target.
pipeline.fit(train, train['labels'])

# Load the validation set.
#test = pd.read_csv(r'Data/UDM_DISPUTE_20171231-20180202.csv')
test = pd.read_csv('validation.csv')

# A row is flagged when the paid amount exceeds 1% of the original amount;
# flagged rows get label -1, all others label 1.
test['main_output'] = (test['FIN_PAID_AMT'] > (0.01 * test['FIN_ORIGINAL_AMT']))
test['labels'] = test['main_output'].map({True: -1, False: 1})

# Rename the raw columns to the names used by the rest of the script.
test = test.rename(columns={
    'ZZ_CLAIMDATE_SIMP_DT': 'customer_claim_date',
    'CREATE_TIME': 'deduction_created_date',
    'ZZ_XREF3': 'product_category',
    'KUNWE': 'ship_to',
    'FIN_ORIGINAL_AMT': 'original_dispute_amount',
    'FIN_KUNNR': 'payer',
    'FIN_PAID_AMT': 'paid_amount',
})

from sklearn.linear_model import LogisticRegression

# Score only the first 1790 validation rows.
test_result = pd.DataFrame()
test_result['output'] = pipeline.predict(test.head(1790))
# test_result['predict_proba1'] = pipeline.predict_proba(test.head(1790))[:,0]
test_result['predict_proba2'] = pipeline.predict_proba(test.head(1790))[:, 1]
test_result['actual_result'] = test['labels'].head(1790)

from sklearn.metrics import classification_report

print(classification_report(test_result['actual_result'], test_result['output']))

#pipeline.predict()
#
# test_real = pd.read_csv('test_real3.csv',encoding='latin')
#
# test_results = pd.DataFrame(columns=['result','prob1','prob2'])
#