# -*- coding:utf-8 -*-
import os
import os.path as osp

import numpy as np

from cfxgb.lib.utils.log_utils import get_logger
from cfxgb.lib.utils.cache_utils import name2path

LOGGER = get_logger("cfxgb")


def check_dir(path):
    """Create the parent directory of `path` if it does not already exist."""
    d = osp.abspath(osp.join(path, osp.pardir))
    if not osp.exists(d):
        os.makedirs(d)


class BaseClassifierWrapper(object):
    def __init__(self, name, est_class, est_args):
        """
        Parameters
        ----------
        name (str):
            Used for debugging and as the filename under which this model
            may be saved to disk.
        est_class (class):
            Class of the wrapped estimator.
        est_args (dict):
            Keyword arguments passed to `est_class` on construction.
        """
        self.name = name
        self.est_class = est_class
        self.est_args = est_args
        self.cache_suffix = ".pkl"
        self.est = None

    def _init_estimator(self):
        """
        You can re-implement this function when inheriting from this class.
        """
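# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): a concrete wrapper
# might inherit from BaseClassifierWrapper and override _init_estimator to
# build its estimator. The XGBClassifierWrapper name and the override body
# below are assumptions for demonstration only.
# --------------------------------------------------------------------------
from xgboost import XGBClassifier


class XGBClassifierWrapper(BaseClassifierWrapper):
    def _init_estimator(self):
        # Build the wrapped estimator from the stored class and kwargs.
        return self.est_class(**self.est_args)


# Example usage: the wrapper's name doubles as the cache filename stem
# (combined with cache_suffix ".pkl").
wrapper = XGBClassifierWrapper(
    name="xgb_layer0_fold0",
    est_class=XGBClassifier,
    est_args={"n_estimators": 100, "max_depth": 4},
)
est = wrapper._init_estimator()  # an unfitted XGBClassifier instance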
def main(args):
    # Logging
    logger = get_logger("cfxgb")

    ############################################################
    # ARGUMENT CHECK
    ############################################################
    if args.Dataset is None:
        logger.error("Dataset required")
        sys.exit(1)
    if args.ParentCols < 0:
        logger.error("Enter a valid number of parent-node levels")
        sys.exit(1)
    if args.parameters is None:
        logger.error("Model parameters required")
        sys.exit(1)
    else:
        config = load_json(args.parameters)
        logger.info("Loaded JSON")
        logger.info("JSON ------------------------------------------------")
        json1 = json.dumps(config, indent=4, separators=(". ", " = "))
        logger.info(json1)
        logger.info("END OF JSON -----------------------------------------")

    ############################################################
    # DATASET
    ############################################################
    if not osp.exists(args.Dataset):
        full_path = osp.join('Datasets', args.Dataset + '.csv')
        if not osp.exists(full_path):
            logger.error("Enter a valid dataset")
            sys.exit(1)
    else:
        full_path = args.Dataset
    logger.info(args.Dataset + " used")

    data = pd.read_csv(full_path)
    if args.ignore:
        logger.info("First column ignored")
        data = data.iloc[:, 1:]
    logger.info("Data read complete")

    ############################################################
    # Extra Columns
    ############################################################
    if args.ParentCols:
        logger.info("{} level(s) of parent nodes will be added.".format(
            args.ParentCols))
".format( args.ParentCols)) else: logger.info("Parent nodes not considered") ################################################################################################################ ################################################################################################################ #Sample ################################################################################################################ if (args.sample): weights = data.groupby( data.columns[-1])[data.columns[-1]].transform('count') if (len(np.unique(weights)) == 1): logging.info("Equal weights already.") data = data.sample(n=args.sample, random_state=0) else: sum = np.sum(np.unique(weights)) weights = sum - weights data = data.sample(n=args.sample, weights=weights, random_state=0) logger.info("Distribution after sampling : \n{}".format( data.iloc[:, -1].value_counts())) ################################################################################################################ ################################################################################################################ # X,y ################################################################################################################ X = data.iloc[:, :-1] y = data.iloc[:, -1] ################################################################################################################ ################################################################################################################ #Feature Selection (Initial) ################################################################################################################ if (args.featureSelect): logger.info("Feature Selection - Initial") clf = XGBClassifier(n_estimators=100, learning_rate=0.3, max_depth=4, verbosity=0, random_state=0, n_jobs=-1) rfe = RFECV(clf, step=1, cv=5, verbose=0) X = rfe.fit_transform(X, y) ################################################################################################################ ################################################################################################################ #TRAIN TEST SPLIT ################################################################################################################ X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) #stratify = y logger.info("Train Test Split complete") ################################################################################################################ #$#$#$#$#$#$#$$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$$#$$$#$#$#$#$$#$#$#$$#$#$#$#$#$#$#$#$#$#$# #TRAINING #$#$#$#$#$#$#$$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$$#$$$#$#$#$#$$#$#$#$$#$#$#$#$#$#$#$#$#$#$# #$#$#$#$#$#$#$$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$$#$$$#$#$#$#$$#$#$#$$#$#$#$#$#$#$#$#$#$#$# #SAMPLING #$#$#$#$#$#$#$$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$$#$$$#$#$#$#$$#$#$#$$#$#$#$#$#$#$#$#$#$#$# if (args.RandomSamp): rus = RandomUnderSampler(random_state=0) X_train, y_train = rus.fit_resample(X_train, y_train) logger.info("Applied Random Under-Sampling") else: logger.info("No Random Under-Sampling") X_train = np.array(X_train) y_train = np.array(y_train) y_test = np.array(y_test) X_test = np.array(X_test) #$#$#$#$#$#$#$$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$$#$$$#$#$#$#$$#$#$#$$#$#$#$#$#$#$#$#$#$#$# #MODEL #$#$#$#$#$#$#$$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$$#$$$#$#$#$#$$#$#$#$$#$#$#$#$#$#$#$#$#$#$# #CFXGB cfxgb = CFXGB(config, args) 
    #$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$
    # CASCADED FOREST AS TRANSFORMER
    #$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$
    X_train_enc = cfxgb.get_encoded(X_train, y_train)
    X_test_enc = cfxgb.transform(X_test)

    # Final transformation
    X_train_enc, X_test_enc = cfxgb.finalTransform(X_train, X_train_enc,
                                                   X_test, X_test_enc)

    # X_train_enc = pd.DataFrame(X_train_enc)
    # X_train_enc.to_csv("X_train_enc.csv")
    # X_test_enc = pd.DataFrame(X_test_enc)
    # X_test_enc.to_csv("X_test_enc.csv")

    logger.info("X_train_enc.shape={}, X_test_enc.shape={}".format(
        X_train_enc.shape, X_test_enc.shape))

    #$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$
    # XGBOOST
    #$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$
    y_pred = cfxgb.classify(X_train_enc, y_train, X_test_enc, y_test)

    logger.info("Confusion Matrix -\n{}".format(
        confusion_matrix(y_test, y_pred)))
    logger.info("\nClassification Report -\n{}".format(
        classification_report(y_test, y_pred)))
    logger.info("Accuracy - {}\n".format(accuracy_score(y_test, y_pred)))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    auc = metrics.auc(fpr, tpr)
    logger.info("AUC - {}".format(auc))
    # `t` is the run start time, recorded before main() is invoked.
    logger.info("Time - {}".format(time.time() - t))
    logger.info("Arguments used in this run : {}".format(str(sys.argv)))
    logging.shutdown()
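# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): an argument parser
# main(args) could be driven by. Flag names mirror the attributes read above
# (Dataset, parameters, ParentCols, ignore, sample, featureSelect,
# RandomSamp); defaults and help text are assumptions.
# --------------------------------------------------------------------------
import argparse
import time


def parse_args():
    parser = argparse.ArgumentParser(description="Run CFXGB on a CSV dataset")
    parser.add_argument("--Dataset", help="CSV path, or a name under Datasets/")
    parser.add_argument("--parameters", help="Path to the model-config JSON")
    parser.add_argument("--ParentCols", type=int, default=0,
                        help="Levels of parent-node columns to add")
    parser.add_argument("--ignore", action="store_true",
                        help="Drop the first (index) column of the CSV")
    parser.add_argument("--sample", type=int, default=0,
                        help="Subsample the dataset to this many rows")
    parser.add_argument("--featureSelect", action="store_true",
                        help="Run RFECV feature selection before training")
    parser.add_argument("--RandomSamp", action="store_true",
                        help="Random under-sampling of the training set")
    return parser.parse_args()


if __name__ == "__main__":
    t = time.time()  # start time read by main() for the "Time - ..." log line
    main(parse_args())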
import numpy as np
from scipy.sparse import issparse

from cfxgb.lib.utils.log_utils import get_logger

LOGGER = get_logger('cfxgb')


def load_model_config(model_path, log_name=None):
    import json
    from .utils.config_utils import load_json

    config = load_json(model_path)
    if log_name is not None:
        logger = get_logger(log_name)
        logger.info(log_name)
        logger.info("\n" + json.dumps(config, sort_keys=True, indent=4,
                                      separators=(',', ':')))
    return config


def concat_datas(datas):
    """Flatten each array to 2-D (per sample) and concatenate along the feature axis."""
    if not isinstance(datas, list):
        return datas
    for i, data in enumerate(datas):
        datas[i] = data.reshape((data.shape[0], -1))
    return np.concatenate(datas, axis=1)


def data_norm(X_train, X_test):
    """Standardize train and test sets using the training set's statistics."""
    X_mean = np.mean(X_train, axis=0)
    X_std = np.std(X_train, axis=0)
    # Z-score normalization; the test set reuses the train mean/std to
    # avoid leakage.
    X_train = (X_train - X_mean) / X_std
    X_test = (X_test - X_mean) / X_std
    return X_train, X_test
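# --------------------------------------------------------------------------
# Illustrative usage (not part of the original source): concat_datas flattens
# each array to (n_samples, -1) before concatenating, so mixed trailing
# dimensions merge into one feature matrix. Note it reshapes the arrays
# inside the list it is given, so pass a throwaway list.
# --------------------------------------------------------------------------
if __name__ == "__main__":
    a = np.random.rand(5, 2)      # e.g. class probabilities from one forest
    b = np.random.rand(5, 3, 1)   # trailing dims are flattened per sample
    merged = concat_datas([a, b])
    print(merged.shape)           # (5, 5)

    # data_norm standardizes with train statistics only (as completed above).
    X_tr, X_te = data_norm(np.random.rand(10, 4), np.random.rand(3, 4))
    print(X_tr.mean(axis=0))      # ~0 per feature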
import os
import os.path as osp

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBClassifier

from cfxgb.lib.utils.log_utils import get_logger
from cfxgb.lib.utils.cache_utils import name2path

LOGGER = get_logger("gcforest.estimators.kfold_wrapper")


class KFoldWrapper(object):
    """
    K-Fold Wrapper
    """
    def __init__(self, name, n_folds, est_class, est_args, args,
                 random_state=None):
        """
        Parameters
        ----------
        n_folds (int):
            Number of folds. If n_folds=1, no K-Fold cross-validation is
            performed.
        est_class (class):
            Class of the estimator.
        args: