class XGB(BaseModel):
    def __init__(self):
        self.clf = XGBClassifier(
            n_estimators=200,
            max_depth=20,
            learning_rate=0.1,
            random_state=0,
            booster="gbtree",
            use_label_encoder=False,
        )

    def train(self, X_train, Y_train):
        X_train, Y_train = do_rebalance(X_train, Y_train)
        self.clf.fit(X_train, Y_train)

    def test(self, X_test, Y_test):
        Y_prob = self.clf.predict_proba(X_test)
        auc = metrics.roc_auc_score(Y_test, Y_prob[:, 1])
        return auc  # the original computed the AUC but never returned it

    def predict(self, X):
        Y_prob = self.clf.predict_proba(X)
        return Y_prob

    def load_model(self, model_path):
        self.clf.load_model(model_path)
        # with open(model_path, "rb+") as file:
        #     self.clf = pickle.load(file)

    def save_model(self, model_path):
        self.clf.save_model(model_path)
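The train method above assumes a do_rebalance helper that is not shown. A minimal sketch of what such a helper might look like, using random oversampling of the minority class (the name and the strategy are assumptions, not the original implementation):

import numpy as np

def do_rebalance(X, y):
    # Hypothetical helper: randomly oversample the minority class
    # until both classes have the same number of rows.
    X, y = np.asarray(X), np.asarray(y)
    classes, counts = np.unique(y, return_counts=True)
    minority = classes[np.argmin(counts)]
    deficit = counts.max() - counts.min()
    idx = np.random.choice(np.where(y == minority)[0], size=deficit, replace=True)
    return np.concatenate([X, X[idx]]), np.concatenate([y, y[idx]])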
class WrappedXGBClassifier(WrappedModel):
    def base_init_finished(self):
        self.reset()

    def fit(self, X, y):
        self._value.fit(X, y, **self._fit_kwargs)
        return self

    def reset(self):
        from xgboost import XGBClassifier
        self._value = XGBClassifier(**self._init_kwargs)

    def predict(self, X):
        return self._value.predict(X)

    def predict_proba(self, X):
        if self._pos_index is None:
            raise Exception('predict_proba needs pos_index')
        return self._value.predict_proba(X)[:, self._pos_index]

    def dump(self, dirpath, name):
        # was self.value, which does not exist; the attribute is self._value
        self._value.save_model(pathjoin(dirpath, name + '.bin'))
        return self

    def load(self, dirpath, name):
        self._value.load_model(pathjoin(dirpath, name + '.bin'))
        return self
class XGBoost_Ranker():
    def __init__(self, timestamp, load=True):
        self.model = XGBClassifier()
        self.model.load_model(timestamp + '.file')
        self.factor = 1.0

    def set_factor(self, factor):
        self.factor = factor

    def rank_features(self, features):
        # Scale selected feature columns by the configured factor.
        _features = np.copy(features)
        for f in _features:
            f[1] *= self.factor
            f[4] *= self.factor
            f[5] *= self.factor
        # return np.array([0, 1, 2, 3, 4])

        # Build pairwise inputs: one row for every ordered pair (i, j), i != j.
        test_x = []
        for i in range(len(_features)):
            for j in range(len(_features)):
                if i == j:
                    continue
                test_x.append(
                    np.concatenate((_features[i], _features[j]), axis=0))
        test_x = np.array(test_x)
        print(test_x.shape)
        # Each row of y holds the pairwise wins for one candidate.
        y = self.model.predict(test_x).reshape(len(_features),
                                               len(_features) - 1)
        y = np.sum(y, axis=1)
        # print(y)
        return np.argsort(y)[::-1]  # indices sorted by descending win count
class StabilityClassifier():
    def __init__(self, modelfile='spock.json'):
        pwd = os.path.dirname(__file__)
        self.model = XGBClassifier()
        self.model.load_model(pwd + '/models/' + modelfile)

    def check_errors(self, sim):
        if sim.N_real < 4:
            raise AttributeError("SPOCK Error: SPOCK only applicable to systems with 3 or more planets")

    def predict_stable(self, sim):
        triofeatures, stable = self.generate_features(sim)
        if not stable:
            return 0
        trioprobs = self.predict_from_features(triofeatures)
        return trioprobs.min()  # minimum probability among all trios tested

    def generate_features(self, sim):
        sim = sim.copy()
        init_sim_parameters(sim)
        self.check_errors(sim)
        trios = [[i, i+1, i+2] for i in range(1, sim.N_real - 2)]  # list of adjacent trios
        featureargs = [10000, 80, trios]
        triofeatures, stable = features(sim, featureargs)
        return triofeatures, stable

    def predict_from_features(self, triofeatures):
        # The XGBoost model expects a 2D array of shape (Npred, Nfeatures),
        # where Npred is the number of samples to predict and Nfeatures is
        # the number of features per sample.
        featurevals = np.array([[val for val in trio.values()]
                                for trio in triofeatures])
        # Take the 2nd column: probability of belonging to the stable class.
        return self.model.predict_proba(featurevals)[:, 1]
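A minimal usage sketch for the classifier above, assuming rebound is installed and the SPOCK model file is in place; the planet masses and periods are illustrative only:

import rebound

sim = rebound.Simulation()
sim.add(m=1.0)              # central star
sim.add(m=1e-5, P=1.0)      # three planets in a compact chain
sim.add(m=1e-5, P=1.3)
sim.add(m=1e-5, P=1.6)

clf = StabilityClassifier()
print(clf.predict_stable(sim))  # estimated probability of stability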
def load_modele(path):
    '''Return the model as an object from the given path.'''
    if 'GradientBoosting' in str(path):
        # print('Loading XGBoost')
        model = XGBClassifier()
        model.load_model(path)
        return model
    else:
        # print('Loading via pickle')
        return pickle.load(open(path, 'rb'))
def export_model(amnt_data, client):
    model = XGBClassifier()
    model.load_model('boa.model')
    with open('model.pb', 'wb') as output:
        pickle.dump([model, amnt_data], output)
    bucket_test = client.get_bucket('traina-data')
    blob_test = bucket_test.blob('model.pb')
    blob_test.upload_from_filename(filename='model.pb')
    os.remove('model.pb')
    print(Fore.GREEN + 'Exported Model Successfully')
    return
def get_model():
    param_path = os.path.join(STORAGE, "params.json")
    with open(param_path, "r") as f:
        json_data = f.read()
    params = dict(json.loads(json_data))
    model = XGBClassifier(**params)
    model_path = os.path.join(STORAGE, "model.xgb")
    model.load_model(model_path)
    return model
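get_model assumes a params.json and a model.xgb were written under STORAGE at training time. A sketch of the matching save step (the save_model helper name is hypothetical; the layout is taken from the loader above):

import json
import os

def save_model(model, params):
    # Persist hyperparameters and booster side by side so get_model()
    # can reconstruct the classifier later.
    with open(os.path.join(STORAGE, "params.json"), "w") as f:
        json.dump(params, f)
    model.save_model(os.path.join(STORAGE, "model.xgb"))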
class ProcessPlugin(WorkerPlugin):
    def __init__(self,
                 cfg_path=os.environ.get("MODEL_CONFIG"),
                 weights_path=os.environ.get("PP_WEIGHTS_PTH"),
                 classes_path=os.environ.get("CLASSES_PTH")):
        self.cfg_path = cfg_path
        postprocess_weights_pth = weights_path
        self.postprocess_model = XGBClassifier()
        self.postprocess_model.load_model(postprocess_weights_pth)
        self.classes_pth = classes_path
        with open(self.classes_pth) as stream:
            # safe_load avoids the unsafe default loader and the deprecation
            # warning from calling yaml.load without an explicit Loader.
            self.classes = yaml.safe_load(stream)["CLASSES"]
def train_lazy():
    # Load the dataset
    X, y = load_data()
    # Split the data
    X_train, X_val, y_train, y_val = split_dataset(X, y)
    # Normalize
    X_train = normalize(X_train)
    X_val = normalize(X_val)

    # Uncomment to check the performance of the 25 LazyPredict models:
    # clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    # scores, _ = clf.fit(X_train, X_val, y_train, y_val)
    # print(scores)

    # Final model: reuse a saved model if one exists
    if os.path.isfile(config.MODEL_PATH):
        model = XGBClassifier()
        model.load_model(config.MODEL_PATH)
    else:
        model = XGBClassifier()
        model.fit(X_train, y_train,
                  eval_metric="error",
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  verbose=True)
        # save model
        model.save_model(config.MODEL_PATH)

    # Performance on the train set
    y_pred = model.predict(X_train)
    print_performance(y_train, y_pred, 'train')

    # Performance on the validation set
    y_pred = model.predict(X_val)
    print_performance(y_val, y_pred, 'val')

    # Load, normalize and evaluate the test set
    X_test, y_test = load_test_data()
    X_test = normalize(X_test)
    y_pred = model.predict(X_test)
    print_performance(y_test, y_pred, 'test')

    plot_performance(model)
def predict_probability_of_winning(gold_diff_at_10, exp_diff_at_10, team):
    dirname = os.path.dirname(__file__)
    model = XGBClassifier()
    model.load_model(os.path.join(dirname, f'./models/{team}_model.json'))
    # Values are wrapped in lists because the DataFrame constructor
    # doesn't allow scalars.
    df = pd.DataFrame({
        team + 'GoldDiff': [gold_diff_at_10],
        team + 'ExperienceDiff': [exp_diff_at_10]
    })
    predicts = model.predict_proba(df)
    for i, col in enumerate(['redWin', 'blueWin']):
        df[col] = predicts[:, i]
    return df
def generate_shap_html(feature, user_bin, user_id):
    xgb_clf = XGBClassifier()
    xgb_clf.load_model(os.path.join(MODEL_DIRECTORY, "xgb.model"))
    explainer = shap.TreeExplainer(xgb_clf)
    values = explainer.shap_values(feature)
    shap.initjs()
    fp = shap.force_plot(explainer.expected_value[user_bin - 1],
                         values[user_bin - 1][0],
                         feature,
                         show=False)
    html_path = os.path.join(MODEL_DIRECTORY, f"User_{user_id}.html")
    shap.save_html(html_path, fp)
    with open(html_path, "r", encoding='utf-8') as f:
        html = f.read()
    os.remove(html_path)
    return str(html), values
def load_model_and_generate_evaluation_images(*, model_filename, input_path,
                                              output_path, feature_names):
    model = XGBClassifier()
    model.load_model(model_filename)
    frame_folders = sorted(get_frame_folders(input_path))
    for frame_folder in frame_folders:
        frame_path = os.path.join(input_path, frame_folder)
        segment_names = [
            name for name in os.listdir(frame_path) if name[1].isdigit()
        ]
        if len(segment_names) != 0:
            continue  # skip frames that already have segment files
        for camera_name in ["60", "180", "300"]:
            image_name = "camera" + camera_name + ".png"
            print(frame_path + "/" + image_name)
            image_bgr = cv.imread(os.path.join(frame_path, image_name))
            features, shape = create_features(image_bgr=image_bgr, flatten=True)
            X = pd.DataFrame(features)[feature_names]
            y = model.predict(X)
            segments = y.reshape(shape)
            segments_bgr = [class2bgr(idx) for idx in segments.flatten()]
            segments_bgr = np.array(segments_bgr).reshape(*shape, 3)
            path = os.path.join(output_path, frame_folder)
            if not os.path.exists(path):
                os.makedirs(path)
            image_and_segments_bgr = np.concatenate([image_bgr, segments_bgr],
                                                    axis=1)
            # cv.imwrite(filename=os.path.join(path, image_name),
            #            img=image_bgr)
            segments_filename = "camera" + camera_name + "_segments" + ".png"
            cv.imwrite(
                filename=os.path.join(path, segments_filename),
                img=image_and_segments_bgr,
            )
def load_model_and_generate_evaluation_images(
    model_filename,
    input_path: pathlib.Path,
    output_path: pathlib.Path,
    feature_names,
):
    model = XGBClassifier()
    model.load_model(model_filename)
    for frame_folder in sorted(get_subdirectories(input_path)):
        segment_names = [
            f.name for f in frame_folder.iterdir()
            if f.is_file() and f.name[1].isdigit()
        ]
        if len(segment_names) != 0:
            continue  # skip frames that already have segment files
        for camera_name in ["60", "180", "300"]:
            image_name = "camera" + camera_name + ".png"
            print(frame_folder / image_name)
            image_bgr = cv.imread(str(frame_folder / image_name))
            features, shape = create_features(image_bgr=image_bgr, flatten=True)
            X = pd.DataFrame(features)[feature_names]
            y = model.predict(X)
            segments = y.reshape(shape)
            segments_bgr = [class2bgr(idx) for idx in segments.flatten()]
            segments_bgr = np.array(segments_bgr).reshape(*shape, 3)
            path = output_path / frame_folder.name
            if not path.exists():
                path.mkdir(parents=True)
            image_and_segments_bgr = np.concatenate([image_bgr, segments_bgr],
                                                    axis=1)
            segments_filename = "camera" + camera_name + "_segments" + ".png"
            cv.imwrite(
                filename=str(path / segments_filename),
                img=image_and_segments_bgr,
            )
def predict(inputs):
    main_category, category, goal, country, currency, today = inputs

    # Encode categorical inputs
    encoder = load(ENCODER_PATH)
    inputs2enc = np.array([category, main_category, currency,
                           country]).reshape(1, -1)
    inputs_encoded = encoder.transform(inputs2enc)

    # Stack numericals and encoded categoricals
    numericals = np.array([goal, today.day, today.month]).reshape(1, -1)
    final_inputs = np.hstack([numericals, inputs_encoded]).astype(np.float32)

    # Load model and predict
    model = XGBClassifier(seed=42)
    model.load_model(MODEL_PATH)
    result = model.predict_proba(final_inputs)
    return result
def predict(age, count, diagnosis, fpath='static/model/HFEA_model_{}', nmodels=5):
    """
    Loads and predicts from the models.

    Parameters:
    -----------
    age : int,
        Age of the patient in years.
    count : int,
        The number of Oocytes (eggs) collected following the treatment.
    diagnosis : str,
        The patient's infertility diagnosis, must be one of Ovulatory
        disorder, Male factor, Endometriosis or Unexplained.
    fpath : str (default='static/model/HFEA_model_{}'),
        Path template for the models.
    nmodels : int (default=5),
        The number of models (i.e., the number of folds used in the
        cross-validation procedure during training).

    Returns:
    --------
    pred : float,
        The predicted probability, averaged over the nmodels fold models.
    """
    age_group = map_age_to_age_group(age)
    # add the four infertility diagnosis features
    infertility = create_infertility_feature(diagnosis)
    X = np.r_[[age_group, count], infertility]
    pred = 0
    # The original hardcoded range(5) and the model path, ignoring the
    # nmodels and fpath parameters.
    for i in range(nmodels):
        clf = XGBClassifier()
        clf.load_model(fpath.format(i))
        pred += clf.predict_proba(X.reshape((1, -1)))[:, 1][0] / nmodels
    return pred
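A usage sketch for the averaged predictor above, assuming the five fold models exist under static/model/; the argument values are illustrative:

# Average the predicted probability across the cross-validation models
# for a 34-year-old patient with 8 oocytes collected.
p = predict(age=34, count=8, diagnosis='Unexplained')
print(f"Predicted probability: {p:.3f}")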
def predict(name, match_analysis_num, api_key):
    # Load model and data
    ss = joblib.load("model/standard_scaler.pkl")
    xgb = XGBClassifier()
    xgb.load_model("model/LOL_predict_xgb.bst")
    match_df, player_stat, game_minute, win_label = datapipe.collect_predict_data_by_name(
        name, match_analysis_num, api_key)
    del player_stat
    gc.collect()  # was gc.collect without parentheses, which never ran
    if win_label[0] == -1:
        return -1, -1
    elif win_label[0] == -404:
        return -404, -404
    else:
        # Predict the win rate. Use transform with the already-fitted scaler;
        # the original called fit_transform, which refits on the new data.
        match_scaled = ss.transform(match_df)
        win_rate = xgb.predict_proba(match_scaled)
        real_win_rate = win_label.mean()
        predict_win_rate = win_rate[:, 1].mean()
        return real_win_rate, predict_win_rate
def predict_xgb(data):
    """Perform prediction using trained model."""
    model = XGBClassifier()
    data = normalizator(prepare_data(data))
    try:
        model.load_model(
            "cotopaxi/identification_models/proto_XGB_20201112.model")
    except ValueError as exc:
        # The original wrote `raise CotopaxiException from exc(...)`, which
        # calls the caught exception instead of chaining it.
        raise CotopaxiException(
            "[!] Cannot load machine learning classifier!"
            " This may be caused by incompatible version of tensorflow"
            " (please install tensorflow version 2.2.0)!") from exc
    result = model.predict(data)
    unique, counts = numpy.unique(result, return_counts=True)
    devices = list(unique)
    result_dict = dict(zip(devices, counts))
    result_dict = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
    result_class = result_dict[0][0]
    return result_class, result_dict, counts.sum()
def run(self):
    model_folder_path = '../model'
    # Note: the file carries a .pkl extension, but XGBClassifier.load_model
    # expects the native XGBoost format, not a pickle.
    model_path = os.path.join(model_folder_path, 'xgb_final.pkl')
    trained_model = XGBClassifier()
    trained_model.load_model(model_path)
    # read the processed features file
    df_test = pd.read_csv(self.input().path)
    # predict churn
    prediction = trained_model.predict(df_test)
    # put the prediction in a dataframe along with index ids
    # (this is the submission format)
    submission = pd.DataFrame(data=prediction, columns=['churn'])
    submission['churn'] = submission['churn'].map({1: 'yes', 0: 'no'})
    submission.reset_index(inplace=True)
    submission.rename(columns={'index': 'id'}, inplace=True)
    submission['id'] = submission['id'] + 1
    # write submission to file
    submission.to_csv(self.output().path, index=False)
def stroke_predict(gender, age, hypertension, heart_disease, ever_married,
                   work_type, Residence_type, avg_glucose_level, bmi,
                   smoking_status):
    # Build a one-row frame, replicating the encoding used at training time.
    # The categorical inputs arrive as strings and are mapped to integer
    # codes below, so they must not be int-cast here (the original cast
    # everything to int first, which made the later .map() calls produce NaN
    # and the string comparisons always false).
    data = pd.DataFrame({
        'gender': [1 if gender == 'male' else 0],
        'age': [int(age)],
        'hypertension': [int(hypertension)],
        'heart_disease': [int(heart_disease)],
        'ever_married': [1 if ever_married == 'Yes' else 0],
        'work_type': [work_type],
        'Residence_type': [1 if Residence_type == 'Urban' else 0],
        'avg_glucose_level': [float(avg_glucose_level)],
        'bmi': [float(bmi)],
        'smoking_status': [smoking_status],
    })
    work_mapping = {
        'Self_employed': 3,
        'Private': 2,
        'children': 1,
        'Govt_job': 0
    }
    data['work_type'] = data['work_type'].map(work_mapping)
    smoke_mapping = {
        'Unknown': 0,
        'formerly smoked': 1,
        'never_smoked': 2,
        'smokes': 3
    }
    data['smoking_status'] = data['smoking_status'].map(smoke_mapping)

    # after data has been replicated
    xgb = XGBClassifier()
    xgb.load_model("weights/stroke.model")
    return xgb.predict(data)[0]
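A usage sketch, under the assumption that the categorical arguments arrive as the raw strings mapped inside the function; the values are illustrative:

result = stroke_predict(
    gender='male', age=67, hypertension=0, heart_disease=1,
    ever_married='Yes', work_type='Private', Residence_type='Urban',
    avg_glucose_level=228.69, bmi=36.6, smoking_status='formerly smoked')
print(result)  # model's predicted class label for this patient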
def start_pre(val_img_list, val_tar_list, type_class=minor_type_class):
    real_class_pair_list = cut_class_pair
    model_base_path = 'outs/'
    result_list = [list() for i in range(len(val_img_list))]
    config = Config()
    result_max_item_list = [(0, 0) for i in range(len(val_img_list))]
    # Run one binary model per class and collect per-sample votes.
    for ci, class_pair in enumerate(type_class):
        model_path = (model_base_path + 'xgboost_model_per_class'
                      + str(class_pair) + '.pkl')
        print('part ', ci, ' of ', len(real_class_pair_list))
        clr = XGBClassifier()
        clr.load_model(model_path)
        y_p_x = clr.predict_proba(val_img_list)
        pre_for_f1 = []
        t_for_f1 = []
        for i_ys, ys in enumerate(y_p_x):
            if len(val_tar_list) > 0:
                tail = ''
                mid = ''
                if class_pair in val_tar_list[i_ys]:
                    tail = '-----------'
                if ys[1] >= 0.5:
                    mid = '||||||||'
                print('ci ', ci, ' i_ys ', i_ys, ' pre ', ys, mid, ' c ',
                      class_pair, ' t ', val_tar_list[i_ys], tail)
            else:
                print('ci ', ci, ' i_ys ', i_ys, ' pre ', ys, ' c ', class_pair)
            sub_result = result_list[i_ys]
            if ys[1] >= 0.5:
                sub_result.append(class_pair)
                pre_for_f1.append(1)
            else:
                pre_for_f1.append(0)
            result_list[i_ys] = sub_result
            max_item_idx, max_item_f = result_max_item_list[i_ys]
            if ys[1] > max_item_f:
                result_max_item_list[i_ys] = (class_pair, ys[1])
        if len(val_tar_list) > 0:
            for tar in val_tar_list:
                if class_pair in tar:
                    t_for_f1.append(1)
                else:
                    t_for_f1.append(0)
            print('c ', class_pair, '---------f1 ',
                  f1_score(t_for_f1, pre_for_f1, average="macro"))
        # print('sub ', ci, ' r:', sub_result)

    # Aggregate per-class votes into final multi-label predictions.
    pre_list = []
    for this_sub_i, sub_result in enumerate(result_list):
        print('this_sub_i ', this_sub_i, ' sub_result ', sub_result)
        result_i = np.zeros(28)
        for i_s, s in enumerate(sub_result):
            result_i[s] += 1
        # print('result_i ', result_i)
        result = []
        for i, r_i in enumerate(result_i):
            if r_i == 1 and (i in type_class):
                # print('i ', i, ' r_i ', r_i)
                result.append(i)
        if len(val_tar_list) > 0:
            print('pre ', result, ' t ', val_tar_list[this_sub_i])
        pre_list.append(result)
    return pre_list, result_max_item_list
class FeatureClassifier():
    def __init__(self, modelfile='featureclassifier.json'):
        pwd = os.path.dirname(__file__)
        self.model = XGBClassifier()
        self.model.load_model(pwd + '/models/' + modelfile)

    def check_errors(self, sim):
        if sim.N_real < 4:
            raise AttributeError("SPOCK Error: SPOCK only applicable to systems with 3 or more planets")

    def predict_stable(self, sim, n_jobs=-1):
        """
        Predict whether passed simulation will be stable over 10^9 orbits of the innermost planet.

        Parameters:

        sim (rebound.Simulation):   Orbital configuration to test
        n_jobs (int):               Number of cores to use for calculation (only if passing more
                                    than one simulation). Default: Use all available cores.

        Returns:

        float:  Estimated probability of stability. Will return exactly zero if configuration
                goes unstable within first 10^4 orbits.
        """
        res = self.generate_features(sim, n_jobs=n_jobs)
        try:  # many sims: res is a list of (features, stable) pairs
            stable = np.array([r[1] for r in res])
            features = [r[0] for r in res]
            Nsims = len(sim)
        except:  # single sim: len(sim) raises and we unpack directly
            stable = np.array([res[1]])
            features = [res[0]]
            Nsims = 1

        # We take the negligible hit of evaluating XGBoost for all systems, and
        # overwrite prob=0 at the end for ones that went unstable in the short
        # integration. Array of Nsims*Ntriospersim x 10 features for XGBoost.
        featurevals = np.array([[val for val in trio.values()]
                                for system in features for trio in system])
        # Take 2nd column: probability it belongs to the stable class.
        probs = self.model.predict_proba(featurevals)[:, 1]
        # XGBoost evaluated a flattened list of all trios; reshape so that
        # trios in the same sim are grouped.
        trios_per_sim = int(len(probs) / Nsims)
        probs = probs.reshape((Nsims, trios_per_sim))
        # Take the minimum probability of stability within the trios for each simulation.
        probs = np.min(probs, axis=1)
        # Set probabilities for systems that went unstable within the short
        # integration to exactly zero. (The original indexed with ~stable,
        # which misbehaves when stable is an integer array.)
        probs[stable == 0] = 0

        if Nsims == 1:
            return probs[0]
        else:
            return probs

    def generate_features(self, sim, n_jobs=-1):
        """
        Generates the set of summary features used by the feature classifier for prediction.

        Parameters:

        sim (rebound.Simulation):   Orbital configuration to test
        n_jobs (int):               Number of cores to use for calculation (only if passing more
                                    than one simulation). Default: Use all available cores.

        Returns:

        List of OrderedDicts:   A list of sets of features for each adjacent trio of planets
                                in the system. Each set of features is an ordered dictionary
                                of 10 summary features. See paper.
        stable (int):           1 if the N-body integration survived the 10^4 orbits,
                                0 if it went unstable.
        """
        if isinstance(sim, rebound.Simulation):
            sim = [sim]
        args = []
        if len(set([s.N_real for s in sim])) != 1:
            raise ValueError("If running over many sims at once, they must have the same number of particles!")
        for s in sim:
            s = s.copy()
            init_sim_parameters(s)
            minP = np.min([p.P for p in s.particles[1:s.N_real]])
            self.check_errors(s)
            trios = [[j, j+1, j+2] for j in range(1, s.N_real - 2)]  # list of adjacent trios
            featureargs = [10000, 80, trios]
            args.append([s, featureargs])

        def run(params):
            sim, featureargs = params
            triofeatures, stable = features(sim, featureargs)
            return triofeatures, stable

        if len(args) == 1:  # single sim
            res = run(args[0])  # stable will be 0 if an orbit is hyperbolic
        else:
            if n_jobs == -1:
                n_jobs = cpu_count()
            # pool = ThreadPool(n_jobs)
            res = map(run, args)
        return list(res)
    'model__colsample_bylevel': (0.01, 1.0, 'uniform'),
    'model__learning_rate': (0.01, 1.0, 'log-uniform'),
    'model__n_estimators': Integer(60, 400),
    'model__max_depth': Integer(3, 12),
    # 'model__scale_pos_weight': Real(1, 1000, 'log-uniform'),  # binary only
    'model__min_child_weight': Integer(1, 15),
    'model__gamma': Real(0.1, 3),
    'model__alpha': Real(0, 1),
    'model__lambda': Real(0, 1),
    'model__subsample': Real(0.3, 1),
    'model__colsample_bytree': Real(0, 1),
    'model__colsample_bynode': Real(0, 1)
}

xg = XGBClassifier()
xg.load_model('XGBoost_model.json')
xgb_search_prev = {
    'model': [xg],
    # 'model__learning_rate': (0.01, 1.0, 'log-uniform'),
    # 'model__min_child_weight': (0, 10),
    # 'model__max_delta_step': Integer(0, 20),
    # 'model__colsample_bytree': (0.01, 1.0, 'uniform'),
    # 'model__colsample_bylevel': (0.01, 1.0, 'uniform'),
    # 'model__n_estimators': Integer(100, 200),
    # 'model__scale_pos_weight': Real(1, 1000, 'log-uniform'),
    # 'model__min_child_weight': Integer(1, 10),
    # 'model__gamma': Integer(1, 5),
    # 'model__subsample': Real(0.3, 1),
    # 'model__colsample_bytree': Real(0.1, 1),
    # 'model__max_depth': Integer(6, 12)
}
RED = '\u001b[31m'
GREEN = '\u001b[32m'
BLUE = '\u001b[34m'
RESET = '\033[0m'

xgb_mod = XGBClassifier(booster='dart',
                        tree_method="gpu_hist",
                        n_estimators=300,
                        learning_rate=0.05,
                        predictor='gpu_predictor',
                        eval_metric='logloss',
                        max_depth=3,
                        gpu_id=0)
xgb_mod.load_model('CVD_mod')

cvd_df = pd.read_csv('cardio_train.csv', sep=';', index_col=0)
cvd_df['age'] = cvd_df['age'] / 365.24  # convert age from days to years
cvd_df['gender'] = cvd_df['gender'] - 1
# Drop physiologically implausible blood-pressure readings.
cvd_df = cvd_df[(cvd_df['ap_lo'] <= 370) & (cvd_df['ap_lo'] > 0)]
cvd_df = cvd_df[(cvd_df['ap_hi'] <= 370) & (cvd_df['ap_hi'] > 0)]
cvd_df = cvd_df[cvd_df['ap_hi'] >= cvd_df['ap_lo']]
cvd_df.reset_index(drop=True, inplace=True)

# The split call was truncated in the original; the test_size and
# random_state used below are assumptions.
X_train, X_test, y_train, y_test = train_test_split(
    cvd_df.drop(['cardio'], axis=1),
    cvd_df['cardio'],
    test_size=0.2,
    random_state=0)
class XGBoost(BaseAlgorithm):
    def __init__(self, algorithm_settings, problem_type):
        super().__init__(algorithm_settings)
        self.problem_type = problem_type

    def build(self):
        if self.problem_type == SupervisedTask.regression:
            self.build_regression_model()
        elif self.problem_type == SupervisedTask.classification:
            self.build_classification_model()
        else:
            raise TypeError('Unknown problem_type')

    def build_regression_model(self):
        from xgboost import XGBRegressor
        self.model = XGBRegressor(
            max_depth=self.algorithm_settings.max_depth,
            learning_rate=self.algorithm_settings.learning_rate,
            n_estimators=self.algorithm_settings.n_estimators,
            objective=self.algorithm_settings.objective,
            booster=self.algorithm_settings.booster,
            n_jobs=self.algorithm_settings.n_jobs,
            gamma=self.algorithm_settings.gamma,
            min_child_weight=self.algorithm_settings.min_child_weight,
            max_delta_step=self.algorithm_settings.max_delta_step,
            subsample=self.algorithm_settings.subsample,
            reg_alpha=self.algorithm_settings.reg_alpha,
            reg_lambda=self.algorithm_settings.reg_lambda,
            random_state=self.algorithm_settings.random_state)

    def build_classification_model(self):
        from xgboost import XGBClassifier
        self.model = XGBClassifier(
            max_depth=self.algorithm_settings.max_depth,
            learning_rate=self.algorithm_settings.learning_rate,
            n_estimators=self.algorithm_settings.n_estimators,
            objective=self.algorithm_settings.objective,
            booster=self.algorithm_settings.booster,
            n_jobs=self.algorithm_settings.n_jobs,
            gamma=self.algorithm_settings.gamma,
            min_child_weight=self.algorithm_settings.min_child_weight,
            max_delta_step=self.algorithm_settings.max_delta_step,
            subsample=self.algorithm_settings.subsample,
            reg_alpha=self.algorithm_settings.reg_alpha,
            reg_lambda=self.algorithm_settings.reg_lambda,
            random_state=self.algorithm_settings.random_state)

    def train(self, train_x, train_y, settings):
        self.model.fit(train_x, train_y,
                       eval_metric=self.algorithm_settings.eval_metric)
        self.save(settings)

    def evaluate(self, test_x):
        prediction = self.model.predict(test_x)
        prediction = prediction.reshape(-1, 1)
        return prediction

    def load(self, model_path):
        self.model.load_model(fname=model_path)

    def save(self, settings):
        model_save_dir = os.path.join(settings.models_path, 'xgboost_models')
        os.makedirs(model_save_dir, exist_ok=True)
        model_name = self.get_model_name(settings)
        save_path = os.path.join(model_save_dir, model_name)
        self.model.save_model(fname=save_path)
        print(f"Model saved to: {save_path}")

    def get_model_name(self, settings):
        if settings.problem_type == SupervisedTask.regression:
            return 'regression_model.xgb'
        else:
            return 'classification_model.xgb'
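A sketch of how the wrapper above might be driven end to end, assuming algorithm_settings and settings objects that carry the attributes referenced in the class; the variable names are illustrative:

algo = XGBoost(algorithm_settings, SupervisedTask.classification)
algo.build()                            # constructs the underlying XGBClassifier
algo.train(train_x, train_y, settings)  # fits, then saves to settings.models_path
preds = algo.evaluate(test_x)           # (N, 1) array of class predictions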
def get_model():
    xgboost_quora_model = BASE_URL + "/xgboost_xcfl_quora_model.model"
    x_cfl = XGBClassifier()
    x_cfl.load_model(xgboost_quora_model)
    return x_cfl
# Supported eval metrics include rmse, mae, logloss, error (error = 1 - accuracy)
# and auc (a companion metric to accuracy).
results = model.evals_result()
# print("eval's results : ", results)
# print("r2 Score : %.2f%%:" % (r2 * 100.0))

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)  # (y_true, y_pred) argument order
print("acc : ", acc)

#####################################################################################################
# import pickle  # provided by the Python standard library
# from joblib import dump, load
# import joblib
# pickle.dump(model, open("./model/xgb_save/cancer.pickle.dat", "wb"))  # save in binary (wb) mode
# joblib.dump(model, "./model/xgb_save/cancer.joblib.dat")
model.save_model("./model/xgb_save/cancer.xgb.model")
print("Saved.")

# model2 = pickle.load(open("./model/xgb_save/cancer.pickle.dat", "rb"))
# model2 = joblib.load("./model/xgb_save/cancer.joblib.dat")
model2 = XGBClassifier()
model2.load_model("./model/xgb_save/cancer.xgb.model")
print("Loaded.")

y_pred = model2.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)
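The snippet above retrieves evals_result() but only prints it. A small sketch of plotting the recorded learning curve, assuming the model was fit with a two-set eval_set and eval_metric='logloss' (the 'validation_0'/'validation_1' keys are the names xgboost assigns to the eval sets):

import matplotlib.pyplot as plt

results = model.evals_result()
# evals_result() maps eval-set names to per-round metric histories.
epochs = range(len(results['validation_0']['logloss']))
plt.plot(epochs, results['validation_0']['logloss'], label='train')
plt.plot(epochs, results['validation_1']['logloss'], label='test')
plt.xlabel('boosting round')
plt.ylabel('logloss')
plt.legend()
plt.show()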
'''Provides recommendations for portfolio action, backtests, and
incrementally trains the given XGBoost Classifier Model.'''

from xgboost import XGBClassifier
import cpdb
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

btc_model = XGBClassifier()
eth_model = XGBClassifier()
# load_model mutates the classifier in place and returns None, so its
# result must not be assigned back (the original did, leaving both
# variables bound to None).
btc_model.load_model('btc_model.bin')
eth_model.load_model('eth_model.bin')

model = {
    'btc': btc_model,
    'eth': eth_model,
}


def get_recommendation(coinName, features):
    '''Delivers a recommendation based on model classification.'''
    coin_model = model.get(coinName)
    recommendation = coin_model.predict(features)
    return recommendation
def is_spam(data, mode=2, classifier='manual'):
    if classifier == 'manual':
        message_body = data
        if mode != 2:
            message_body = get_email(data, mode)
        clean_message = clean_message_no_html(
            message_body, stop_words=set(stopwords.words('english')))
        word_columns_df = pd.DataFrame.from_records([clean_message])
        word_columns_df.index.name = 'DOC_ID'
        word_index = pd.Index(vocab.VOCAB_WORD)
        sparse_matrix = make_sparse_matrix(word_columns_df, word_index).groupby(
            ['DOC_ID', 'WORD_ID']).sum().reset_index().to_numpy()
        full_matrix = make_full_matrix(sparse_matrix, vocab.shape[0]).to_numpy()

        spam_email_prob = PROB_SPAM
        ham_email_prob = 1 - PROB_SPAM
        # Track the last nonzero probabilities so we can back off when the
        # running products underflow to zero. (The original referenced
        # prev_spam/prev_ham before any assignment on the first iteration.)
        prev_spam = spam_email_prob
        prev_ham = ham_email_prob
        # denominator = 1
        for j in range(full_matrix.shape[1]):
            if full_matrix[0, j] > 0:
                if prob_token_spam[j] > 0:
                    spam_email_prob = spam_email_prob * \
                        (prob_token_spam[j] ** full_matrix[0, j])
                    if spam_email_prob == 0:
                        spam_email_prob = prev_spam
                        ham_email_prob = prev_ham
                        break
                if prob_token_ham[j] > 0:
                    ham_email_prob = ham_email_prob * \
                        (prob_token_ham[j] ** full_matrix[0, j])
                    if ham_email_prob == 0:
                        spam_email_prob = prev_spam
                        ham_email_prob = prev_ham
                        break
                prev_spam = spam_email_prob
                prev_ham = ham_email_prob
                # denominator = denominator * prob_all_tokens[j]
        # print(spam_email_prob / denominator > ham_email_prob / denominator)
        print(spam_email_prob > ham_email_prob)
        # joint_log_spam = full_matrix.dot(
        #     np.log(prob_token_spam + 0.000000000000001)
        #     - np.log(prob_all_tokens + 0.000000000000001)) + np.log(PROB_SPAM)
        # print(joint_log_spam)
        # joint_log_ham = full_matrix.dot(
        #     np.log(prob_token_ham + 0.000000000000001)
        #     - np.log(prob_all_tokens + 0.000000000000001)) + np.log(1 - PROB_SPAM)
        # print(joint_log_ham)
    elif classifier == 'xgb':
        xgb_classifier = XGBClassifier()
        xgb_classifier.load_model('./XGB.model')
        data_list = []
        data_list.append(data)
        doc_term_matrix = vectorizer.transform(data_list)
        print(xgb_classifier.predict(doc_term_matrix)[0] == 1)
class Classifier:
    # For initializing train and test sets, classifier and accuracy score.
    # Change tree_method to gpu_hist if you want xgboost to run on a GPU.
    def __init__(self, params={
            'objective': 'reg:squarederror',
            'verbosity': 0
    }):
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBClassifier(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        if isinstance(self.X_train, np.ndarray):
            return self.X_train.size
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        if isinstance(self.X_train, np.ndarray) and self.X_train.size > 0:
            self.X_train = self.X_train.tolist()
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train, y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()
        if metric == 'error':
            validations = []
            for val in evals_result.values():
                lst = val.get("error")
                validations.append(sum(lst) / len(lst))
            return 1 - (sum(validations) / len(validations))
        else:
            validations = []
            for val in evals_result.values():
                lst = val.get(metric)
                validations.append(lst[-1])
            return validations

    # input test labels if you want to check accuracy
    def label(self, label):
        self.test_labels.append(label)

    def input_test(self, features):
        if isinstance(self.test, np.ndarray) and self.test.size > 0:
            self.test = self.test.tolist()
        self.test.append(features)

    # test data
    def predict(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    def predict_proba(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict_proba(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate (you want error close to 0)
    def check_error(self):
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        self.test = []
        self.test_labels = []
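A quick usage sketch of the wrapper above on a toy dataset; the feature vectors and labels are illustrative:

clf = Classifier()
clf.input_train([0.2, 1.5], 0)   # (features, label) pairs
clf.input_train([1.7, 0.3], 1)
clf.input_train([0.1, 1.2], 0)
clf.input_train([1.9, 0.4], 1)
clf.train()

clf.input_test([0.3, 1.4])
clf.label(0)
print(clf.predict())        # predicted labels for the queued test points
print(clf.check_error())    # mean absolute error against the supplied labels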
def predict_sent(vecs, xgb_model_analyze):
    xgb = XGBC()
    xgb.load_model(xgb_model_analyze)
    # w2v_model is assumed to be a module-level global here.
    pred = predict(vecs, w2v_model, xgb, 300)
    df = pd.DataFrame(pred, columns=['sent'])
    return df