def train():
    """Roll out a saved CatBoost policy in ``MyEnv`` and print episode scores.

    The classifier stored in ``catboost_model.model`` is queried for class
    probabilities at every step. Depending on the ``deterministic`` flag
    (resolved outside this function) the action is either the arg-max class
    passed through ``action_mapping`` or a sample drawn from the fixed action
    list ``[0, 1, 3, 4, 5]`` with those probabilities.
    """
    env = my_env.MyEnv(0, realtime_mode=True)
    policy = CatBoostClassifier()
    policy.load_model("catboost_model.model")

    report_every = 1
    running_score = 0.0

    for episode in range(10000):
        state = env.reset()
        finished = False
        while not finished:
            probabilities = policy.predict(state, prediction_type="Probability")
            if deterministic:
                best_class = int(np.argmax(probabilities))
                action = action_mapping(best_class)
            else:
                action = int(np.random.choice([0, 1, 3, 4, 5], p=probabilities))

            # The loop condition ends the episode once the env reports done.
            state, reward, finished, _info = env.step(action)
            running_score += reward

        # Episode 0 is never reported; afterwards report every `report_every`.
        if episode != 0 and episode % report_every == 0:
            print("# of episode :{}, avg score : {:.5f}".format(
                episode, running_score / report_every))
            running_score = 0.0

    env.close()
def test_export_to_python_after_load():
    """Check that a reloaded model exported to Python code predicts identically.

    A classifier is trained, saved, reloaded and exported with
    ``format="python"``. The generated ``apply_catboost_model`` function is
    then applied row by row and compared against the raw predictions of both
    the original and the reloaded model.
    """
    import os.path
    import sys

    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    trained = CatBoostClassifier(iterations=40, random_seed=0)
    trained.fit(train_pool)
    pred_model = trained.predict(test_pool, prediction_type='RawFormulaVal')
    trained.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    reloaded.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python", pool=train_pool)
    pred_model_loaded = reloaded.predict(test_pool, prediction_type='RawFormulaVal')

    # Make the freshly exported module importable as `model`.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python

    pred_python = [
        apply_catboost_model_from_python(
            *_split_features(
                row,
                train_pool.get_cat_feature_indices(),
                test_pool.get_cat_feature_hash_to_string(),
            )
        )
        for row in test_pool.get_features()
    ]

    assert _check_data(pred_model, pred_python)
    assert _check_data(pred_model_loaded, pred_python)
def get_predict_2020():
    """Predict the 2020 crop code for each row and write the results to CSV.

    Steps:
      1. Read ``grouped_full.csv`` and encode the 2015-2019 crop codes as
         integer token ids using a gensim dictionary (loaded from
         ``cult_token.txtdic`` or built from the training data and cached).
      2. Predict ``CODE_CULT_2020`` with the CatBoost model ``catboostmodel``
         from the 2016-2019 codes plus ``LATITUDE`` / ``LONGTITUDE``.
      3. Override the prediction for two rule-based cases:
         rows whose code was the same in all five years keep that code, and
         rows matching the "two year" pattern take the 2019 code.
      4. Decode all ids back to crop codes and write ``predict_2020_full.csv``
         and ``predict_2020.csv``.

    The overrides in step 3 use a single ``.loc[mask, column]`` assignment.
    The previous ``df.loc[row]['col'] = value`` form was chained assignment:
    it wrote to a temporary row copy, so the override could be silently lost.
    """
    df_data = pd.read_csv("dvhb_data/test/test 2020/grouped_full.csv", index_col=0)

    # Encode the crop-code strings as integer token ids.
    if os.path.isfile('cult_token.txtdic'):
        dictionary = corpora.Dictionary.load('cult_token.txtdic')
    else:
        df_train_full = my_full_cvs("dvhb_data/train", "train_full.csv")
        df_train_full.columns = ['CODE_CULT', 'CODE_GROUP', 'CENTROID', 'YEAR']
        dictionary = corpora.Dictionary([df_train_full['CODE_CULT'].tolist()])
        dictionary.save('cult_token.txtdic')

    history_cols = [f'CODE_CULT_{year}' for year in range(2015, 2020)]

    # Replace each crop code with its id from the dictionary.
    for col in history_cols:
        df_data[col] = df_data[col].map(dictionary.token2id)

    # The model reads columns '1'..'4': temporarily rename the year columns
    # to '1'..'6', then shift 2016-2019 ('2'..'5') down to '1'..'4'.
    df_data.rename(columns={f'CODE_CULT_{2015 + i}': f'{i + 1}' for i in range(6)},
                   inplace=True)
    model = CatBoostClassifier()
    model.load_model("catboostmodel")
    predictions_valid = model.predict(
        df_data[['2', '3', '4', '5', 'LATITUDE', 'LONGTITUDE']].rename(
            columns={'2': '1', '3': '2', '4': '3', '5': '4'})
    )
    df_data = df_data.assign(CODE_CULT_2020=predictions_valid)
    df_data.rename(columns={f'{i + 1}': f'CODE_CULT_{2015 + i}' for i in range(6)},
                   inplace=True)

    c15, c16, c17, c18, c19 = (df_data[col] for col in history_cols)

    # Same crop in every observed year.
    permanent_mask = (
        (c15 == c16) & (c15 == c17) & (c15 == c18) & (c15 == c19)
    ).to_numpy()

    # 2015 == 2016, 2017 == 2018, with a change in between and again in 2019;
    # rows whose index label belongs to a "permanent" row are excluded.
    two_year_mask = (
        (c15 == c16) & (c17 == c18) & (c15 != c18) & (c19 != c18)
    ).to_numpy()
    two_year_mask &= ~df_data.index.isin(df_data.index[permanent_mask])

    # Positional values (to_numpy) avoid index alignment problems when the
    # index contains duplicate labels.
    df_data.loc[permanent_mask, 'CODE_CULT_2020'] = (
        df_data.loc[permanent_mask, 'CODE_CULT_2015'].to_numpy())
    df_data.loc[two_year_mask, 'CODE_CULT_2020'] = (
        df_data.loc[two_year_mask, 'CODE_CULT_2019'].to_numpy())

    # Decode token ids back to the original crop codes.
    for col in ['CODE_CULT_2020'] + history_cols[::-1]:
        df_data[col] = df_data[col].map(dictionary.get)

    df_data[['CODE_CULT_2015', 'CODE_CULT_2016', 'CODE_CULT_2017', 'CODE_CULT_2018',
             'CODE_CULT_2019', 'CODE_CULT_2020', 'LATITUDE', 'LONGTITUDE']].to_csv(
        'predict_2020_full.csv', index=True)
    df_data['CODE_CULT_2020'].to_csv('predict_2020.csv', index=True)
def gbm_predict(data):
    """Return predictions for ``data`` from the model in ``./models/gbm1.cbm``.

    The model file is loaded on every call.
    """
    classifier = CatBoostClassifier()
    classifier.load_model('./models/gbm1.cbm')
    return classifier.predict(data)
def submit(args):
    """Load the classifier and detections, build predictions and write a CSV.

    The loaded classifier is stored in the module-level ``model`` global.
    Detections are read from ``args.detect_pred_file``, parsed with
    ``get_det``, processed by ``parallel_apply(..., add_pred_string)`` and
    written to ``args.out`` with columns ``ImageId`` and ``PredictionString``.
    """
    global model
    model = CatBoostClassifier()

    print('loading {}...'.format(args.model_file))
    model.load_model(args.model_file)
    for model_info in (model.classes_, model._tree_count, model.learning_rate_):
        print(model_info)

    print('loading {}...'.format(args.detect_pred_file))
    detections = pd.read_csv(args.detect_pred_file)
    detections['dets'] = detections.PredictionString.map(
        lambda raw: get_det(str(raw)))
    print('detected objs:', detections.dets.map(len).sum())

    print('predicting...')
    submission = detections.copy()
    submission.PredictionString = ''
    started = time.time()
    submission = parallel_apply(submission, add_pred_string)
    submission.to_csv(args.out, columns=['ImageId', 'PredictionString'], index=False)
    print('Done, total time:', time.time() - started)
def predict(self, feature_names):
    """
    Average the predictions of the per-fold CatBoost models on the test set.

    Input:
        feature_names: dictionary of features' names
    Output:
        predict_df: DataFrame(["MachineIdentifier", "HasDetections"])
    """
    FOLDS = 5
    model_directory_path = (
        Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
    )

    preds = None
    predict_df = None
    for fold in range(FOLDS):
        clf = CatBoostClassifier()
        clf.load_model(fname=str(model_directory_path / "valid{}.model".format(fold)))

        test_df = super().get_feature_df(feature_names, "valid{}".format(fold), "test")
        # The identifier column is captured once, from the first fold.
        if predict_df is None:
            predict_df = test_df["MachineIdentifier"]
        test_df = test_df.set_index("MachineIdentifier")

        # Each fold contributes 1/FOLDS of the final prediction.
        fold_share = self.predict_chunk(clf, test_df) / FOLDS
        if preds is None:
            preds = fold_share
        else:
            preds += fold_share

    predict_df = pd.DataFrame(predict_df)
    predict_df["HasDetections"] = preds
    return predict_df
def train_catboost_with_hyperopt(train_df, valid_df, target, features, categorical_features,
                                 catboost_options, hyperopt_options):
    """Tune CatBoost with hyperopt and return the best trial's model.

    The search space is built from ``hyperopt_options['space']``: each entry
    names an ``hp`` expression and its parameters. All trial results are
    dumped to ``hyperopt_trials.json``; the trial with the lowest ``loss`` is
    selected and its model file is loaded.

    Returns:
        Tuple ``(train_quality, valid_quality, best_model)``.
    """
    logging.info('Running hyper parameters optimization: %s', config2json(hyperopt_options))

    search_space = {
        name: getattr(hp, spec['expression'])(label=name, **spec['params'])
        for name, spec in hyperopt_options['space'].items()
    }
    objective = get_hyperopt_objective(train_df, valid_df, target, features,
                                       categorical_features, catboost_options)

    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=hyperopt_options['max_evals'],
    )

    with open('hyperopt_trials.json', 'w') as trials_file:
        json.dump(trials.results, trials_file, indent=4)

    logging.info('Best parameters: %s', best_params)

    best_trial, best_trial_result = min(enumerate(trials.results),
                                        key=lambda indexed: indexed[1]['loss'])
    quality = best_trial_result['quality']
    model_file = best_trial_result['model']['file']
    logging.info('Best model %d: AUC=%s, model=%s' % (
        best_trial, quality['valid']['auc'], model_file))

    best_model = CatBoostClassifier()
    best_model.load_model(model_file)
    return quality['train'], quality['valid'], best_model
class CatBoostWrapper(mlflow.pyfunc.PythonModel):
    """
    MLflow wrapper for CatBoost estimators.
    """

    def load_context(self, context):  # pylint: disable=attribute-defined-outside-init
        """Restore the pipeline, column configuration and classifier.

        Reads the ``pipeline``, ``col_config`` and ``cbm_model`` entries of
        ``context.artifacts``.
        """
        artifacts = context.artifacts

        with open(artifacts['pipeline'], 'rb') as pipeline_file:
            self.pipeline = pickle.load(pipeline_file)
        with open(artifacts['col_config'], 'rb') as config_file:
            column_config = pickle.load(config_file)

        self.clf = CatBoostClassifier()
        self.clf.load_model(artifacts['cbm_model'])

        self.col_names = column_config['col_names']
        self.preserve_cols = column_config['preserve_neg_vals']

    def preprocess(self, data):
        """
        Applies the pre-processing pipeline to the features given in the input dataset.

        :param data: Input dataset.
        :return: Transformed dataset.
        """
        selected = data[self.col_names]
        without_inf = remove_inf_values(selected)
        cleaned = remove_negative_values(without_inf, ignore_cols=self.preserve_cols)
        return self.pipeline.transform(cleaned)

    def predict(self, context, model_input):
        """Pre-process ``model_input`` and return the classifier's predictions."""
        return self.clf.predict(self.preprocess(model_input))
def load_model_list(dir_path):
    """Load one CatBoost model per class from ``dir_path``.

    Files are named ``model_for_class_<i>.dump`` for ``i`` in
    ``range(config.num_classes)``; the returned list is ordered by ``i``.
    """
    def _load_for_class(class_idx):
        clf = CatBoostClassifier()
        clf.load_model(os.path.join(dir_path, 'model_for_class_%d.dump' % class_idx))
        return clf

    return [_load_for_class(class_idx) for class_idx in range(config.num_classes)]
def load_model(name, alg, i):
    """Load the stored model ``results/models/<name>_<alg>_<i>``.

    Models with ``alg == "rf"`` are read with ``joblib``; every other
    algorithm is loaded as a CatBoost classifier.
    """
    model_file = "results/models/" + name + "_" + alg + "_" + str(i)

    if alg == "rf":
        return joblib.load(model_file)

    booster = CatBoostClassifier()
    booster.load_model(model_file)
    return booster
def load_catboost_model(model_name: str) -> CatBoostClassifier:
    """Load the CatBoost model stored as ``PATH_MODELS / model_name``."""
    model_file = str(PATH_MODELS / model_name)
    classifier = CatBoostClassifier()
    classifier.load_model(model_file)
    return classifier
def predict_catboost(model_path, big_category, model_names=model_names):
    """Predict the test subset of ``big_category`` with a saved CatBoost model.

    The features are read by ``read_probabilties`` from
    ``ROOT_PROBA_FOLDER/<big_category>``. ``model_names`` is accepted but not
    used by this function.
    """
    proba_folder = os.path.join(ROOT_PROBA_FOLDER, big_category)
    test_x = read_probabilties(proba_folder=proba_folder, subset='test')
    test_data = Pool(test_x)

    classifier = CatBoostClassifier()
    classifier.load_model(model_path)
    return classifier.predict(test_data)
def toloka(self):
    """Build the Toloka pool from the most probable class of each test row.

    For every row of ``predict_proba`` output the highest-probability class
    (first one on ties) is paired with its probability, and the resulting
    list is passed to ``self.test_pool.build_toloka_pool``.
    """
    model = CatBoostClassifier()
    model.load_model(self.args.model_path)

    catboost_pool = self.to_ml_input(self.test_pool.pool, "test")
    probabilities = model.predict_proba(catboost_pool)

    by_probability = operator.itemgetter(1)
    top_per_row = (max(enumerate(row), key=by_probability) for row in probabilities)
    test_y_max = [
        (int(model.classes_[class_idx]), proba) for class_idx, proba in top_per_row
    ]

    self.test_pool.build_toloka_pool(test_y_max, self.args.toloka_pool)
def test_multiclass():
    """Save a MultiClass model, reload it and canonize its probabilities.

    The probabilities predicted by the reloaded model are written to
    ``PREDS_PATH`` and returned as a local canonical file.
    """
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    trained = CatBoostClassifier(
        iterations=2,
        random_seed=0,
        loss_function='MultiClass',
        thread_count=8,
    )
    trained.fit(pool)
    trained.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)

    probabilities = np.array(reloaded.predict_proba(pool))
    np.save(PREDS_PATH, probabilities)
    return local_canonical_file(PREDS_PATH)
def get_model(cls):
    """Get the model object for this instance, loading it if it's not already loaded.

    On first use the CatBoost model ``model-classification-prod`` and the
    pickled ``obj_col_categories.pkl`` are read from ``model_path`` and cached
    on ``cls.model`` as ``[obj_col_categories, model]``.

    Returns:
        The cached ``[obj_col_categories, model]`` list.
    """
    # Identity check: `== None` would invoke a custom `__eq__` on the cached
    # object and could trigger a needless reload.
    if cls.model is None:
        print('i am here')
        model_file = CatBoostClassifier()
        model_file.load_model(os.path.join(model_path, 'model-classification-prod'))
        # NOTE(review): pickle.load executes arbitrary code; the file must
        # come from a trusted location.
        with open(os.path.join(model_path, 'obj_col_categories.pkl'), 'rb') as inp:
            obj_col_categories = pickle.load(inp)
        print('Model is loaded:-')
        cls.model = [obj_col_categories, model_file]
    return cls.model
def recognition_emotion_from_voice():
    """Return the most frequent predicted label for the clips in ``data/voice``.

    The model ``stable_model`` is always loaded first. If the directory holds
    fewer than three entries, ``None`` is returned; otherwise features are
    extracted, each row is classified and the most common label is returned.
    """
    classifier2 = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.25,
        depth=5,
        loss_function='MultiClassOneVsAll',
        eval_metric="Accuracy",
    )
    classifier2.load_model("stable_model")

    if len(os.listdir("data/voice/")) < 3:
        return None

    data = At.dirsWavFeatureExtraction(["data/voice"], 1, 1, 0.05, 0.05)
    labels = [row[0] for row in classifier2.predict(data[0][0])]
    return max(labels, key=labels.count)
def test_model():
    """Print the model's output for the 'under' relationships of the train set.

    Rows of ``challenge-2019-train-vrd.csv`` labelled ``under`` are expanded
    with ``add_features``. The first 100 probability rows are printed,
    followed by whether each of the first 100 predictions equals 1.
    """
    model = CatBoostClassifier()
    model.load_model('insideof/cat_470_167.model')

    relationships = pd.read_csv(os.path.join(DATA_DIR, 'challenge-2019-train-vrd.csv'))
    is_under = relationships.RelationshipLabel == 'under'
    under_rows = relationships.loc[is_under].copy()
    under_rows = parallel_apply(under_rows, add_features)

    features = under_rows.drop(['ImageID', 'RelationshipLabel'], axis=1)
    probabilities = model.predict_proba(features)
    predictions = model.predict(features)

    print(probabilities[:100])
    print(predictions[:100] == 1)
def load_catboost_predictor(name):
    """Build a prediction function from the artifacts saved under ``name``.

    Loads ``<name>.cbm`` (classifier), ``<name>_le.job`` and
    ``<name>_vect.job``. The returned ``predict(data)`` transforms ``data``
    with the vectorizer, densifies it, classifies it and maps the integer
    predictions back through ``le.inverse_transform``.
    """
    clf = CatBoostClassifier()
    clf.load_model(f"{name}.cbm")
    le = load(f"{name}_le.job")
    vect = load(f"{name}_vect.job")

    def predict(data):
        dense_features = todense(vect.transform(data))
        encoded = clf.predict(dense_features).flatten().astype('int64')
        return le.inverse_transform(encoded).flatten()

    return predict
def fit_predict(self, img_path):
    """
    Detect a face in the image, embed it and classify the embedding.

    Arguments:
    img_path -- path of the image file, read with cv2.imread()

    Return:
    pred -- prediction of the classifier stored in 'trained_model'
    probas -- class probabilities returned by predict_proba
    """
    detector_fa = dlib.get_frontal_face_detector()
    embedder = cv2.dnn.readNetFromTorch(self.embedderFile)
    predictor = dlib.shape_predictor(self.predictorFile)
    aligner = FaceAligner(predictor)

    img = imutils.resize(cv2.imread(img_path), width=600)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Retry with a higher second argument if nothing is found at first.
    # If the retry is empty too, indexing [0] raises IndexError.
    detections = detector_fa(gray, 3)
    if len(detections) != 0:
        rect = detections[0]
    else:
        rect = detector_fa(gray, 5)[0]

    face_aligned = aligner.align(img, gray, rect)
    face_blob = cv2.dnn.blobFromImage(
        face_aligned,
        scalefactor=1. / 255,
        size=(96, 96),
        mean=(0, 0, 0),
        swapRB=True,
    )
    embedder.setInput(face_blob)
    vec = embedder.forward()

    model = CatBoostClassifier()
    model.load_model('trained_model')
    return model.predict(vec), model.predict_proba(vec)
def segment(infile, outfile):
    """Segment the sentences of ``infile`` and write the result to ``outfile``.

    Features are extracted per row, classified by the model in
    ``models/catboost_1.model``, and the integer predictions are grouped into
    one list per ``(line_no, word_no)``. ``get_human_readable_segmentation``
    turns them into output sentences, written joined by newlines.
    """
    model = CatBoostClassifier()
    print("before laod model")
    model.load_model("models/catboost_1.model")
    print("after laod model")

    ff = extract_features(infile, feature_codes_1, return_style='dataframe')
    feature_columns = ['chr_position'] + feature_codes_1
    ff['predictions'] = model.predict(ff[feature_columns]).astype(int)
    preds = ff.groupby(['line_no', 'word_no'])['predictions'].apply(list)

    with open(infile, 'r') as reader:
        sentences = reader.readlines()

    seg_sents = get_human_readable_segmentation(sentences, preds)
    with open(outfile, 'w') as segfile:
        segfile.write('\n'.join(seg_sents))
def depar():
    """Predict departure delay for the submitted PNR and render the result page.

    Reads the PNR from ``request.form['in']``, looks up matching documents in
    the ``passengers2`` collection of the ``flightdb`` MongoDB database,
    classifies each with the ``unisys_departure_delay`` CatBoost model and
    renders ``depdelay.html`` with the verdict, an icon URL, a colour, a
    base64-encoded bar chart and the origin airport's coordinates.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm which statements belong inside the
    ``for`` loop and whether the ``return`` is inside it. As written, no
    matching document leaves ``op``/``mydoc`` unbound at the ``return``.
    """
    # Feature order expected by the model:
    # MONTH, AIRLINE, ORIGIN_AIRPORT, DEST_AIRPORT, SCHEDULED_DEPARTURE,
    # SCHEDULED_TIME, DISTANCE, SCHEDULED_ARRIVAL
    unisys_dep_delay_model = CatBoostClassifier()
    unisys_dep_delay_model.load_model("unisys_departure_delay")
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    # In-memory buffer that receives the rendered chart.
    img = io.BytesIO()
    mydb = myclient["flightdb"]
    mycol = mydb["passengers2"]
    airport = pd.read_csv('airports.csv')
    pnr = request.form['in']
    df = pd.read_csv('sample.csv')
    myquery = {"PNR": pnr}
    mydocs = mycol.find(myquery)
    for mydoc in mydocs:
        # Coordinates of the origin airport (first matching row).
        df_air = airport[(airport['IATA_CODE'] == mydoc['oair'])]
        lat = df_air['LATITUDE'].iloc[0]
        lan = df_air['LONGITUDE'].iloc[0]
        x = np.array([
            mydoc['month'], mydoc['airline'], mydoc['oair'], mydoc['dair'],
            mydoc['schdep'], mydoc['schtime'], mydoc['dist'], mydoc['scharr']
        ])
        preds_class = unisys_dep_delay_model.predict(x)
        # A positive prediction is reported as "no delay".
        if preds_class > 0:
            op = "No Delay in Departure"
            url = "https://cdn2.iconfinder.com/data/icons/yellow-smiles/1000/Smile-Icons-02_Converted-01-512.png"
            colr = "#007944"
        else:
            op = "Delay in Departure"
            url = "https://cdn0.iconfinder.com/data/icons/emoticons-round-smileys/137/Emoticons-14-512.png"
            colr = "#c81912"
        # Mean of `target_departure` per airline for this origin airport.
        temp_df = df[df['ORIGIN_AIRPORT'] == mydoc['oair']]
        df_month1 = temp_df.groupby(['AIRLINE'])[['target_departure']].mean()
        df_month1.reset_index(inplace=True)
        plt.bar(df_month1['AIRLINE'], df_month1['target_departure'])
        plt.xlabel('AIRLINE')
        plt.ylabel('% of delay')
        plt.savefig(img, format='jpg')
        img.seek(0)
        # Clear the current figure after saving it.
        plt.clf()
        # Chart bytes encoded as base64 text for the template.
        purl = base64.b64encode(img.getvalue()).decode()
    return render_template('depdelay.html', res=op, iurl=url, col=colr, pltu=purl, ap=mydoc['oair'], sla=lat, slo=lan)
def main(args):
    """Score the LSTM + CatBoost pipeline and print its accuracy.

    The tabular data from ``get_gbm_database`` is trimmed by ``seq_len`` rows
    at the start and ``out_seq_len`` rows at the end. The LSTM output for the
    last batch yielded by the production dataset is merged into the tabular
    features via ``get_lstm_feature`` and the result is scored by the model
    ``gbm.cbm``.
    """
    # --- data -------------------------------------------------------------
    X, y = get_gbm_database(
        args.telemetry_path,
        args.maint_path,
        args.machines_path,
        args.errors_path,
        args.failures_path,
        seq_len=args.out_seq_len,
        machine_id=args.machine_id,
    )
    window = slice(args.seq_len, -args.out_seq_len)
    X_gbm = X.iloc[window]
    y_target = y.iloc[window]
    n_rows = X_gbm.shape[0]

    dm = TelemetryDataModule(
        path=args.telemetry_path,
        seq_len=args.seq_len,
        out_seq_len=args.out_seq_len,
        batch_size=n_rows,
        num_workers=args.num_workers,
    )
    dm.setup(stage="prodaction")
    X_lstm = dm.prodaction_dataset()

    # --- models -----------------------------------------------------------
    lstm = LSTM.load_from_checkpoint(
        checkpoint_path=args.checkpoint_path + '/lstm.ckpt',
        n_features=args.n_features,
        hidden_size=args.hidden_size,
        seq_len=args.seq_len,
        out_seq_len=args.out_seq_len,
        batch_size=n_rows,
        criterion=args.criterion,
        num_layers=args.num_layers,
        dropout=args.dropout,
        learning_rate=args.learning_rate,
    )
    lstm.freeze()

    gbm = CatBoostClassifier()
    gbm.load_model(args.checkpoint_path + '/gbm.cbm')

    # --- prediction ---------------------------------------------------------
    # Only the output for the last yielded batch is kept.
    y_hat_lstm = None
    for batch, _ in X_lstm:
        y_hat_lstm = lstm(batch)

    X_gbm = get_lstm_feature(X_gbm, y_hat_lstm)
    score = gbm.score(X_gbm, y_target)
    print('Model accuracy: {0:.2f}%'.format(score * 100))
def catboost_predict_classes(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),
    label_column: int = None,
):
    '''Predict classes using the CatBoost classifier model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Class predictions in text format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import tempfile

    from catboost import CatBoostClassifier, Pool
    import numpy

    # `is not None` rather than truthiness: column index 0 is a valid label
    # column and must still get a column-description file.
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # The description file is created with delete=False and is not
        # removed by this function.
        column_description_path = tempfile.NamedTemporaryFile(
            delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier()
    model.load_model(model_path)

    predictions = model.predict(eval_data)
    numpy.savetxt(predictions_path, predictions, fmt='%s')
class Model:
    """CatBoost classifier kept on disk together with its training data.

    The model (``model.bkp``) and the accumulated training data
    (``old_data.xlsx``) live in the current working directory. Paths are
    built with ``os.path.join`` so they work on every platform; the earlier
    hard-coded ``'\\'`` separator only produced valid paths on Windows.
    """

    def __init__(self):
        """Load the saved model and the previously accumulated data."""
        self.model = CatBoostClassifier()
        self.model.load_model(os.path.join(os.getcwd(), 'model.bkp'))
        self.old_data = pd.read_excel(os.path.join(os.getcwd(), 'old_data.xlsx'))

    def predict(self, data):
        """Return the current model's predictions for ``data``."""
        return self.model.predict(data)

    def retrain(self, data):
        """Append ``data`` to the stored data, refit the model and persist both.

        Identifier, date and helper columns are dropped before training. The
        combined data is split 70/30 into train and validation sets, a new
        classifier is fitted, and the model and data files are overwritten.
        """
        new_data = pd.concat([self.old_data, data], axis=0, ignore_index=True)
        self.old_data = new_data.drop(labels=[
            'ID (Идентификатор Заявки)',
            'ID (Идентификатор Клиента)',
            'Дата заявки',
            'Unnamed: 0',
            ' - count',
            ' - summ',
        ], axis='columns')

        X = self.old_data.drop(labels='Target (90 mob 12)', axis='columns')
        y = self.old_data['Target (90 mob 12)']
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7)

        # Snapshot of the feature matrix used for this retraining run.
        X.to_excel(os.path.join(os.getcwd(), 'mmmm.xlsx'))

        # No column is treated as categorical.
        categorical_features_indices = []
        print(categorical_features_indices, X)

        self.model = CatBoostClassifier(
            thread_count=2,
            iterations=50,
            depth=1,
            l2_leaf_reg=2,
            learning_rate=0.001,
            random_seed=62,
            od_type='Iter',
            od_wait=10,
            custom_loss=['F1', 'AUC'],
            auto_class_weights='Balanced',
            use_best_model=True,
        )
        self.model.fit(X_train, y_train,
                       cat_features=categorical_features_indices,
                       eval_set=(X_val, y_val),
                       logging_level='Silent',
                       plot=True)

        self.model.save_model(os.path.join(os.getcwd(), 'model.bkp'))
        self.old_data.to_excel(os.path.join(os.getcwd(), 'old_data.xlsx'),
                               index_label=False)
def run(exp_name, data_type):
    """Predict class probabilities for one test subset and save them.

    ``data_type`` selects the rows of the test set and the label columns:

    * ``'ex_gal'``      - ``hostgal_specz`` null and ``distmod`` not null
    * ``'ex_gal_spec'`` - ``hostgal_specz`` and ``distmod`` both not null
    * ``'gal'``         - ``distmod`` null

    The model output is scattered into a 14-column array at the label columns
    seen in training for that group and saved under ``../preds/``.
    Any other ``data_type`` raises ``Error``.
    """
    tes_m = feather.read_dataframe('../others/tes_m.feather')
    le = load_pickle('../others/label_encoder.pkl')
    y = le.transform(np.load('../others/train_target.npy'))
    distmod_mask = np.load('../others/distmod_mask.npy')

    # Encoded labels that occur in each training subset.
    ex_gal_labels = np.where(np.bincount(y[distmod_mask]) != 0)[0]
    gal_labels = np.where(np.bincount(y[~distmod_mask]) != 0)[0]

    specz_missing = tes_m['hostgal_specz'].isnull()
    distmod_missing = tes_m['distmod'].isnull()
    ex_gal_index = (specz_missing & ~distmod_missing).values
    ex_gal_spec_index = (~specz_missing & ~distmod_missing).values
    gal_index = distmod_missing.values

    fn_s = np.load('../fi/' + exp_name + '_fn_s_' + data_type + '.npy')
    fn_s = [feature_name.replace('/', '_') for feature_name in fn_s]
    X_test = load_arr(fn_s, 'test')

    model = CatBoostClassifier()
    model.load_model('../models/' + exp_name + '_' + data_type + '.cbm')

    if data_type == 'ex_gal':
        row_mask, label_columns = ex_gal_index, ex_gal_labels
    elif data_type == 'ex_gal_spec':
        # The spectroscopic subset uses the same label set as 'ex_gal'.
        row_mask, label_columns = ex_gal_spec_index, ex_gal_labels
    elif data_type == 'gal':
        row_mask, label_columns = gal_index, gal_labels
    else:
        raise Error

    real_test_data = FeaturesData(X_test.astype(np.float32)[row_mask])
    probabilities = model.predict_proba(real_test_data)

    # Columns for labels outside this group stay zero.
    full_pred = np.zeros((probabilities.shape[0], 14))
    full_pred[:, label_columns] = probabilities
    np.save('../preds/' + data_type + '_pred_' + exp_name + '.npy', full_pred)

    gc.collect()
class CatBoostEnsembleModel:
    """Weighted ensemble of two saved CatBoost classifiers."""

    def __init__(self):
        """Load both member models and print the first model's classes."""
        self.model1 = CatBoostClassifier()
        self.model2 = CatBoostClassifier()
        self.model1.load_model('lb23578/cat_154k_144_1000.model')
        self.model2.load_model('lb22592/cat_0820_500_143.model')
        print(self.model1.classes_)

    def predict_with_proba(self, X, w=(0.7, 0.3)):
        """Return the predicted labels and the blended class probabilities.

        Args:
            X: Features passed to ``predict_proba`` of both models.
            w: Two weights, applied to the first and second model's
                probabilities. The default is an immutable tuple so it cannot
                be changed between calls.

        Returns:
            Tuple ``(labels, prob)``: ``prob`` is the weighted sum of both
            probability matrices and ``labels`` holds, per row, the entry of
            ``model1.classes_`` with the highest blended probability.
        """
        p1 = self.model1.predict_proba(X)
        p2 = self.model2.predict_proba(X)
        prob = p1 * w[0] + p2 * w[1]

        # Labels come from model1's class order.
        idx = np.argmax(prob, axis=1)
        labels = np.array(self.model1.classes_)[idx]
        return labels, prob
def predict_from_df_prod(self, df):
    """
    Production prediction code.

    Joins three metadata columns of ``df`` with the hidden states from
    ``self.create_hidden_states`` and returns the second column of
    ``predict_proba`` from the model ``models/catboost.production``.
    """
    hidden_states = self.create_hidden_states(df)

    # Metadata re-indexed from 0 so it lines up positionally with the
    # freshly built hidden-state frame.
    metadata_df = df[['sc_id_cat', 'version_number', 'partisan_lean']].reset_index(drop=True)
    hidden_df = pd.DataFrame(hidden_states)
    feature_extractor_df = pd.concat([metadata_df, hidden_df], axis=1)

    catboost_model = CatBoostClassifier()
    catboost_model.load_model('models/catboost.production')
    return catboost_model.predict_proba(feature_extractor_df)[:, 1]
class CatBoost:
    """Wrapper around a MultiClass ``CatBoostClassifier`` with persistence.

    The GPU settings are decided once, when the class body runs, from
    ``get_gpu_device_count()``. Each instance stores its model file under
    ``<model_save_path>/<model_id>/``.
    """

    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    _is_gpu_available = get_gpu_device_count()
    _task_type = "GPU" if _is_gpu_available > 0 else None
    _devices = "GPU" if _is_gpu_available > 0 else None

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        """Create the classifier and make sure its save directory exists.

        ``aux_params`` are forwarded to ``set_params``.
        ``num_input_features`` and ``num_output_classes`` are accepted but not
        used here.
        """
        classifier = CatBoostClassifier(
            loss_function="MultiClass",
            task_type=self._task_type,
            devices=self._devices,
            train_dir=self._train_dir,
            random_seed=SEED,
        )
        classifier.set_params(**aux_params)
        self.model = classifier
        self.model_id = model_id

        save_dir = f"{model_save_path}/{model_id}"
        os.makedirs(save_dir, exist_ok=True)
        self.model_path = save_dir
        self.modelfile_save_path = os.path.join(save_dir, STANDARD_MODEL_NAME)

    def load(self):
        """Load the model from ``self.modelfile_save_path``."""
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        """Write the model to ``self.modelfile_save_path``."""
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Fit on the training data with the validation set, then save."""
        train_pool = Pool(X_train, y_train)
        self.model.fit(
            train_pool,
            eval_set=(X_valid, y_valid),
            use_best_model=True,
            verbose=self._verbose,
        )
        self.save()

    def predict(self, X, load=False):
        """Return class probabilities for ``X``; reload the model if ``load``."""
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        """Plot feature importances computed on the training data.

        ``classes`` is accepted but not used here.
        """
        importance_pool = Pool(X_train, y_train)
        importances = self.model.get_feature_importance(data=importance_pool)
        plot_importance(importances, features, self.model_path, self.model_id)
def boost_scor(t_s, sample_subm, model_paths, sol_path, model_number):
    """Score ``t_s`` with model number ``model_number`` and write a solution CSV.

    The column list, model and categorical-feature list are loaded from
    ``<model_paths>/cols_<n>``, ``model_<n>`` and ``cats_<n>``. The second
    ``predict_proba`` column is merged into ``sample_subm`` on ``app_id``,
    renamed to ``flag`` and written to ``<sol_path>/sol_<n>.csv``.
    """
    cols_path = f'{model_paths}/cols_{model_number}'
    model_path = f'{model_paths}/model_{model_number}'
    cats_path = f'{model_paths}/cats_{model_number}'
    output_path = f'{sol_path}/sol_{model_number}.csv'

    # Work on a copy so the caller's frame is not modified.
    scored = t_s.copy()

    cols = joblib.load(cols_path)
    classifier = CatBoostClassifier()
    classifier.load_model(model_path)
    cats = joblib.load(cats_path)

    test_pool = Pool(scored[cols], cat_features=cats)
    scored['score'] = classifier.predict_proba(test_pool)[:, 1]
    scored = scored[['app_id', 'score']]

    merged = sample_subm.merge(scored, on=['app_id'])
    # Drop `product` when present; errors='ignore' tolerates its absence.
    submission = merged.drop(['product'], axis=1, errors='ignore')
    submission = submission.rename(columns={'score': 'flag'})
    submission.to_csv(output_path, index=False)
def get_model(cls):
    """Return the cached model, loading ``heart.cbm`` on first use.

    The value cached on ``cls.model`` is whatever ``load_model`` returns for
    the file ``heart.cbm`` inside ``model_path``.
    """
    if cls.model is not None:
        return cls.model

    loader = CatBoostClassifier()
    cls.model = loader.load_model(os.path.join(model_path, 'heart.cbm'))
    return cls.model