Esempio n. 1
0
def train():
    """Roll out a pre-trained CatBoost policy in ``MyEnv`` and print scores.

    The classifier is loaded from ``catboost_model.model`` and used to play
    10000 episodes. At every step the class probabilities predicted for the
    current observation choose the action:

    * when the free variable ``deterministic`` (not defined in this function)
      is truthy, the arg-max class is translated by ``action_mapping``;
    * otherwise an action is sampled from ``[0, 1, 3, 4, 5]`` using the
      probabilities as weights.

    The reward accumulated since the last report is printed every
    ``print_interval`` episodes (episode 0 is never reported).

    The environment is closed even if loading the model or stepping the
    environment raises.
    """
    env = my_env.MyEnv(0, realtime_mode=True)
    try:
        model = CatBoostClassifier()
        model.load_model("catboost_model.model")

        score = 0.0
        print_interval = 1

        for n_epi in range(10000):
            s = env.reset()
            done = False
            # The loop condition alone ends the episode; no explicit break needed.
            while not done:
                y_pred1 = model.predict(s, prediction_type="Probability")

                if deterministic:
                    y_pred_max = int(np.argmax(y_pred1))
                    a = action_mapping(y_pred_max)
                else:
                    a = int(np.random.choice([0, 1, 3, 4, 5], p=y_pred1))

                s, r, done, _ = env.step(a)
                score += r

            if n_epi % print_interval == 0 and n_epi != 0:
                print("# of episode :{}, avg score : {:.5f}".format(n_epi, score/print_interval))
                score = 0.0
    finally:
        # Release the environment on both the normal and the error path.
        env.close()
Esempio n. 2
0
def test_export_to_python_after_load():
    """Check that a reloaded model exports to Python code matching its raw scores.

    A classifier is trained, saved and loaded back; the loaded copy is then
    exported with ``format="python"``. The exported ``apply_catboost_model``
    function is applied to every test row and its outputs are compared (via
    ``_check_data``) with the ``RawFormulaVal`` predictions of both the
    original and the reloaded model.
    """
    import os.path
    import sys

    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)

    trained = CatBoostClassifier(iterations=40, random_seed=0)
    trained.fit(learn_pool)
    raw_from_trained = trained.predict(eval_pool, prediction_type='RawFormulaVal')
    trained.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    reloaded.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python", pool=learn_pool)
    raw_from_reloaded = reloaded.predict(eval_pool, prediction_type='RawFormulaVal')

    # Make the directory of the exported file importable, then import it as ``model``.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python

    def _apply_exported(row):
        # Split one feature row into the float / categorical parts the exported code takes.
        numeric_part, categorical_part = _split_features(
            row,
            learn_pool.get_cat_feature_indices(),
            eval_pool.get_cat_feature_hash_to_string())
        return apply_catboost_model_from_python(numeric_part, categorical_part)

    raw_from_python = [_apply_exported(row) for row in eval_pool.get_features()]

    assert _check_data(raw_from_trained, raw_from_python)
    assert _check_data(raw_from_reloaded, raw_from_python)
Esempio n. 3
0
def get_predict_2020():
    """Predict the 2020 ``CODE_CULT`` value for every row and write it to CSV.

    Steps:

    1. Read ``dvhb_data/test/test 2020/grouped_full.csv`` (first column is
       the index).
    2. Load the token <-> id dictionary from ``cult_token.txtdic``; when the
       file does not exist, build it from the ``CODE_CULT`` column of the
       training data and save it.
    3. Encode the 2015-2019 code columns as dictionary ids.
    4. Predict 2020 with the CatBoost model stored in ``catboostmodel``,
       feeding the 2016-2019 codes (renamed to ``'1'``..``'4'``) plus
       ``LATITUDE`` and ``LONGTITUDE``.
    5. Override the prediction for two rule-based groups:
       rows whose code is identical in all of 2015-2019 take that code, and
       rows with a two-year pattern (2015 == 2016, 2017 == 2018, 2015 != 2018,
       2019 != 2018) take the 2019 code.
    6. Decode all code columns back to tokens and write
       ``predict_2020_full.csv`` and ``predict_2020.csv``.
    """
    df_data = pd.read_csv("dvhb_data/test/test 2020/grouped_full.csv", index_col=0)

    # Encode code tokens as integer ids; the dictionary is cached on disk.
    if os.path.isfile('cult_token.txtdic'):
        dictionary = corpora.Dictionary.load('cult_token.txtdic')
    else:
        df_train_full = my_full_cvs("dvhb_data/train", "train_full.csv")
        df_train_full.columns = ['CODE_CULT', 'CODE_GROUP', 'CENTROID', 'YEAR']
        dictionary = corpora.Dictionary([df_train_full['CODE_CULT'].tolist()])
        dictionary.save('cult_token.txtdic')

    history_cols = [f'CODE_CULT_{year}' for year in range(2015, 2020)]

    # Replace every historical code with its id from the dictionary.
    for col in history_cols:
        df_data[col] = df_data[col].map(dictionary.token2id)

    model = CatBoostClassifier()
    model.load_model("catboostmodel")

    # The model is fed the four most recent years under the names '1'..'4'.
    model_input_names = {f'CODE_CULT_{2016 + i}': f'{i + 1}' for i in range(4)}
    predictions_valid = model.predict(
        df_data[list(model_input_names) + ['LATITUDE', 'LONGTITUDE']].rename(columns=model_input_names)
    )
    df_data = df_data.assign(CODE_CULT_2020=predictions_valid)

    code_2015 = df_data['CODE_CULT_2015']
    permanent = (
        (code_2015 == df_data['CODE_CULT_2016'])
        & (code_2015 == df_data['CODE_CULT_2017'])
        & (code_2015 == df_data['CODE_CULT_2018'])
        & (code_2015 == df_data['CODE_CULT_2019']))
    two_year = (
        (code_2015 == df_data['CODE_CULT_2016'])
        & (df_data['CODE_CULT_2017'] == df_data['CODE_CULT_2018'])
        & (code_2015 != df_data['CODE_CULT_2018'])
        & (df_data['CODE_CULT_2019'] != df_data['CODE_CULT_2018'])
        & ~df_data.index.isin(df_data.index[permanent]))

    # Single-step .loc assignment: the previous ``df.loc[idx][col] = value``
    # form wrote to a temporary row object, so the overrides were not
    # reliably stored in ``df_data``. ``.values`` skips index alignment.
    df_data.loc[permanent, 'CODE_CULT_2020'] = df_data.loc[permanent, 'CODE_CULT_2015'].values
    df_data.loc[two_year, 'CODE_CULT_2020'] = df_data.loc[two_year, 'CODE_CULT_2019'].values

    # Decode ids back to tokens for the prediction and the history columns.
    for col in ['CODE_CULT_2020'] + history_cols[::-1]:
        df_data[col] = df_data[col].map(dictionary.get)

    df_data[['CODE_CULT_2015', 'CODE_CULT_2016', 'CODE_CULT_2017', 'CODE_CULT_2018', 'CODE_CULT_2019', 'CODE_CULT_2020',
             'LATITUDE', 'LONGTITUDE']].to_csv('predict_2020_full.csv', index=True)
    df_data['CODE_CULT_2020'].to_csv('predict_2020.csv', index=True)
Esempio n. 4
0
def gbm_predict(data):
    """Return the predictions of the CatBoost model in ``./models/gbm1.cbm`` for ``data``.

    The model file is loaded on every call.
    """
    classifier = CatBoostClassifier()
    classifier.load_model('./models/gbm1.cbm')
    return classifier.predict(data)
def submit(args):
    """Load the classifier, score the detection file and write the submission CSV.

    Reads ``args.model_file`` (CatBoost model), ``args.detect_pred_file``
    (CSV with a ``PredictionString`` column) and writes the ``ImageId`` /
    ``PredictionString`` columns to ``args.out``. Progress and some model
    attributes are printed along the way.
    """
    # The classifier is rebound as a module-level global; presumably
    # ``add_pred_string`` reads it from there -- verify against that helper.
    global model
    model = CatBoostClassifier()
    print('loading {}...'.format(args.model_file))
    model.load_model(args.model_file)
    print(model.classes_)
    print(model._tree_count)
    print(model.learning_rate_)

    print('loading {}...'.format(args.detect_pred_file))
    detections = pd.read_csv(args.detect_pred_file)
    detections['dets'] = detections.PredictionString.map(lambda raw: get_det(str(raw)))
    print('detected objs:', detections.dets.map(len).sum())

    print('predicting...')
    submission = detections.copy()
    submission.PredictionString = ''

    started_at = time.time()
    submission = parallel_apply(submission, add_pred_string)
    submission.to_csv(args.out,
                      columns=['ImageId', 'PredictionString'],
                      index=False)

    print('Done, total time:', time.time() - started_at)
    def predict(self, feature_names):
        """
        Average the test predictions of the per-fold CatBoost models.

        For each of the 5 folds the model ``valid{fold}.model`` is loaded from
        ``<three levels above this file>/data/model/<get_version()>`` and
        applied (through ``self.predict_chunk``) to that fold's test features;
        each fold contributes 1/5 of the final prediction.

        Input:
            feature_names: dictionary of features' names
        Output:
            predict_df: DataFrame(["MachineIdentifier", "HasDetections"])
        """
        model_directory_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
        FOLDS = 5
        preds = None
        predict_df = None
        for fold in range(FOLDS):
            model_path = model_directory_path / "valid{}.model".format(fold)
            clf = CatBoostClassifier()
            clf.load_model(fname=str(model_path))
            valid = "valid{}".format(fold)
            test_df = super().get_feature_df(feature_names, valid, "test")
            # The identifier column is taken from the first fold's frame only.
            if predict_df is None:
                predict_df = test_df["MachineIdentifier"]
            test_df = test_df.set_index("MachineIdentifier")
            fold_preds = self.predict_chunk(clf, test_df) / FOLDS
            if preds is None:
                preds = fold_preds
            else:
                preds += fold_preds

        predict_df = pd.DataFrame(predict_df)
        predict_df["HasDetections"] = preds
        return predict_df
def train_catboost_with_hyperopt(train_df, valid_df, target, features, categorical_features, catboost_options, hyperopt_options):
    """Tune CatBoost with hyperopt and return the best trial's model.

    The search space is built from ``hyperopt_options['space']``: each entry
    names an ``hp`` expression (``opts['expression']``) and its keyword
    arguments (``opts['params']``). ``fmin`` is run with the TPE algorithm
    for ``hyperopt_options['max_evals']`` evaluations, and every trial result
    is dumped to ``hyperopt_trials.json``.

    Returns:
        A tuple ``(train_quality, valid_quality, best_model)`` taken from the
        trial with the smallest ``loss``; ``best_model`` is loaded from the
        model file recorded in that trial's result.
    """
    logging.info('Running hyper parameters optimization: %s', config2json(hyperopt_options))

    space = {
        param: getattr(hp, opts['expression'])(label=param, **opts['params'])
        for param, opts in hyperopt_options['space'].items()
    }

    fcn = get_hyperopt_objective(train_df, valid_df, target, features, categorical_features, catboost_options)

    trials = Trials()
    opt = fmin(
        fn=fcn,
        space=space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=hyperopt_options['max_evals']
    )

    with open('hyperopt_trials.json', 'w') as f:
        json.dump(trials.results, f, indent=4)

    logging.info('Best parameters: %s', opt)

    best_trial, best_trial_result = min(enumerate(trials.results), key=lambda r: r[1]['loss'])
    # Lazy logging arguments, consistent with the other logging calls here.
    logging.info('Best model %d: AUC=%s, model=%s',
                 best_trial, best_trial_result['quality']['valid']['auc'], best_trial_result['model']['file'])

    best_model = CatBoostClassifier()
    best_model.load_model(best_trial_result['model']['file'])
    return best_trial_result['quality']['train'], best_trial_result['quality']['valid'], best_model
Esempio n. 8
0
class CatBoostWrapper(mlflow.pyfunc.PythonModel):
    """
    MLflow ``pyfunc`` adapter that pairs a CatBoost classifier with its
    pre-processing pipeline.
    """
    def load_context(self, context):
        """
        Restore the pipeline, column configuration and classifier from the
        artifacts registered under ``pipeline``, ``col_config`` and
        ``cbm_model``.

        :param context: MLflow context exposing an ``artifacts`` mapping.
        """
        # pylint: disable=attribute-defined-outside-init
        artifacts = context.artifacts

        # NOTE: both artifacts are unpickled; only load artifacts from a trusted source.
        with open(artifacts['pipeline'], 'rb') as pipeline_file:
            self.pipeline = pickle.load(pipeline_file)

        with open(artifacts['col_config'], 'rb') as config_file:
            col_config = pickle.load(config_file)

        classifier = CatBoostClassifier()
        classifier.load_model(artifacts['cbm_model'])
        self.clf = classifier
        self.col_names = col_config['col_names']
        self.preserve_cols = col_config['preserve_neg_vals']

    def preprocess(self, data):
        """
        Select the configured columns, clean them and run the fitted pipeline.

        :param data: Input dataset.
        :return: Transformed dataset.
        """
        selected = data[self.col_names]
        without_inf = remove_inf_values(selected)
        cleaned = remove_negative_values(without_inf, ignore_cols=self.preserve_cols)
        return self.pipeline.transform(cleaned)

    def predict(self, context, model_input):
        """
        Pre-process ``model_input`` and return the classifier's predictions.

        :param context: MLflow context (not used here).
        :param model_input: Input dataset.
        :return: Output of the classifier's ``predict``.
        """
        return self.clf.predict(self.preprocess(model_input))
Esempio n. 9
0
def load_model_list(dir_path):
    """Load one CatBoost classifier per class from ``dir_path``.

    The model for class ``i`` is read from ``model_for_class_<i>.dump``;
    the number of classes comes from ``config.num_classes``. The returned
    list is ordered by class index.
    """
    def _load_for_class(class_idx):
        classifier = CatBoostClassifier()
        classifier.load_model(os.path.join(dir_path, 'model_for_class_%d.dump' % class_idx))
        return classifier

    return [_load_for_class(class_idx) for class_idx in range(config.num_classes)]
Esempio n. 10
0
def load_model(name, alg, i):
    """Load the model stored at ``results/models/<name>_<alg>_<i>``.

    Files for ``alg == "rf"`` are read with ``joblib``; any other algorithm
    name is loaded as a CatBoost classifier.
    """
    model_file = "results/models/" + name + "_" + alg + "_" + str(i)
    if alg == "rf":
        return joblib.load(model_file)

    classifier = CatBoostClassifier()
    classifier.load_model(model_file)
    return classifier
Esempio n. 11
0
def load_catboost_model(model_name: str) -> CatBoostClassifier:
    """Load the CatBoost model file ``PATH_MODELS / model_name``
    and return the resulting classifier.
    """
    classifier = CatBoostClassifier()
    model_file = str(PATH_MODELS / model_name)
    classifier.load_model(model_file)
    return classifier
Esempio n. 12
0
def predict_catboost(model_path, big_category, model_names=model_names):
    """Predict on the test probabilities of ``big_category`` with a saved model.

    The features are read by ``read_probabilties`` from
    ``ROOT_PROBA_FOLDER/<big_category>`` (subset ``'test'``) and scored by the
    CatBoost model loaded from ``model_path``. ``model_names`` is accepted
    but not used in this function.
    """
    category_folder = os.path.join(ROOT_PROBA_FOLDER, big_category)
    features = read_probabilties(proba_folder=category_folder, subset='test')
    pool = Pool(features)

    classifier = CatBoostClassifier()
    classifier.load_model(model_path)
    return classifier.predict(pool)
Esempio n. 13
0
 def toloka(self):
     """Score the test pool and hand the top class per row to the Toloka pool builder.

     For every row of predicted probabilities the most probable class is
     selected (first one wins on ties) and stored as
     ``(int(class_label), probability)``.
     """
     classifier = CatBoostClassifier()
     classifier.load_model(self.args.model_path)
     pool = self.to_ml_input(self.test_pool.pool, "test")

     by_probability = operator.itemgetter(1)
     top_predictions = []
     for row in classifier.predict_proba(pool):
         best_idx, best_proba = max(enumerate(row), key=by_probability)
         top_predictions.append((int(classifier.classes_[best_idx]), best_proba))

     self.test_pool.build_toloka_pool(top_predictions, self.args.toloka_pool)
Esempio n. 14
0
def test_multiclass():
    """Train a 2-iteration MultiClass model, reload it and canonize its probabilities.

    The probabilities predicted by the reloaded model on the training pool
    are saved to ``PREDS_PATH`` and returned as a local canonical file.
    """
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    fitted = CatBoostClassifier(iterations=2, random_seed=0, loss_function='MultiClass', thread_count=8)
    fitted.fit(train_pool)
    fitted.save_model(OUTPUT_MODEL_PATH)

    restored = CatBoostClassifier()
    restored.load_model(OUTPUT_MODEL_PATH)
    probabilities = np.array(restored.predict_proba(train_pool))

    np.save(PREDS_PATH, probabilities)
    return local_canonical_file(PREDS_PATH)
Esempio n. 15
0
def test_multiclass():
    """Round-trip a small MultiClass model through save/load and canonize its output.

    A 2-iteration classifier is fitted on the cloudness pool and saved; a
    fresh classifier loads that file and its ``predict_proba`` output on the
    same pool is written to ``PREDS_PATH``.
    """
    cloudness_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    trained = CatBoostClassifier(iterations=2, random_seed=0, loss_function='MultiClass', thread_count=8)
    trained.fit(cloudness_pool)
    trained.save_model(OUTPUT_MODEL_PATH)

    loaded = CatBoostClassifier()
    loaded.load_model(OUTPUT_MODEL_PATH)
    class_probabilities = loaded.predict_proba(cloudness_pool)

    np.save(PREDS_PATH, np.array(class_probabilities))
    return local_canonical_file(PREDS_PATH)
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded.

        On first use the CatBoost model ``model-classification-prod`` and the
        pickled ``obj_col_categories.pkl`` are read from ``model_path`` and
        cached on the class as ``[obj_col_categories, model_file]``; later
        calls return the cached list.
        """
        # Identity test: ``== None`` would go through the cached object's ``__eq__``.
        if cls.model is None:
            print('i am here')
            model_file = CatBoostClassifier()
            model_file.load_model(os.path.join(model_path, 'model-classification-prod'))

            # NOTE: unpickling runs arbitrary code; the file must come from a trusted source.
            with open(os.path.join(model_path, 'obj_col_categories.pkl'), 'rb') as inp:
                obj_col_categories = pickle.load(inp)
                print('Model is loaded:-')
            cls.model = [obj_col_categories, model_file]
        return cls.model
def recognition_emotion_from_voice():
    """Return the most frequently predicted label for the recordings in ``data/voice``.

    The classifier is loaded from ``stable_model``. When ``data/voice/``
    holds fewer than three entries nothing is predicted and ``None`` is
    returned; otherwise features are extracted from the directory and the
    label predicted most often is returned.
    """
    emotion_model = CatBoostClassifier(iterations=1000,
                                       learning_rate=0.25,
                                       depth=5,
                                       loss_function='MultiClassOneVsAll',
                                       eval_metric="Accuracy")
    emotion_model.load_model("stable_model")

    if len(os.listdir("data/voice/")) < 3:
        return None

    extracted = At.dirsWavFeatureExtraction(["data/voice"], 1, 1, 0.05, 0.05)
    # Each prediction row is indexed at 0 to get the bare label.
    labels = [row[0] for row in emotion_model.predict(extracted[0][0])]
    return max(labels, key=labels.count)
Esempio n. 18
0
def test_model():
    """Print the model's outputs on the 'under' relationships of the VRD training set.

    Loads ``insideof/cat_470_167.model``, builds features for every row whose
    ``RelationshipLabel`` is ``'under'`` and prints the first 100 predicted
    probabilities and whether each of the first 100 predicted labels equals 1.
    """
    classifier = CatBoostClassifier()
    classifier.load_model('insideof/cat_470_167.model')

    vrd_csv = os.path.join(DATA_DIR, 'challenge-2019-train-vrd.csv')
    relations = pd.read_csv(vrd_csv)
    under_rows = relations.loc[relations.RelationshipLabel == 'under'].copy()
    under_rows = parallel_apply(under_rows, add_features)

    features = under_rows.drop(['ImageID', 'RelationshipLabel'], axis=1)
    probabilities = classifier.predict_proba(features)
    labels = classifier.predict(features)
    print(probabilities[:100])
    print(labels[:100] == 1)
Esempio n. 19
0
def load_catboost_predictor(name):
    """Build a prediction function from the artifacts saved under ``name``.

    Loads the CatBoost model ``<name>.cbm`` plus the objects stored in
    ``<name>_le.job`` and ``<name>_vect.job`` and returns a closure
    ``predict(data)`` that transforms ``data`` with the second object,
    densifies it, predicts integer classes and maps them back with the
    first object's ``inverse_transform``.
    """
    classifier = CatBoostClassifier()
    classifier.load_model(f"{name}.cbm")
    label_encoder = load(f"{name}_le.job")
    vectorizer = load(f"{name}_vect.job")

    def predict(data):
        dense_features = todense(vectorizer.transform(data))
        encoded = classifier.predict(dense_features).flatten().astype('int64')
        return label_encoder.inverse_transform(encoded).flatten()

    return predict
Esempio n. 20
0
    def fit_predict(self, img_path):
        """
        Predict the identity of the face found in an image file.

        The image is read from disk and resized to width 600; the first
        detected face is aligned, embedded with the Torch network at
        ``self.embedderFile`` and classified by the CatBoost model stored
        in ``trained_model``.

        Arguments:
        img_path -- path of the image file (read with cv2.imread())
        Return:
        pred -- predicted name for given image
        probas -- probabilities for every class
        Raises:
        ValueError -- the image could not be read from ``img_path``
        IndexError -- no face was detected on either detection pass
        """

        detector_fa = dlib.get_frontal_face_detector()
        embedder = cv2.dnn.readNetFromTorch(self.embedderFile)
        predictor = dlib.shape_predictor(self.predictorFile)
        aligner = FaceAligner(predictor)

        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise ValueError("could not read image: {!r}".format(img_path))
        img = imutils.resize(img, width=600)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        #  make sure that face is detected or search again longer
        rects = detector_fa(gray, 3)
        if len(rects) == 0:
            rects = detector_fa(gray, 5)
        if len(rects) == 0:
            # Same exception type the bare ``[0]`` index used to raise, with context.
            raise IndexError("no face detected in image: {!r}".format(img_path))
        rect = rects[0]

        face_aligned = aligner.align(img, gray, rect)

        face_blob = cv2.dnn.blobFromImage(face_aligned,
                                          scalefactor=1. / 255,
                                          size=(96, 96),
                                          mean=(0, 0, 0),
                                          swapRB=True)

        embedder.setInput(face_blob)
        vec = embedder.forward()

        model = CatBoostClassifier()
        model.load_model('trained_model')
        pred = model.predict(vec)
        probas = model.predict_proba(vec)

        return pred, probas
Esempio n. 21
0
def segment(infile, outfile):
    """Segment the text in ``infile`` and write the result to ``outfile``.

    Features are extracted from ``infile``, the CatBoost model
    ``models/catboost_1.model`` predicts an integer label for each feature
    row, and the labels are grouped per (``line_no``, ``word_no``). The
    grouped labels and the original lines are turned into readable
    segmentations, which are written newline-separated.
    """
    classifier = CatBoostClassifier()
    print("before laod model")
    classifier.load_model("models/catboost_1.model")
    print("after laod model")

    features = extract_features(infile, feature_codes_1, return_style='dataframe')
    model_input = features[['chr_position'] + feature_codes_1]
    features['predictions'] = classifier.predict(model_input).astype(int)
    labels_per_word = features.groupby(['line_no', 'word_no'])['predictions'].apply(list)

    with open(infile, 'r') as source:
        lines = source.readlines()

    segmented = get_human_readable_segmentation(lines, labels_per_word)

    with open(outfile, 'w') as target:
        target.write('\n'.join(segmented))
Esempio n. 22
0
def depar():
    """Render the departure-delay prediction page for the submitted PNR.

    The PNR is read from the ``in`` form field and looked up in the
    ``passengers2`` collection of the ``flightdb`` database. For the first
    matching document the CatBoost model ``unisys_departure_delay`` predicts
    the delay class, a bar chart of mean ``target_departure`` per airline
    for the origin airport is rendered to a base64 JPEG, and the
    ``depdelay.html`` template is returned.

    The MongoDB client is closed when the function exits.
    """
    #order of attributes MONTH,AIRLINE,ORIGIN_AIRPORT,DEST_AIRPORT,SCHEDULED_DEPARTURE,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL
    unisys_dep_delay_model = CatBoostClassifier()
    unisys_dep_delay_model.load_model("unisys_departure_delay")
    # Context manager so the client is closed on return and on error.
    with pymongo.MongoClient("mongodb://localhost:27017/") as myclient:
        img = io.BytesIO()
        mydb = myclient["flightdb"]
        mycol = mydb["passengers2"]
        airport = pd.read_csv('airports.csv')
        pnr = request.form['in']
        df = pd.read_csv('sample.csv')
        myquery = {"PNR": pnr}
        mydocs = mycol.find(myquery)
        # Only the first matching document is used: the loop body always returns.
        for mydoc in mydocs:
            df_air = airport[(airport['IATA_CODE'] == mydoc['oair'])]
            lat = df_air['LATITUDE'].iloc[0]
            lan = df_air['LONGITUDE'].iloc[0]
            x = np.array([
                mydoc['month'], mydoc['airline'], mydoc['oair'], mydoc['dair'],
                mydoc['schdep'], mydoc['schtime'], mydoc['dist'], mydoc['scharr']
            ])
            preds_class = unisys_dep_delay_model.predict(x)
            if preds_class > 0:
                op = "No Delay in Departure"
                url = "https://cdn2.iconfinder.com/data/icons/yellow-smiles/1000/Smile-Icons-02_Converted-01-512.png"
                colr = "#007944"
            else:
                op = "Delay in Departure"
                url = "https://cdn0.iconfinder.com/data/icons/emoticons-round-smileys/137/Emoticons-14-512.png"
                colr = "#c81912"
            temp_df = df[df['ORIGIN_AIRPORT'] == mydoc['oair']]
            df_month1 = temp_df.groupby(['AIRLINE'])[['target_departure']].mean()
            df_month1.reset_index(inplace=True)
            plt.bar(df_month1['AIRLINE'], df_month1['target_departure'])
            plt.xlabel('AIRLINE')
            plt.ylabel('% of delay')
            plt.savefig(img, format='jpg')
            img.seek(0)
            plt.clf()
            purl = base64.b64encode(img.getvalue()).decode()
            return render_template('depdelay.html',
                                   res=op,
                                   iurl=url,
                                   col=colr,
                                   pltu=purl,
                                   ap=mydoc['oair'],
                                   sla=lat,
                                   slo=lan)
    # NOTE(review): when no document matches the PNR the function falls
    # through and returns None -- decide what response the caller should get.
Esempio n. 23
0
def main(args):
    """Score the LSTM + CatBoost pipeline and print its accuracy.

    The tabular data is trimmed by ``args.seq_len`` rows at the start and
    ``args.out_seq_len`` rows at the end. The LSTM checkpoint and the CatBoost
    model are loaded from ``args.checkpoint_path``. The LSTM output of the
    last batch yielded by the data module is merged into the tabular
    features by ``get_lstm_feature`` and the classifier's ``score`` on the
    result is printed as a percentage.
    """
    # get data
    features, targets = get_gbm_database(args.telemetry_path,
                                         args.maint_path,
                                         args.machines_path,
                                         args.errors_path,
                                         args.failures_path,
                                         seq_len=args.out_seq_len,
                                         machine_id=args.machine_id,
                                         )
    window = slice(args.seq_len, -args.out_seq_len)
    X_gbm = features.iloc[window]
    y_target = targets.iloc[window]
    n_rows = X_gbm.shape[0]

    data_module = TelemetryDataModule(path=args.telemetry_path,
                                      seq_len=args.seq_len,
                                      out_seq_len=args.out_seq_len,
                                      batch_size=n_rows,
                                      num_workers=args.num_workers,)
    data_module.setup(stage="prodaction")
    lstm_batches = data_module.prodaction_dataset()

    # load models
    lstm = LSTM.load_from_checkpoint(checkpoint_path=args.checkpoint_path + '/lstm.ckpt',
                                     n_features=args.n_features,
                                     hidden_size=args.hidden_size,
                                     seq_len=args.seq_len,
                                     out_seq_len=args.out_seq_len,
                                     batch_size=n_rows,
                                     criterion=args.criterion,
                                     num_layers=args.num_layers,
                                     dropout=args.dropout,
                                     learning_rate=args.learning_rate,
                                     )
    lstm.freeze()

    gbm = CatBoostClassifier()
    gbm.load_model(args.checkpoint_path + '/gbm.cbm')

    # prediction: only the output for the last batch is kept
    last_lstm_output = None
    for batch, _ in lstm_batches:
        last_lstm_output = lstm(batch)

    X_gbm = get_lstm_feature(X_gbm, last_lstm_output)

    accuracy = gbm.score(X_gbm, y_target)

    print('Model accuracy: {0:.2f}%'.format(accuracy*100))
Esempio n. 24
0
def catboost_predict_classes(
        data_path: InputPath('CSV'),
        model_path: InputPath('CatBoostModel'),
        predictions_path: OutputPath(),
        label_column: int = None,
):
    '''Predict classes using the CatBoost classifier model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Class predictions in text format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import tempfile

    from catboost import CatBoostClassifier, Pool
    import numpy

    # Compare against None, not truthiness: column index 0 is a valid label
    # column and must not be treated as "no label".
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # Write the column description through the temp file's own handle so
        # it is flushed and closed before the Pool reads it; the file is kept
        # on disk (delete=False).
        with tempfile.NamedTemporaryFile(
                'w', delete=False) as column_description_file:
            column_description_path = column_description_file.name
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier()
    model.load_model(model_path)

    predictions = model.predict(eval_data)
    numpy.savetxt(predictions_path, predictions, fmt='%s')
Esempio n. 25
0
class Model:
    """CatBoost classifier kept on disk next to its accumulated training data.

    The model file ``model.bkp`` and the data file ``old_data.xlsx`` live in
    the current working directory. Paths are built with ``os.path.join`` so
    they resolve on every platform (the same location as before on Windows).
    """

    def __init__(self):
        """Load the saved model and the previously accumulated data."""
        self.model = CatBoostClassifier()
        self.model.load_model(os.path.join(os.getcwd(), 'model.bkp'))
        self.old_data = pd.read_excel(os.path.join(os.getcwd(), 'old_data.xlsx'))

    def predict(self, data):
        """Return the current model's predictions for ``data``."""
        return self.model.predict(data)

    def retrain(self, data):
        """Append ``data`` to the stored data, refit the model and persist both.

        The identifier/date/helper columns are dropped, the remainder is
        split 70/30 into train and validation parts, and a new classifier is
        fitted with the validation part as ``eval_set``. The new model
        overwrites ``model.bkp`` and the combined data overwrites
        ``old_data.xlsx``; the feature matrix is also dumped to
        ``mmmm.xlsx``.
        """
        new_data = pd.concat([self.old_data, data], axis=0, ignore_index=True)
        self.old_data = new_data.drop(labels=[
            'ID (Идентификатор Заявки)', 'ID (Идентификатор Клиента)',
            'Дата заявки', 'Unnamed: 0', '  - count', '  - summ'
        ],
                                      axis='columns')
        X = self.old_data.drop(labels='Target (90 mob 12)', axis='columns')
        y = self.old_data['Target (90 mob 12)']
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7)
        X.to_excel(os.path.join(os.getcwd(), 'mmmm.xlsx'))
        # No feature is treated as categorical.
        categorical_features_indices = []
        print(categorical_features_indices, X)
        self.model = CatBoostClassifier(
            thread_count=2,
            iterations=50,
            depth=1,
            l2_leaf_reg=2,
            learning_rate=0.001,
            random_seed=62,
            od_type='Iter',
            od_wait=10,
            custom_loss=['F1', 'AUC'],
            auto_class_weights='Balanced',
            use_best_model=True,
        )

        self.model.fit(X_train,
                       y_train,
                       cat_features=categorical_features_indices,
                       eval_set=(X_val, y_val),
                       logging_level='Silent',
                       plot=True)
        self.model.save_model(os.path.join(os.getcwd(), 'model.bkp'))
        self.old_data.to_excel(os.path.join(os.getcwd(), 'old_data.xlsx'),
                               index_label=False)
Esempio n. 26
0
def run(exp_name, data_type):
    """Predict class probabilities for one test subset and save them as ``.npy``.

    ``data_type`` selects both the test rows and the label columns:

    * ``'ex_gal'``      -- rows with ``distmod`` present and ``hostgal_specz``
      missing; labels seen in training where the distmod mask is true.
    * ``'ex_gal_spec'`` -- rows with both ``distmod`` and ``hostgal_specz``
      present; same labels as ``'ex_gal'``.
    * ``'gal'``         -- rows with ``distmod`` missing; labels seen in
      training where the distmod mask is false.

    The model's probabilities are scattered into a 14-column zero matrix at
    the selected label positions and saved to
    ``../preds/<data_type>_pred_<exp_name>.npy``.
    """
    tes_m = feather.read_dataframe('../others/tes_m.feather')
    le = load_pickle('../others/label_encoder.pkl')
    y = le.transform(np.load('../others/train_target.npy'))
    distmod_mask = np.load('../others/distmod_mask.npy')
    ex_gal_labels = np.where(np.bincount(y[distmod_mask]) != 0)[0]
    gal_labels = np.where(np.bincount(y[~distmod_mask]) != 0)[0]

    specz_missing = tes_m['hostgal_specz'].isnull()
    distmod_missing = tes_m['distmod'].isnull()
    # data_type -> (boolean row selector, label columns to fill)
    selectors = {
        'ex_gal': ((specz_missing & ~distmod_missing).values, ex_gal_labels),
        'ex_gal_spec': ((~specz_missing & ~distmod_missing).values, ex_gal_labels),
        'gal': (distmod_missing.values, gal_labels),
    }

    fn_s = np.load('../fi/' + exp_name + '_fn_s_' + data_type + '.npy')
    fn_s = [el.replace('/', '_') for el in fn_s]
    X_test = load_arr(fn_s, 'test')

    model = CatBoostClassifier()
    model.load_model('../models/' + exp_name + '_' + data_type + '.cbm')

    if data_type not in selectors:
        # NOTE(review): ``Error`` is not defined in the visible code -- confirm
        # it is imported, otherwise this line raises NameError instead.
        raise Error

    rows, labels = selectors[data_type]
    real_test_data = FeaturesData(X_test.astype(np.float32)[rows])
    class_proba = model.predict_proba(real_test_data)
    full_pred = np.zeros((class_proba.shape[0], 14))
    full_pred[:, labels] = class_proba
    np.save('../preds/' + data_type + '_pred_' + exp_name + '.npy',
            full_pred)
    gc.collect()
Esempio n. 27
0
class CatBoostEnsembleModel:
    """Weighted blend of two pre-trained CatBoost classifiers."""

    def __init__(self):
        """Load both member models from their fixed paths and print the class list."""
        self.model1 = CatBoostClassifier()
        self.model2 = CatBoostClassifier()
        self.model1.load_model('lb23578/cat_154k_144_1000.model')
        self.model2.load_model('lb22592/cat_0820_500_143.model')
        print(self.model1.classes_)

    def predict_with_proba(self, X, w=(0.7, 0.3)):
        """Blend the two models' probabilities and pick the best class per row.

        Args:
            X: Features accepted by both models' ``predict_proba``.
            w: Two weights, for ``model1`` and ``model2`` respectively. Any
                indexable pair works; the default is an immutable tuple so
                it cannot be altered between calls.

        Returns:
            ``(labels, prob)``: ``prob`` is the weighted sum of the two
            probability matrices and ``labels`` holds, for every row, the
            entry of ``model1.classes_`` at the arg-max column of ``prob``.
        """
        p1 = self.model1.predict_proba(X)
        p2 = self.model2.predict_proba(X)
        prob = p1*w[0] + p2*w[1]
        idx = np.argmax(prob, axis=1)
        assert len(idx) == len(prob)
        # Column order is taken from model1; assumes model2 uses the same
        # class order -- TODO confirm.
        labels = np.array(self.model1.classes_)[idx]
        assert len(labels) == len(prob)

        return labels, prob
Esempio n. 28
0
    def predict_from_df_prod(self, df):
        """
        Production prediction code.

        Builds the hidden states for ``df``, places the three metadata
        columns (with a fresh 0..n-1 index) in front of them and returns the
        second column of the probabilities predicted by the CatBoost model
        ``models/catboost.production``.
        """
        hidden_states = self.create_hidden_states(df)

        # Metadata first, then the transformer output, side by side.
        metadata = df[['sc_id_cat', 'version_number', 'partisan_lean']].reset_index(drop=True)
        model_features = pd.concat([metadata, pd.DataFrame(hidden_states)], axis=1)

        classifier = CatBoostClassifier()
        classifier.load_model('models/catboost.production')
        return classifier.predict_proba(model_features)[:, 1]
Esempio n. 29
0
class CatBoost:
    """MultiClass CatBoost classifier bundled with its save location.

    The class attributes below are evaluated once, when the class is created.
    """
    # Value passed as ``verbose`` to ``fit``.
    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    _is_gpu_available = get_gpu_device_count()
    # Both settings are "GPU" when at least one GPU is reported, else None.
    _task_type = "GPU" if _is_gpu_available > 0 else None
    _devices = "GPU" if _is_gpu_available > 0 else None

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        """Create the classifier and make sure ``<model_save_path>/<model_id>`` exists.

        ``aux_params`` are applied with ``set_params``.
        ``num_input_features`` and ``num_output_classes`` are accepted but
        not used here.
        """
        classifier = CatBoostClassifier(loss_function="MultiClass",
                                        task_type=self._task_type,
                                        devices=self._devices,
                                        train_dir=self._train_dir,
                                        random_seed=SEED)
        classifier.set_params(**aux_params)
        self.model = classifier
        self.model_id = model_id

        model_dir = f"{model_save_path}/{model_id}"
        os.makedirs(model_dir, exist_ok=True)
        self.model_path = model_dir
        self.modelfile_save_path = os.path.join(model_dir, STANDARD_MODEL_NAME)

    def load(self):
        """Load the model from ``self.modelfile_save_path``."""
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        """Write the model to ``self.modelfile_save_path``."""
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Fit with the validation pair as ``eval_set``, then save the model."""
        train_pool = Pool(X_train, y_train)
        self.model.fit(train_pool,
                       eval_set=(X_valid, y_valid),
                       use_best_model=True,
                       verbose=self._verbose)
        self.save()

    def predict(self, X, load=False):
        """Return class probabilities for ``X``; reload the saved model first when ``load`` is true."""
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        """Compute feature importances on the training pool and plot them.

        ``classes`` is accepted but not used here.
        """
        train_pool = Pool(X_train, y_train)
        importances = self.model.get_feature_importance(data=train_pool)
        plot_importance(importances, features, self.model_path, self.model_id)
Esempio n. 30
0
def boost_scor(t_s, sample_subm, model_paths, sol_path, model_number):
    """Score ``t_s`` with model ``model_number`` and write the submission CSV.

    The column list, the CatBoost model and the categorical-feature list are
    loaded from ``<model_paths>/cols_<n>``, ``model_<n>`` and ``cats_<n>``.
    The second ``predict_proba`` column becomes ``score``, is merged into
    ``sample_subm`` on ``app_id`` (dropping ``product`` if present), renamed
    to ``flag`` and written to ``<sol_path>/sol_<n>.csv``. ``t_s`` itself is
    not modified.
    """
    scored = t_s.copy()

    feature_cols = joblib.load(f'{model_paths}/cols_{model_number}')
    classifier = CatBoostClassifier()
    classifier.load_model(f'{model_paths}/model_{model_number}')
    cat_features = joblib.load(f'{model_paths}/cats_{model_number}')

    pool = Pool(scored[feature_cols], cat_features=cat_features)
    scored['score'] = classifier.predict_proba(pool)[:, 1]
    scored = scored[['app_id', 'score']]

    submission = sample_subm.merge(scored, on=['app_id'])
    submission = submission.drop(['product'], axis=1, errors='ignore')
    submission = submission.rename(columns={'score': 'flag'})
    submission.to_csv(f'{sol_path}/sol_{model_number}.csv', index=False)
Esempio n. 31
0
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded.

        The cached value is whatever ``load_model`` returns for
        ``<model_path>/heart.cbm``.
        """
        if cls.model is not None:
            return cls.model

        tmp_model = CatBoostClassifier()
        heart_model_file = os.path.join(model_path, 'heart.cbm')
        cls.model = tmp_model.load_model(heart_model_file)

        # Loading ``cls.params`` from ``param_path`` (JSON) is currently disabled:
        # if cls.params is None:
        #     with open(param_path, 'r') as in_str:
        #         cls.params = json.loads(in_str.read())

        return cls.model