Beispiel #1
0
def predict_chunk(df_, clfs_, meta_, features, train_mean):
    """Featurize one chunk of data and return fold-averaged class probabilities.

    Args:
        df_: Raw rows of the chunk; forwarded to ``featurize``.
        clfs_: Non-empty sequence of fitted classifiers exposing
            ``predict_proba`` and ``classes_``. Column names are taken from
            ``clfs_[0].classes_``.
        meta_: Metadata forwarded to ``featurize`` together with ``df_``.
        features: Column labels selected from the featurized frame and fed to
            every classifier.
        train_mean: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A DataFrame with one ``class_<label>`` column per classifier class,
        an ``object_id`` column and a ``class_99`` column.

    Raises:
        ValueError: If ``clfs_`` is empty.
    """
    if len(clfs_) == 0:
        raise ValueError('clfs_ must contain at least one classifier')

    # process all features
    full_test = featurize(df_, meta_)
    full_test.fillna(0, inplace=True)

    # Make predictions: sum the per-classifier probabilities, then average.
    model_input = full_test[features]
    preds_ = None
    for clf in clfs_:
        proba = clf.predict_proba(model_input)
        preds_ = proba if preds_ is None else preds_ + proba
    preds_ = preds_ / len(clfs_)

    # Compute preds_99 as the proba of class not being any of the others
    # preds_99 = 0.1 gives 1.769
    preds_99 = np.prod(1 - preds_, axis=1)

    # Create DataFrame from predictions
    preds_df_ = pd.DataFrame(
        preds_, columns=['class_{}'.format(s) for s in clfs_[0].classes_])
    # Assign positionally: preds_df_ has a fresh 0..n-1 index, so assigning the
    # Series itself would align on index labels and could yield NaN ids when
    # the featurized frame carries a different index.
    preds_df_['object_id'] = full_test['object_id'].values
    # Rescale so that the mean class_99 value over this chunk is 0.14.
    preds_df_['class_99'] = 0.14 * preds_99 / np.mean(preds_99)
    return preds_df_
def main(argc, argv):
    """Train separate galactic / extragalactic models and write predictions.

    Reads the training set and its metadata from ``../input``, trains one
    group of classifiers per subset (rows with ``hostgal_photoz == 0`` versus
    the rest), runs ``process_test`` to produce a submission CSV, then writes
    a second CSV with the rows averaged per ``object_id``.

    Args:
        argc: Not used.
        argv: Not used.
    """
    meta_train = process_meta('../input/training_set_metadata.csv')
    full_train = featurize(pd.read_csv('../input/training_set.csv'),
                           meta_train)

    # Taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    # Every class starts at weight 1; classes 64 and 15 are bumped to 2 in the
    # extragalactic table.
    galactic_classes_weights = dict.fromkeys(galactic_classes, 1)
    extragalactic_classes_weights = dict.fromkeys(extragalactic_classes, 1)
    for boosted in (64, 15):
        extragalactic_classes_weights[boosted] = 2

    full_train.fillna(0, inplace=True)
    is_galactic = full_train['hostgal_photoz'].eq(0)

    clfs_gal, score_gal = train_model(
        full_train[is_galactic],
        galactic_classes, galactic_classes_weights, 'gal')
    clfs_ext, score_ext = train_model(
        full_train[~is_galactic],
        extragalactic_classes, extragalactic_classes_weights, 'ext')

    # Submission name embeds both scores and a minute-resolution timestamp.
    stamp = datetime.now().strftime('%Y-%m-%d-%H-%M')
    filename = 'subm_{:.6f}_{:.6f}_{}.csv'.format(score_gal, score_ext, stamp)
    print('save to {}'.format(filename))

    # TEST
    # Strip the label and the non-feature columns so that what remains in
    # full_train.columns is the feature list handed to process_test.
    if 'target' in full_train:
        full_train.pop('target')

    if 'object_id' in full_train:
        for column in ('object_id', 'hostgal_specz',
                       'ra', 'decl', 'gal_l', 'gal_b', 'ddf'):
            del full_train[column]

    process_test(clfs_gal, clfs_ext,
                 features=full_train.columns,
                 filename=filename,
                 chunks=5000000)

    # Collapse the written predictions to one averaged row per object_id.
    submission = pd.read_csv(filename)
    print("Shape BEFORE grouping: {}".format(submission.shape))
    per_object = submission.groupby('object_id').mean()
    print("Shape AFTER grouping: {}".format(per_object.shape))
    per_object.to_csv('single_{}'.format(filename), index=True)
Beispiel #3
0
def main(argc, argv):
    """Train a cross-validated LightGBM model and write test-set predictions.

    Reads the training set and its metadata from ``../input``, trains via
    ``lgbm_modeling_cross_validation``, runs ``process_test`` to produce a
    submission CSV, then writes a second CSV with the rows averaged per
    ``object_id``.

    Args:
        argc: Not used.
        argv: Not used.

    Raises:
        KeyError: If the featurized training frame lacks the ``target`` or
            ``object_id`` column.
    """
    meta_train = process_meta('../input/training_set_metadata.csv')

    train = pd.read_csv('../input/training_set.csv')
    full_train = featurize(train, meta_train)

    # Both columns are used unconditionally below (labels for training, ids
    # for the CV routine). Fail here with a clear message rather than with an
    # UnboundLocalError further down.
    missing = [c for c in ('target', 'object_id') if c not in full_train]
    if missing:
        raise KeyError(
            'featurized training data lacks required column(s): {}'.format(
                missing))

    y = full_train.pop('target')

    classes = sorted(y.unique())
    # Taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    class_weights = {c: 1 for c in classes}
    class_weights.update({c: 2 for c in [64, 15]})
    print('Unique classes : {}, {}'.format(len(classes), classes))
    print(class_weights)
    # sanity check: classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    # sanity check: class_weights = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2,
    #                                65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}

    # Remove the id and the columns that are not used as features; whatever
    # remains in full_train.columns is the feature list passed on below.
    object_id = full_train.pop('object_id')
    for column in ('hostgal_specz', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf'):
        del full_train[column]

    # Column means are taken before the NaNs are replaced with 0.
    train_mean = full_train.mean(axis=0)
    # train_mean.to_hdf('train_data.hdf5', 'data')
    pd.set_option('display.max_rows', 500)
    print(full_train.describe().T)
    full_train.fillna(0, inplace=True)

    eval_func = partial(
        lgbm_modeling_cross_validation,
        full_train=full_train,
        y=y,
        classes=classes,
        class_weights=class_weights,
        id=object_id,
        nr_fold=5,
        random_state=1,
    )

    # NOTE: lgbm_params is not defined in this function; this updates that
    # shared dict in place.
    lgbm_params.update({'n_estimators': 1000})

    # modeling from CV
    clfs, score = eval_func(lgbm_params)

    filename = 'subm_{:.6f}_{}.csv'.format(
        score,
        datetime.now().strftime('%Y-%m-%d-%H-%M'))
    print('save to {}'.format(filename))

    # TEST
    process_test(clfs,
                 features=full_train.columns,
                 train_mean=train_mean,
                 filename=filename,
                 chunks=5000000)

    # Collapse the written predictions to one averaged row per object_id.
    z = pd.read_csv(filename)
    print("Shape BEFORE grouping: {}".format(z.shape))
    z = z.groupby('object_id').mean()
    print("Shape AFTER grouping: {}".format(z.shape))
    z.to_csv('single_{}'.format(filename), index=True)
Beispiel #4
0
def _predict_group(test_, clfs_, features, other_classes, class_99_scale):
    """Build the prediction frame for one subset, or None if it has no rows."""
    if test_.empty:
        return None
    if len(clfs_) == 0:
        raise ValueError('no classifiers supplied for a non-empty subset')

    # Sum the per-classifier probabilities, then average.
    model_input = test_[features]
    probas = None
    for clf in clfs_:
        proba = clf.predict_proba(model_input)
        probas = proba if probas is None else probas + proba
    probas = probas / len(clfs_)

    # Product over classes of (1 - p): proba of being none of the known ones.
    preds_99 = np.prod(1 - probas, axis=1)

    # Create DataFrame from predictions
    preds = pd.DataFrame(
        probas, columns=['class_{}'.format(s) for s in clfs_[0].classes_])
    assert preds.shape[0] == test_.shape[0], \
        'len of preds={}, test={}'.format(preds.shape[0], test_.shape[0])
    # Positional assignment: preds has a fresh 0..n-1 index while test_ keeps
    # the labels of the rows selected from the featurized frame.
    preds['object_id'] = test_['object_id'].values
    # Classes handled by the other subset get probability 0; each insert goes
    # to position 0, so they end up at the front in reverse order.
    for column in ['class_{}'.format(s) for s in other_classes]:
        preds.insert(0, column, 0.0)
    # Rescale so that the mean class_99 value over this subset equals the
    # requested scale.
    preds['class_99'] = class_99_scale * preds_99 / np.mean(preds_99)
    return preds


def predict_chunk(df_, clfs_gal_, clfs_ext_, meta_, features):
    """Featurize one chunk and predict it with the galactic / extragalactic models.

    Rows whose ``hostgal_photoz`` is 0 (after NaNs are filled with 0) are
    scored by ``clfs_gal_``; all other rows by ``clfs_ext_``. Each subset gets
    zero-filled columns for the other subset's classes and its own
    ``class_99`` column (mean 0.017 for galactic, 0.17 for extragalactic).

    Args:
        df_: Raw rows of the chunk; forwarded to ``featurize``.
        clfs_gal_: Fitted classifiers (``predict_proba`` / ``classes_``) for
            the galactic subset.
        clfs_ext_: Fitted classifiers for the extragalactic subset.
        meta_: Metadata forwarded to ``featurize`` together with ``df_``.
        features: Column labels selected from the featurized frame.

    Returns:
        A DataFrame with the galactic rows first, then the extragalactic rows,
        re-indexed from 0. Column order is the same whether or not the chunk
        contains galactic rows.

    Raises:
        ValueError: If a non-empty subset has no classifiers, or (from
            ``pd.concat``) if the featurized chunk has no rows at all.
    """
    # process all features
    full_test = featurize(df_, meta_)
    full_test.fillna(0, inplace=True)

    galactic_cut = full_test['hostgal_photoz'] == 0

    # Make predictions
    preds_gal = _predict_group(full_test[galactic_cut], clfs_gal_, features,
                               extragalactic_classes, 0.017)
    preds_ext = _predict_group(full_test[~galactic_cut], clfs_ext_, features,
                               galactic_classes, 0.17)

    # pd.concat drops a None entry silently and raises if both are None.
    preds_df_ = pd.concat([preds_gal, preds_ext],
                          ignore_index=True,
                          sort=False)

    if preds_gal is None and len(clfs_gal_) > 0:
        # Without galactic rows the result would follow the extragalactic
        # frame's column order. Reorder to the layout produced when galactic
        # rows are present, so the order does not vary between chunks.
        # NOTE(review): matters if the caller appends chunks positionally,
        # e.g. header-less CSV appends - caller not visible here.
        layout = ['class_{}'.format(s)
                  for s in reversed(list(extragalactic_classes))]
        layout += ['class_{}'.format(s) for s in clfs_gal_[0].classes_]
        layout += ['object_id', 'class_99']
        ordered = [c for c in layout if c in preds_df_.columns]
        ordered += [c for c in preds_df_.columns if c not in ordered]
        preds_df_ = preds_df_[ordered]

    return preds_df_