Example #1
def data_from_feather(file, threshold=5):
    print("reading feather " + file)
    t0 = time()
    df = read_feather(file)
    tdiff = round(time() - t0)
    if tdiff > threshold:
        print(" ... that took " + str(tdiff) + " seconds")
    return Data(df)
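For context, a minimal sketch of the surroundings this snippet assumes; the import path of the Data wrapper is a guess based on Example #9, and the file name is illustrative:

from time import time

from feather import read_dataframe as read_feather
from zpylib.datatools import Data  # assumed location of the Data wrapper, per Example #9

data = data_from_feather("refactored/train_split_0.feather", threshold=10)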
Example #2
def data_from_feather(file, threshold=5):
    print("reading feather " + file)
    t0 = time()
    df = read_feather(file)
    tdiff = round(time() - t0)
    if tdiff > threshold:
        print(" ... that took " + str(tdiff) + " seconds")
    return datatools.Data(
        df,
        expand_categorical_as_continuous=False  # select=KEEPERS
    )
Example #3
def densify(infile, outfile, groups, models=None):
    print("### Loading " + infile)
    df = read_feather(infile)
    if models is None:
        models = [cols2density(df, cols) for cols in groups]
    print("Scoring on all groups ...")
    scores_df = pd.DataFrame({
        'pydens_' + str(k): models[k].density(df[groups[k]])
        for k in range(len(groups))
    })
    print("Concatenating ...")
    new_df = pd.concat([df, scores_df], axis=1)
    print("Feathering ...")
    new_df.to_feather(outfile)
    return models
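A hedged usage sketch for densify: fit the density models once on the training file, then pass the returned models back in to score the test file the same way. Group and file names here are illustrative, not from the source:

groups = [['col_a', 'col_b'], ['col_c']]  # illustrative column groupings
models = densify('train.feather', 'densified_train.feather', groups)
densify('test.feather', 'densified_test.feather', groups, models=models)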
Example #4
def test_sample():
    ''' Generate a random sample of 500k rows from the testing data '''
    filepath = dp("refactored/densified_test.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors)

    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TEST
    df['raw_data_index'] = N_TRAIN + np.arange(N_TEST)

    print("Uniformly sampling 500k rows from the testing data ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(500000)

    print("Featherizing sample ... ")
    df.iloc[idx_random].reset_index(drop=True).to_feather(
        dp("refactored/test_sample.feather"))
Example #5
def split_train():
    ''' Split the training data into NGROUPS random subsets '''
    filepath = dp("refactored/densified_train.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors + [types.response])
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TRAIN
    df['raw_data_index'] = range(N_TRAIN)

    print("Splitting into subgroups ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(N_TRAIN)
    idx_groups = np.array_split(idx_random, NGROUPS)

    for k, idxs in enumerate(idx_groups):
        sk = str(k)
        print("... featherizing group " + sk + " of " + str(NGROUPS))
        fname = dp("refactored/train_split_" + sk + ".feather")
        df.iloc[idxs].reset_index(drop=True).to_feather(fname)
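As a side note on the splitting step, np.array_split tolerates group counts that do not divide the row count evenly, so no padding logic is needed; a tiny self-contained check:

import numpy as np

np.random.seed(0)
parts = np.array_split(np.random.permutation(10), 3)
print([len(p) for p in parts])  # [4, 3, 3]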
Example #6
def refactor_col(infile, col):
    print(" ... " + col)
    series = read_feather(infile, columns=[col])[col]
    if col in PREDCOLS:
        metadata = pd.read_csv(dp("metadata/" + col)).drop('counts', axis=1)
        if col in FeaturesByType.categorical:
            df = pd.DataFrame({col: series})
            df['order'] = range(df.shape[0])
            df = df.merge(metadata, how='left').sort_values('order')
            newcol = 'new_' + col
            # Fill unmatched levels with a new out-of-vocabulary code;
            # lightgbm treats this as missing for categorical features
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(np.int64).values
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            # Unexpected metadata dtype; pause in the debugger to inspect
            pdb.set_trace()
    else:
        values = series.values
    return values
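The categorical branch above recodes string levels to integer codes by merging against a metadata lookup table. A toy, self-contained sketch of that pattern with made-up names:

import pandas as pd

metadata = pd.DataFrame({'colour': ['red', 'blue'], 'new_colour': [0, 1]})
df = pd.DataFrame({'colour': ['blue', 'red', 'green']})
df['order'] = range(df.shape[0])
df = df.merge(metadata, how='left').sort_values('order')
codes = df['new_colour'].fillna(metadata.shape[0]).astype('int64')
print(codes.values)  # [1 0 2]; 'green' falls through to the fill code 2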
Example #7
def read_response(f):
    col = 'HasDetections'
    return read_feather(f, columns=[col])[col].values
Example #8
def load_features(col):
    return read_feather(dp('raw/train.feather'), columns=[col])
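Both helpers lean on feather's column projection: passing columns=[col] reads only that column from disk rather than the whole table. A hedged usage sketch; the feature name is illustrative:

y = read_response(dp('raw/train.feather'))  # response vector for 'HasDetections'
x = load_features('SomeFeatureName')        # single-column DataFrame; name is made up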
Example #9
from feather import read_dataframe as read_feather
import numpy as np
import pdb

import pydens  # install from GitHub: https://github.com/zkurtz/pydens
import zpylib as zp
from zpylib import data_path as dp

target = 'HasDetections'
models_path = zp.model_path('lgb_3feather.pkl')
top_features = zp.model_loaders.which_top_features(models_path, N=100)
top_features_with_target = top_features + [target]

###############
## Load the top-feature columns for both a training sample and the test sample
train_df = read_feather(dp("refactored/train_split_0.feather"),
                        columns=top_features_with_target).iloc[:500000]
test_df = read_feather(dp("refactored/test_sample.feather"),
                       columns=top_features)
train_data = zp.datatools.Data(train_df.drop(target, axis=1))
test_data = zp.datatools.Data(test_df)
cats = [
    f for f in train_data.X.columns if f in train_data.coltypes.categorical
]

###############
## Fit a density model
classifier = pydens.classifiers.lightgbm.Lgbm(categorical_features=cats,
                                              verbose=True)
num_dens_params = {'loner_min_count': 100, 'binning_params': {'max_bins': 20}}
cade = pydens.cade.Cade(classifier=classifier,
                        initial_density=pydens.models.JointDensity(