def data_from_feather(file, threshold=5):
    print("reading feather " + file)
    t0 = time()
    df = read_feather(file)
    tdiff = round(time() - t0)
    if tdiff > threshold:
        print(" ... that took " + str(tdiff) + " seconds")
    return Data(df)
def data_from_feather(file, threshold=5):
    print("reading feather " + file)
    t0 = time()
    df = read_feather(file)
    tdiff = round(time() - t0)
    if tdiff > threshold:
        print(" ... that took " + str(tdiff) + " seconds")
    return datatools.Data(
        df,
        expand_categorical_as_continuous=False
        #select=KEEPERS
    )
def densify(infile, outfile, groups, models=None):
    print("### Loading " + infile)
    df = read_feather(infile)
    if models is None:
        models = [cols2density(df, cols) for cols in groups]
    print("Scoring on all groups ...")
    scores_df = pd.DataFrame({
        'pydens_' + str(k): models[k].density(df[groups[k]])
        for k in range(len(groups))
    })
    print("Concatenating ...")
    new_df = pd.concat([df, scores_df], axis=1)
    print("Feathering ...")
    new_df.to_feather(outfile)
    return models
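# Usage sketch for densify (the file names and column groups below are
# hypothetical, not taken from this repo): fit one density model per column
# group on the training file, then pass the returned models back in so the
# test file is scored with the same models and gets comparable 'pydens_*'
# columns:
#
#   groups = [['AvSigVersion', 'CountryIdentifier'], ['Census_OSBuildNumber']]
#   models = densify(dp("train.feather"), dp("densified_train.feather"), groups)
#   densify(dp("test.feather"), dp("densified_test.feather"), groups, models=models)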
def test_sample():
    ''' Generate a 500k random sample of the testing data '''
    filepath = dp("refactored/densified_test.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors)
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TEST
    df['raw_data_index'] = N_TRAIN + np.array(range(N_TEST))
    print("Uniformly sampling 500k rows from the testing data ... ")
    np.random.seed(0)
    # Permute all N_TEST row indices and keep the first 500k; permuting only
    # range(500000) would merely shuffle the first 500k rows rather than
    # sample uniformly from the full test set
    idx_random = np.random.permutation(N_TEST)[:500000]
    print("Featherizing sample ... ")
    df.iloc[idx_random].reset_index(drop=True).to_feather(
        dp("refactored/test_sample.feather"))
def split_train():
    filepath = dp("refactored/densified_train.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors + [types.response])
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TRAIN
    df['raw_data_index'] = range(N_TRAIN)
    print("Splitting into subgroups ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(N_TRAIN)
    idx_groups = np.array_split(idx_random, NGROUPS)
    for k, idxs in enumerate(idx_groups):
        sk = str(k)
        print("... featherizing group " + sk + " of " + str(NGROUPS))
        fname = dp("refactored/train_split_" + sk + ".feather")
        df.iloc[idxs].reset_index(drop=True).to_feather(fname)
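# Self-contained toy illustration (made-up sizes, not repo constants) of the
# reproducible split pattern used in split_train: seed numpy, permute the row
# indices, and let np.array_split cut them into nearly equal groups that
# exactly partition the rows.
def _demo_split(n_rows=10, n_groups=3):
    np.random.seed(0)
    idx_groups = np.array_split(np.random.permutation(n_rows), n_groups)
    assert len(idx_groups) == n_groups
    # Every row index appears exactly once across the groups
    assert sorted(np.concatenate(idx_groups)) == list(range(n_rows))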
def refactor_col(infile, col):
    print(" ... " + col)
    series = read_feather(infile, columns=[col])[col]
    if col in PREDCOLS:
        metadata = pd.read_csv(dp("metadata/" + col)).drop('counts', axis=1)
        if col in FeaturesByType.categorical:
            df = pd.DataFrame({col: series})
            # Merging can reorder rows, so record the original order first
            df['order'] = range(df.shape[0])
            df = df.merge(metadata, how='left').sort_values('order')
            newcol = 'new_' + col
            # Unseen categories get the next integer code, metadata.shape[0];
            # LightGBM treats this as missing for categorical features
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(np.int64).values
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            # Unexpected dtype: drop into the debugger
            pdb.set_trace()
    else:
        values = series.values
    return values
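# Runnable toy illustration (hypothetical data, not repo files) of the
# merge-then-sort trick in refactor_col: a left merge on the categorical
# column can reorder rows, so the 'order' column restores the original row
# order, and fillna maps unseen categories to the new code metadata.shape[0].
def _demo_refactor_merge():
    series = pd.Series(['a', 'c', 'b', 'z'])  # 'z' is absent from metadata
    metadata = pd.DataFrame({'col': ['a', 'b', 'c'], 'new_col': [0, 1, 2]})
    df = pd.DataFrame({'col': series})
    df['order'] = range(df.shape[0])
    df = df.merge(metadata, how='left').sort_values('order')
    values = df['new_col'].fillna(metadata.shape[0]).astype(np.int64).values
    assert list(values) == [0, 2, 1, 3]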
def read_response(f):
    col = 'HasDetections'
    return read_feather(f, columns=[col])[col].values
def load_features(col):
    return read_feather(dp('raw/train.feather'), columns=[col])
from feather import read_dataframe as read_feather
import numpy as np
import pdb
import pydens  # Install from github: https://github.com/zkurtz/pydens
import zpylib as zp
from zpylib import data_path as dp

target = 'HasDetections'
models_path = zp.model_path('lgb_3feather.pkl')
top_features = zp.model_loaders.which_top_features(models_path, N=100)
top_features_with_target = top_features + [target]

###############
## Load the data for the top features both for a sample of train and test

train_df = read_feather(dp("refactored/train_split_0.feather"),
                        columns=top_features_with_target).iloc[:500000]
test_df = read_feather(dp("refactored/test_sample.feather"),
                       columns=top_features)
train_data = zp.datatools.Data(train_df.drop(target, axis=1))
test_data = zp.datatools.Data(test_df)
cats = [f for f in train_data.X.columns
        if f in train_data.coltypes.categorical]

###############
## Fit a density model

classifier = pydens.classifiers.lightgbm.Lgbm(
    categorical_features=cats, verbose=True)
num_dens_params = {'loner_min_count': 100, 'binning_params': {'max_bins': 20}}
cade = pydens.cade.Cade(
    classifier=classifier,
    initial_density=pydens.models.JointDensity(