def test_sample():
    ''' Generate a 500k random sample of the testing data '''
    filepath = dp("refactored/densified_test.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors)
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TEST
    df['raw_data_index'] = N_TRAIN + np.array(range(N_TEST))
    print("Uniformly sampling 500k rows from the testing data ... ")
    np.random.seed(0)
    # Permute all N_TEST row positions and keep the first 500k, so the
    # subsample is uniform over the whole table
    idx_random = np.random.permutation(N_TEST)[:500000]
    print("Featherizing sample ... ")
    df.iloc[idx_random].reset_index(drop=True).to_feather(
        dp("refactored/test_sample.feather"))
def split_train():
    filepath = dp("refactored/densified_train.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors + [types.response])
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TRAIN
    df['raw_data_index'] = range(N_TRAIN)
    print("Splitting into subgroups ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(N_TRAIN)
    idx_groups = np.array_split(idx_random, NGROUPS)
    for k, idxs in enumerate(idx_groups):
        sk = str(k)
        print("... featherizing group " + sk + " of " + str(NGROUPS))
        fname = dp("refactored/train_split_" + sk + ".feather")
        df.iloc[idxs].reset_index(drop=True).to_feather(fname)
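# Hedged sketch (not part of the original script): because each split keeps
# its raw_data_index, the full training table can be reassembled in the
# original row order from the split files.
#
#   parts = [read_feather(dp("refactored/train_split_" + str(k) + ".feather"))
#            for k in range(NGROUPS)]
#   full = pd.concat(parts).sort_values('raw_data_index').reset_index(drop=True)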
def make_metadata(col):
    print("Beginning column " + col)
    newcol = 'new_' + col
    X = load_features(col)
    series = X[col]
    # One row per distinct raw value, with its count and a dense integer code
    md = pd.DataFrame(series.value_counts(dropna=False))
    md['counts'] = md[col]
    md[col] = md.index
    if col in VERSION_COLS:
        idx_df = build_version_index(series)
        md = md.merge(idx_df, how='left')
        md.sort_values(newcol, inplace=True)
    else:
        md[newcol] = range(md.shape[0])
    md.to_csv(dp('metadata/' + col), index=False)
def refactor_col(infile, col):
    print(" ... " + col)
    series = read_feather(infile, columns=[col])[col]
    if col in PREDCOLS:
        metadata = pd.read_csv(dp("metadata/" + col)).drop('counts', axis=1)
        if col in FeaturesByType.categorical:
            df = pd.DataFrame({col: series})
            df['order'] = range(df.shape[0])
            df = df.merge(metadata, how='left').sort_values('order')
            newcol = 'new_' + col
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(
                np.int64
            ).values  # Lightgbm treats this as missing for categorical features
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            pdb.set_trace()
    else:
        values = series.values
    return values
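# Toy illustration (hypothetical values) of the merge-based recode above: the
# 'order' column pins the original row order through the merge-and-sort, and
# values unseen in the metadata recode to metadata.shape[0].
#
#   metadata = pd.DataFrame({'SmartScreen': ['Block', 'Warn'],
#                            'new_SmartScreen': [0, 1]})
#   df = pd.DataFrame({'SmartScreen': ['Warn', 'Off', 'Block']})
#   df['order'] = range(df.shape[0])
#   df = df.merge(metadata, how='left').sort_values('order')
#   df['new_SmartScreen'].fillna(metadata.shape[0]).astype(np.int64).values
#   # -> array([1, 2, 0]); 'Off' is unseen, so it gets the new code 2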
    models = [cols2density(df, cols) for cols in groups]
    print("Scoring on all groups ...")
    scores_df = pd.DataFrame({
        'pydens_' + str(k): models[k].density(df[groups[k]])
        for k in range(len(groups))
    })
    print("Concatenating ...")
    new_df = pd.concat([df, scores_df], axis=1)
    print("Feathering ...")
    new_df.to_feather(outfile)
    return models


# Constants
np.random.seed(0)
ALLCOLS = pd.read_csv(dp('raw/train.csv'), nrows=2).columns.tolist()
DENSIFIABLE_COLS = [
    a for a in ALLCOLS
    if ((a != 'HasDetections') and (a != 'MachineIdentifier'))
]
N = 25  # number of features to densify per run
N_GROUPS = 10
GROUPS = [select_group(DENSIFIABLE_COLS, N).tolist() for k in range(N_GROUPS)]

# Main
models = densify(infile=dp("refactored/train.feather"),
                 outfile=dp("refactored/densified_train.feather"),
                 groups=GROUPS)
_ = densify(infile=dp("refactored/test.feather"),
            outfile=dp("refactored/densified_test.feather"),
            groups=GROUPS,
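# select_group is referenced above but not defined in this excerpt. A minimal
# sketch, assuming it draws N distinct column names at random; it must return
# an array-like, since .tolist() is called on the result:
#
#   def select_group(cols, n):
#       return np.random.choice(cols, size=n, replace=False)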
from feather import read_dataframe as read_feather
import numpy as np
import os
import pandas as pd
import pdb
import pickle
from time import time

from zpylib import datatools
from zpylib.learn.gbm import Lgbm
from zpylib import stats
from zpylib import data_path as dp
from zpylib import model_path as mp

# Constants
TRAIN_PATHS = [
    dp('refactored/' + st) for st in os.listdir(dp('refactored'))
    if 'train_' in st
]
TRAIN_PATHS.sort()
TEST_PATH = dp('refactored/test.feather')
MODELS_PATH = mp('rf_3feather.pkl')


def read_response(f):
    col = 'HasDetections'
    return read_feather(f, columns=[col])[col].values


def multi_read_response(files):
    return np.concatenate([read_response(f) for f in files])
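# Example usage (assuming the split files from split_train are on disk):
#
#   y_train = multi_read_response(TRAIN_PATHS)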
import numpy as np
import os
import pandas as pd
import pdb
import pickle
from time import time

from zpylib import datatools
from zpylib.learn.gbm import Lgbm
from zpylib import stats
from zpylib import data_path as dp
from zpylib import model_path as mp

# Constants
TRAIN_PATHS = [
    dp('refactored/' + st) for st in os.listdir(dp('refactored'))
    if 'train_' in st
]
TRAIN_PATHS.sort()
TEST_PATH = dp('refactored/test.feather')
SAMPLE_PATH = dp('submit/sample_submission.csv')
SUBMIT_PATH = dp('submit/submission.csv')
MODELS_PATH = mp('lgb_3feather.pkl')
EXTRA_PREDICTORS = ['pydens_' + str(k) for k in range(10)]

# KEEPERS = [
#     'SmartScreen', 'Census_OEMModelIdentifier', 'AvSigVersion',
#     'Census_FirmwareVersionIdentifier', 'CityIdentifier',
#     'continuous_AVProductStatesIdentifier', 'AVProductStatesIdentifier',
#     'CountryIdentifier', 'Census_ProcessorModelIdentifier', 'EngineVersion',
#     'AppVersion', 'Census_TotalPhysicalRAM', 'Census_OSVersion',
#     'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Wdft_IsGamer',
#     'Census_OSInstallTypeName', 'OsBuildLab', 'LocaleEnglishNameIdentifier',
#     'DefaultBrowsersIdentifier', 'IeVerIdentifier', 'GeoNameIdentifier',
def load_features(col):
    return read_feather(dp('raw/train.feather'), columns=[col])
import pandas as pd

from zpylib import data_path as dp


def featherize(infile, outfile):
    print("read_csv-ing " + infile)
    df = pd.read_csv(infile, low_memory=False)
    print("writing " + outfile)
    df.to_feather(outfile)


featherize(dp('raw/train.csv'), dp('raw/train.feather'))
featherize(dp('raw/test.csv'), dp('raw/test.feather'))
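# Note: DataFrame.to_feather requires a default RangeIndex, which is why the
# sampling and splitting scripts call reset_index(drop=True) before
# featherizing. A hedged sanity check for the round trip:
#
#   orig = pd.read_csv(dp('raw/train.csv'), low_memory=False)
#   back = pd.read_feather(dp('raw/train.feather'))
#   assert back.shape == orig.shape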
# # Identify categoricals that also appear important as continuous features
# df = lgb1.importance()
# important_continuous = set(df[df.gain > 1000].feature.tolist())
# catcont = list(set(data.coltypes.categorical).intersection(important_continuous))
# # Of these, leave as categorical those with fewer than four distinct values
# mdf = metadata.build_refactored_metadata()
# mdf = mdf[mdf.is_categorical == 1]
# mdf = mdf[mdf['nunique'] > 3]
# mycatcont = list(set(catcont).intersection(set(mdf.colname.tolist())))
# "['" + "', '".join(mycatcont) + "']"
# pd.read_csv(dp('metadata/AVProductStatesIdentifier'))


def filter_categoricals_on_nunique(cols, lb=4):
    mdf = metadata.build_refactored_metadata()
    mdf = mdf[mdf.is_categorical == 1]
    mdf = mdf[mdf['nunique'] >= lb]
    okcols = mdf.colname.tolist()
    return list(set(cols).intersection(set(okcols)))


dtypes = datatools.FeaturesByType()
df = datatools.read_feather(dp('refactored/train_split_0.feather'))
data = datatools.Data(df, select=dtypes.categorical)
lgb = train_one_gbm(data, cat=True)
top_categoricals = lgb.importance().head(10).feature.tolist()
filter_categoricals_on_nunique(top_categoricals)
pdb.set_trace()
pd.read_csv(dp('metadata/SmartScreen'))
from feather import read_dataframe as read_feather
import numpy as np
import pdb
import pydens  # Install from github: https://github.com/zkurtz/pydens

import zpylib as zp
from zpylib import data_path as dp

target = 'HasDetections'
models_path = zp.model_path('lgb_3feather.pkl')
top_features = zp.model_loaders.which_top_features(models_path, N=100)
top_features_with_target = top_features + [target]

###############
## Load the data for the top features both for a sample of train and test

train_df = read_feather(dp("refactored/train_split_0.feather"),
                        columns=top_features_with_target).iloc[:500000]
test_df = read_feather(dp("refactored/test_sample.feather"),
                       columns=top_features)
train_data = zp.datatools.Data(train_df.drop(target, axis=1))
test_data = zp.datatools.Data(test_df)
cats = [
    f for f in train_data.X.columns if f in train_data.coltypes.categorical
]

###############
## Fit a density model

classifier = pydens.classifiers.lightgbm.Lgbm(categorical_features=cats,
                                              verbose=True)
num_dens_params = {'loner_min_count': 100, 'binning_params': {'max_bins': 20}}
cade = pydens.cade.Cade(classifier=classifier,
from distutils.version import LooseVersion
import pandas as pd
import pdb

from zpylib import datatools
from zpylib import data_path as dp

cols = datatools.identify_version_features()  # + ['MachineIdentifier'] # see assertion below ...
print("Read version features in train.csv")
vftrain = pd.read_csv(dp("raw/train.csv"), usecols=cols + ['HasDetections'])
print("Read version features in test.csv")
vftest = pd.read_csv(dp("raw/test.csv"), usecols=cols)
# assert vftrain.shape[0] + vftest.shape[0] == pd.concat(
#     [vftrain.MachineIdentifier, vftest.MachineIdentifier]).nunique()
df = pd.concat([vftrain, vftest], ignore_index=True, sort=True)


def build_version_index(v):
    values = df[v].unique().tolist()
    values.sort(key=LooseVersion)
    return pd.DataFrame({v: values, 'idx': range(len(values))})


vmeta = {v: build_version_index(v) for v in cols}


def order_correlation(v):
    print(v)
    lkp = vmeta[v]
    x = vftrain[[v]]
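# Why LooseVersion as the sort key: plain string sorting mis-orders version
# numbers, while LooseVersion compares them componentwise.
#
#   assert sorted(['1.2.10', '1.2.9']) == ['1.2.10', '1.2.9']
#   assert sorted(['1.2.10', '1.2.9'], key=LooseVersion) == ['1.2.9', '1.2.10']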
def refactor(infile, outfile, expected_columns):
    print("### Loading " + infile)
    df = pd.DataFrame(
        {col: refactor_col(infile, col) for col in expected_columns})
    df.to_feather(outfile)


# Constants
ALLCOLS = pd.read_csv(dp('raw/train.csv'), nrows=2).columns.tolist()
ALLCOLS_EXCEPT_RESPONSE = [a for a in ALLCOLS if a != 'HasDetections']
PREDCOLS = train_colnames()

# Main
refactor(dp("raw/train.feather"), dp("refactored/train.feather"), ALLCOLS)
refactor(dp("raw/test.feather"), dp("refactored/test.feather"),
         ALLCOLS_EXCEPT_RESPONSE)