from feather import read_dataframe as read_feather
import numpy as np
import pandas as pd
import pdb

from zpylib import data_path as dp

# Module-level names used below (N_TRAIN, N_TEST, NGROUPS, types, VERSION_COLS,
# PREDCOLS, FeaturesByType, ...) are defined elsewhere in the original source.


def test_sample():
    ''' Generate a 500k random sample of the testing data '''
    filepath = dp("refactored/densified_test.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors)

    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TEST
    df['raw_data_index'] = N_TRAIN + np.arange(N_TEST)

    print("Uniformly sampling 500k rows from the testing data ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(500000)

    print("Featherizing sample ... ")
    df.iloc[idx_random].reset_index(drop=True).to_feather(
        dp("refactored/test_sample.feather"))


def split_train():
    ''' Split the training data into NGROUPS random shards '''
    filepath = dp("refactored/densified_train.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors + [types.response])
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TRAIN
    df['raw_data_index'] = range(N_TRAIN)

    print("Splitting into subgroups ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(N_TRAIN)
    idx_groups = np.array_split(idx_random, NGROUPS)

    for k, idxs in enumerate(idx_groups):
        sk = str(k)
        print("... featherizing group " + sk + " of " + str(NGROUPS))
        fname = dp("refactored/train_split_" + sk + ".feather")
        df.iloc[idxs].reset_index(drop=True).to_feather(fname)
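
# Quick check (added, not from the source): np.array_split yields near-equal
# shards even when the row count is not divisible by the group count, so every
# training row lands in exactly one train_split_*.feather file.
import numpy as np

toy_parts = np.array_split(np.random.permutation(10), 3)
print([len(p) for p in toy_parts])  # [4, 3, 3] -- shard sizes differ by at most one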


def make_metadata(col):
    ''' Build and save a per-column metadata table of distinct values, counts, and integer codes '''
    print("Beginning column " + col)
    newcol = 'new_' + col
    X = load_features(col)
    series = X[col]
    md = pd.DataFrame(series.value_counts(dropna=False))
    md['counts'] = md[col]
    md[col] = md.index
    if col in VERSION_COLS:
        idx_df = build_version_index(series)
        md = md.merge(idx_df, how='left')
        md.sort_values(newcol, inplace=True)
    else:
        md[newcol] = range(md.shape[0])
    md.to_csv(dp('metadata/' + col), index=False)
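
# Toy example (added) of the metadata table make_metadata writes for a
# non-version column: one row per distinct value (including NaN), with its
# count and a dense integer code assigned by descending frequency.
import pandas as pd

toy = pd.Series(['b', 'a', 'b', None, 'b', 'a'], name='toy')
toy_md = toy.value_counts(dropna=False).rename('counts').reset_index()
toy_md.columns = ['toy', 'counts']
toy_md['new_toy'] = range(toy_md.shape[0])
print(toy_md)  # b:3 -> code 0, a:2 -> code 1, NaN:1 -> code 2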


def refactor_col(infile, col):
    ''' Map one raw column to its densely encoded values using the saved metadata '''
    print(" ... " + col)
    series = read_feather(infile, columns=[col])[col]
    if col in PREDCOLS:
        metadata = pd.read_csv(dp("metadata/" + col)).drop('counts', axis=1)
        if col in FeaturesByType.categorical:
            df = pd.DataFrame({col: series})
            df['order'] = range(df.shape[0])
            df = df.merge(metadata, how='left').sort_values('order')
            newcol = 'new_' + col
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(
                np.int64
            ).values  # Lightgbm treats this as missing for categorical features
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            pdb.set_trace()
    else:
        values = series.values
    return values
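
# Toy illustration (added) of the categorical remapping above: raw values are
# merged against the metadata codes, and unseen values fall back to the code
# metadata.shape[0] -- one past the known range, which (per the comment above)
# LightGBM treats as missing for categorical features.
import numpy as np
import pandas as pd

toy_meta = pd.DataFrame({'color': ['red', 'blue'], 'new_color': [0, 1]})
toy_raw = pd.DataFrame({'color': ['blue', 'green', 'red']})  # 'green' is unseen
toy_raw['order'] = range(toy_raw.shape[0])
toy_out = toy_raw.merge(toy_meta, how='left').sort_values('order')
print(toy_out['new_color'].fillna(toy_meta.shape[0]).astype(np.int64).values)  # [1 2 0]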
Example #5
from feather import read_dataframe as read_feather
import numpy as np
import pandas as pd

from zpylib import data_path as dp


def densify(infile, outfile, groups, models=None):
    ''' Score each row's density under one model per column group; refit unless models are supplied '''
    # cols2density and select_group are defined earlier in the original script.
    print("Loading " + infile)
    df = read_feather(infile)
    if models is None:
        print("Fitting a density model on each column group ...")
        models = [cols2density(df, cols) for cols in groups]
    print("Scoring on all groups ...")
    scores_df = pd.DataFrame({
        'pydens_' + str(k): models[k].density(df[groups[k]])
        for k in range(len(groups))
    })
    print("Concatenating ...")
    new_df = pd.concat([df, scores_df], axis=1)
    print("Feathering ...")
    new_df.to_feather(outfile)
    return models


# Constants
np.random.seed(0)
ALLCOLS = pd.read_csv(dp('raw/train.csv'), nrows=2).columns.tolist()
DENSIFIABLE_COLS = [
    a for a in ALLCOLS
    if ((a != 'HasDetections') and (a != 'MachineIdentifier'))
]
N = 25  # number of features to densify per run
N_GROUPS = 10
GROUPS = [select_group(DENSIFIABLE_COLS, N).tolist() for k in range(N_GROUPS)]
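
# select_group is not shown in this listing; a plausible minimal sketch (an
# assumption, not the author's code) consistent with its usage above would
# draw N distinct column names at random:
#
#   def select_group(cols, n):
#       return np.random.choice(cols, size=n, replace=False)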

# Main
models = densify(infile=dp("refactored/train.feather"),
                 outfile=dp("refactored/densified_train.feather"),
                 groups=GROUPS)
_ = densify(infile=dp("refactored/test.feather"),
            outfile=dp("refactored/densified_test.feather"),
            groups=GROUPS,
            models=models)  # assumed final argument: reuse the train-fit density models on test

Example #6
from feather import read_dataframe as read_feather
import numpy as np
import os
import pandas as pd
import pdb
import pickle
from time import time

from zpylib import datatools
from zpylib.learn.gbm import Lgbm
from zpylib import stats
from zpylib import data_path as dp
from zpylib import model_path as mp

# Constants
TRAIN_PATHS = [
    dp('refactored/' + st) for st in os.listdir(dp('refactored'))
    if 'train_' in st
]
TRAIN_PATHS.sort()
TEST_PATH = dp('refactored/test.feather')
MODELS_PATH = mp('rf_3feather.pkl')


def read_response(f):
    col = 'HasDetections'
    return read_feather(f, columns=[col])[col].values


def multi_read_response(files):
    return np.concatenate([read_response(f) for f in files])
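

# Example usage (added): reassemble the full training response vector across
# the sorted shards; row order matches the concatenation of TRAIN_PATHS.
#
#   y = multi_read_response(TRAIN_PATHS)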
Example #7
from feather import read_dataframe as read_feather
import numpy as np
import os
import pandas as pd
import pdb
import pickle
from time import time

from zpylib import datatools
from zpylib.learn.gbm import Lgbm
from zpylib import stats
from zpylib import data_path as dp
from zpylib import model_path as mp

# Constants
TRAIN_PATHS = [
    dp('refactored/' + st) for st in os.listdir(dp('refactored'))
    if 'train_' in st
]
TRAIN_PATHS.sort()
TEST_PATH = dp('refactored/test.feather')
SAMPLE_PATH = dp('submit/sample_submission.csv')
SUBMIT_PATH = dp('submit/submission.csv')
MODELS_PATH = mp('lgb_3feather.pkl')

EXTRA_PREDICTORS = ['pydens_' + str(k) for k in range(10)]
# KEEPERS = [
#     'SmartScreen', 'Census_OEMModelIdentifier', 'AvSigVersion', 'Census_FirmwareVersionIdentifier', 'CityIdentifier',
#     'continuous_AVProductStatesIdentifier', 'AVProductStatesIdentifier', 'CountryIdentifier',
#     'Census_ProcessorModelIdentifier', 'EngineVersion', 'AppVersion', 'Census_TotalPhysicalRAM', 'Census_OSVersion',
#     'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Wdft_IsGamer', 'Census_OSInstallTypeName', 'OsBuildLab',
#     'LocaleEnglishNameIdentifier', 'DefaultBrowsersIdentifier', 'IeVerIdentifier', 'GeoNameIdentifier',
def load_features(col):
    return read_feather(dp('raw/train.feather'), columns=[col])


import pandas as pd

from zpylib import data_path as dp

def featherize(infile, outfile):
    print("read_csv-ing " + infile)
    df = pd.read_csv(infile, low_memory=False)
    print("writing " + outfile)
    df.to_feather(outfile)

featherize(dp('raw/train.csv'), dp('raw/train.feather'))
featherize(dp('raw/test.csv'), dp('raw/test.feather'))
Example #10

import pandas as pd
import pdb

from zpylib import datatools
from zpylib import data_path as dp
from zpylib import metadata  # assumed import path for build_refactored_metadata

# train_one_gbm is defined earlier in the original script and is not shown here.
# # Identify categoricals that also appear important as continuous features
# df = lgb1.importance()
# important_continuous = set(df[df.gain > 1000].feature.tolist())
# catcont = list(set(data.coltypes.categorical).intersection(important_continuous))
# # Of these, leave as categorical those with fewer than four distinct values
# mdf = metadata.build_refactored_metadata()
# mdf = mdf[mdf.is_categorical==1]
# mdf = mdf[mdf['nunique'] > 3]
# mycatcont = list(set(catcont).intersection(set(mdf.colname.tolist())))
# "['" + "', '".join(mycatcont) + "']"
# pd.read_csv(dp('metadata/AVProductStatesIdentifier'))


def filter_categoricals_on_nunique(cols, lb=4):
    ''' Keep only the categorical columns with at least `lb` distinct values '''
    mdf = metadata.build_refactored_metadata()
    mdf = mdf[mdf.is_categorical == 1]
    mdf = mdf[mdf['nunique'] >= lb]  # honor the lb parameter instead of hardcoding 3
    okcols = mdf.colname.tolist()
    return list(set(cols).intersection(set(okcols)))


dtypes = datatools.FeaturesByType()
df = datatools.read_feather(dp('refactored/train_split_0.feather'))
data = datatools.Data(df, select=dtypes.categorical)
lgb = train_one_gbm(data, cat=True)
top_categoricals = lgb.importance().head(10).feature.tolist()
filter_categoricals_on_nunique(top_categoricals)
pdb.set_trace()

pd.read_csv(dp('metadata/SmartScreen'))
Example #11
from feather import read_dataframe as read_feather
import numpy as np
import pdb

import pydens  # install from GitHub: https://github.com/zkurtz/pydens
import zpylib as zp
from zpylib import data_path as dp

target = 'HasDetections'
models_path = zp.model_path('lgb_3feather.pkl')
top_features = zp.model_loaders.which_top_features(models_path, N=100)
top_features_with_target = top_features + [target]

###############
## Load the data for the top features both for a sample of train and test
train_df = read_feather(dp("refactored/train_split_0.feather"),
                        columns=top_features_with_target).iloc[:500000]
test_df = read_feather(dp("refactored/test_sample.feather"),
                       columns=top_features)
train_data = zp.datatools.Data(train_df.drop(target, axis=1))
test_data = zp.datatools.Data(test_df)
cats = [
    f for f in train_data.X.columns if f in train_data.coltypes.categorical
]

###############
## Fit a density model
classifier = pydens.classifiers.lightgbm.Lgbm(categorical_features=cats,
                                              verbose=True)
num_dens_params = {'loner_min_count': 100, 'binning_params': {'max_bins': 20}}
cade = pydens.cade.Cade(classifier=classifier,
Example #12
from distutils.version import LooseVersion
import pandas as pd
import pdb

from zpylib import datatools
from zpylib import data_path as dp

cols = datatools.identify_version_features()  # + ['MachineIdentifier']  # see assertion below

print("Read version features in train.csv")
vftrain = pd.read_csv(dp("raw/train.csv"), usecols=cols + ['HasDetections'])
print("Read version features in test.csv")
vftest = pd.read_csv(dp("raw/test.csv"), usecols=cols)
#assert vftrain.shape[0] + vftest.shape[0] == pd.concat([vftrain.MachineIdentifier, vftest.MachineIdentifier]).nunique()
df = pd.concat([vftrain, vftest], ignore_index=True, sort=True)
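
# Added note: concatenating train and test before building the index ensures a
# single consistent integer code per version string across both sets, e.g.:
import pandas as pd

a = pd.Series(['1.0', '1.2'])
b = pd.Series(['1.1'])
print(sorted(pd.concat([a, b], ignore_index=True).unique()))  # one shared vocabulary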


def build_version_index(v):
    values = df[v].unique().tolist()
    values.sort(key=LooseVersion)
    return pd.DataFrame({v: values, 'idx': range(len(values))})
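
# Why LooseVersion rather than a plain string sort (added illustration):
# lexicographic order puts '1.0.10' before '1.0.2'; LooseVersion compares
# numeric components, giving the intended version ordering.
from distutils.version import LooseVersion

vs = ['1.0.9', '1.0.10', '1.0.2']
print(sorted(vs))                    # ['1.0.10', '1.0.2', '1.0.9']
print(sorted(vs, key=LooseVersion))  # ['1.0.2', '1.0.9', '1.0.10']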


vmeta = {v: build_version_index(v) for v in cols}


def order_correlation(v):
    print(v)
    lkp = vmeta[v]
    x = vftrain[[v]]


from feather import read_dataframe as read_feather
import numpy as np


def refactor_col(infile, col):
    print(" ... " + col)
    series = read_feather(infile, columns=[col])[col]
    if col in PREDCOLS:
        metadata = pd.read_csv(dp("metadata/" + col)).drop('counts', axis=1)
        if col in FeaturesByType.categorical:
            df = pd.DataFrame({col: series})
            df['order'] = range(df.shape[0])
            df = df.merge(metadata, how='left').sort_values('order')
            newcol = 'new_' + col
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(
                np.int64
            ).values  # Lightgbm treats this as missing for categorical features
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            pdb.set_trace()
    else:
        values = series.values
    return values


def refactor(infile, outfile, expected_columns):
    print("### Loading " + infile)
    df = pd.DataFrame(
        {col: refactor_col(infile, col)
         for col in expected_columns})
    df.to_feather(outfile)


# Constants
ALLCOLS = pd.read_csv(dp('raw/train.csv'), nrows=2).columns.tolist()
ALLCOLS_EXCEPT_RESPONSE = [a for a in ALLCOLS if a != 'HasDetections']
PREDCOLS = train_colnames()

# Main
refactor(dp("raw/train.feather"), dp("refactored/train.feather"), ALLCOLS)
refactor(dp("raw/test.feather"), dp("refactored/test.feather"),
         ALLCOLS_EXCEPT_RESPONSE)