Example #1
def initial_processing(df, mode):
    is_big = df.memory_usage().sum() > BIG_DATASET_SIZE

    with Profiler(' - features from datetime'):
        df, date_cols, orig_date_cols = transform_datetime_features(df)

    cat_cols = get_cat_freqs(df)

    numeric_cols = [c for c in df.columns if c.startswith('number')]

    with Profiler(' - reindex new cols'):
        used_cols = date_cols + list(cat_cols) + numeric_cols
        df = df.reindex(columns=used_cols)

    # if is_big:
    #     with Profiler(' - convert to float32'):
    #         df[numeric_cols] = df[numeric_cols].astype(np.float32)

    print(f' - Cat: {len(cat_cols)}, num: {len(numeric_cols)}, date: {len(date_cols)}, orig_dt: {len(orig_date_cols)}')
    print(f' - Used: {len(used_cols)}, memory: {get_mem(df)}')
    params = dict(
        cat_cols=cat_cols,
        numeric_cols=numeric_cols,
        date_cols=date_cols,
        used_cols=used_cols
    )
    return df, params
def load_data(filename, datatype='train', cfg=None):

    # avoid sharing a mutable default dict between calls
    model_config = cfg if cfg is not None else {}
    model_config['missing'] = True

    # read dataset
    df = pd.read_csv(filename, low_memory=False)
    if datatype == 'train':
        y = df.target
        df = df.drop('target', axis=1)
        if df.memory_usage().sum() > BIG_DATASET_SIZE:
            model_config['is_big'] = True
    else:
        y = None
    print('Dataset read, shape {}'.format(df.shape))

    # features from datetime
    df = transform_datetime_features(df)
    print('Transform datetime done, shape {}'.format(df.shape))

    # categorical encoding
    if datatype == 'train':
        df, categorical_values = transform_categorical_features(df)
        model_config['categorical_values'] = categorical_values
    else:
        df, categorical_values = transform_categorical_features(df, model_config['categorical_values'])
    print('Transform categorical done, shape {}'.format(df.shape))

    # drop constant features
    if datatype == 'train':
        constant_columns = [
            col_name
            for col_name in df.columns
            if df[col_name].nunique() == 1
            ]
        df.drop(constant_columns, axis=1, inplace=True)

    # filter columns
    if datatype == 'train':
        model_config['used_columns'] = [c for c in df.columns if check_column_name(c) or c in categorical_values]
    used_columns = model_config['used_columns']
    print('Used {} columns'.format(len(used_columns)))

    line_id = df[['line_id', ]]
    df = df[used_columns]

    # missing values
    if model_config['missing']:
        df.fillna(-1, inplace=True)

    return (df.values.astype(np.float16) if 'is_big' in model_config else df), y, model_config, line_id
Example #3
def preprocess(df, model_config, type='train'):
    """preprocessing and feature engineering for input data"""

    print('preprocess data..')

    # extract datetime features
    df = transform_datetime_features(df)
    print('datetime features extracted')

    # categorical count encoding
    if type == 'train':
        df, categorical_values = count_encoding(df)
        model_config['categorical_values'] = categorical_values
    elif type == 'test':
        df = count_encoding(df, model_config['categorical_values'])
    print('count encoding of categorical features added')

    # drop constant features
    if type == 'train':
        df = drop_const_cols(df)

    # scaling
    # if type == 'train':
    #     df, scaler_mean, scaler_std = std_scaler(df)
    #     model_config['scaler_mean'] = scaler_mean
    #     model_config['scaler_std'] = scaler_std
    # elif type=='test':
    #     df = model_config['scaler'].transform(df)

    # filter columns
    if type == 'train':
        df, used_columns = filter_columns(df, groups=['number', 'count'])
        model_config['used_columns'] = used_columns
    elif type == 'test':
        df_pred = df[['line_id']]
        df = df[model_config['used_columns']]

    # missing values
    df.fillna(-1, inplace=True)

    # convert if dataframe is too big
    # if model_config['is_big']:
    #     df = df.astype(np.float16)

    if type == 'train':
        return df, model_config
    else:
        return df, df_pred
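A minimal usage sketch for the `preprocess` helper above (hedged: the CSV paths are hypothetical, and `preprocess` together with `transform_datetime_features`, `count_encoding`, `drop_const_cols` and `filter_columns` is assumed to be importable from the example's module):

import pandas as pd

# hypothetical input files; the real paths come from the competition harness
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# train pass: fits count encoding and column filtering, stores them in model_config
model_config = {}
train_df, model_config = preprocess(train_df, model_config, type='train')

# test pass: reuses the stored categorical_values / used_columns and returns the
# line_id frame that predictions are later attached to
test_df, df_pred = preprocess(test_df, model_config, type='test')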
Example #4
    args = parser.parse_args()

    start_time = time.time()

    # load model
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # read dataset
    df = pd.read_csv(args.test_csv)
    print('Dataset read, shape {}'.format(df.shape))

    if not model_config['is_big']:
        # features from datetime
        df = transform_datetime_features(df)

        # categorical encoding
        for col_name, unique_values in model_config[
                'categorical_values'].items():
            for unique_value in unique_values:
                df['onehot_{}={}'.format(
                    col_name,
                    unique_value)] = (df[col_name] == unique_value).astype(int)

    # missing values
    if model_config['missing']:
        df.fillna(-1, inplace=True)
    elif df.isnull().values.any():
        df.fillna(value=df.mean(axis=0), inplace=True)
Example #5
    args = parser.parse_args()

    start_time = time.time()

    df = pd.read_csv(args.train_csv)
    df_y = df.target
    df_X = df.drop('target', axis=1)

    print('Dataset read, shape {}'.format(df_X.shape))

    # dict with data necessary to make predictions
    model_config = {}

    # features from datetime
    df_X = transform_datetime_features(df_X)

    # missing values
    if df_X.isnull().values.any():
        model_config['missing'] = True
        df_X.fillna(-1, inplace=True)

    # categorical encoding
    categorical_values = {}
    for col_name in list(df_X.columns):
        col_unique_values = df_X[col_name].unique()
        if 2 < len(col_unique_values) <= ONEHOT_MAX_UNIQUE_VALUES:
            categorical_values[col_name] = col_unique_values
            for unique_value in col_unique_values:
                df_X['onehot_{}={}'.format(col_name, unique_value)] = (
                    df_X[col_name] == unique_value).astype(int)
Example #6
def main(args):
    start_time = time.time()

    # load model
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    mode = model_config['mode']
    # read dataset
    df = pd.read_csv(args.test_csv)
    print('Test Dataset read, shape {}'.format(df.shape))

    if not model_config['is_big']:
        ##
        # features from datetime
        transform_datetime_features(df)

        ##
        # categorical encoding
        for col_name, unique_values in model_config[
                'categorical_values'].items():
            for unique_value in unique_values:
                df['onehot_{}={}'.format(
                    col_name,
                    unique_value)] = (df[col_name] == unique_value).astype(int)
    else:
        ##
        # features from datetime
        transform_datetime_features(df)
        ##
        codestring_columns = model_config['codestring_columns']
        for col_name in codestring_columns:
            l = df[col_name].dropna().str.len().unique()[0]
            for i in range(l):
                df['string_{}_{}'.format(col_name, i)] = df[col_name].str[i]
    ##
    # missing values
    if model_config['missing']:
        df.fillna(-1, inplace=True)
    elif df.isnull().values.any():
        df.fillna(value=df.mean(axis=0), inplace=True)

    number_columns = model_config['number_columns']
    datetime_columns = model_config['datetime_columns']
    id_columns = model_config['id_columns']

    ##
    if len(id_columns) > 0 and len(
            datetime_columns) > 0 and mode == MODE_REGRESSION:
        # check_3
        def f_trans(x):
            for cn in number_columns:
                x['{}_s{}'.format(cn, -1)] = x[cn].shift(-1).fillna(0)

            return x

        df = df[id_columns + ['line_id'] +
                number_columns].groupby(id_columns).apply(f_trans)

    ##
    if 3 <= len(datetime_columns) <= 10:
        # check_4
        print('Add delta datetime columns')
        #for cn in datetime_columns:
        #    df[cn] = pd.to_datetime(df[cn])
        import itertools
        for c1, c2 in list(itertools.combinations(datetime_columns, 2)):
            df['number_{}_{}'.format(c1, c2)] = (df[c1] - df[c2]).dt.days

    # filter columns
    used_columns = model_config['used_columns']

    # scale
    #X_scaled = model_config['scaler'].transform(df[used_columns])
    X_scaled = df[used_columns]

    model = model_config['model']

    df['prediction'] = model.predict(X_scaled)

    # if mode == MODE_REGRESSION:
    #     df['prediction'] = model.predict(X_scaled)
    # elif mode == MODE_CLASSIFICATION:
    #     if hasattr(model, 'predict_proba'):
    #         df['prediction'] = model.predict_proba(X_scaled)[:, 1]
    #     else:
    #         df['prediction'] = model._predict_proba_lr(X_scaled)[:, 1]
    # else:
    #     raise Exception('Invalid mode {}'.format(mode))

    df[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))
    return 0
Example #7
df = pd.read_csv(train_csv)
print('Dataset read, shape {}'.format(df.shape))
print('Dataset memory usage {:.3} MB'.format(df.memory_usage().sum() / 1024 /
                                             1024))
df_test = pd.read_csv(test_csv)
print('Test Dataset read, shape {}'.format(df_test.shape))
y_true = pd.read_csv(target_csv)

df_test = pd.merge(df_test, y_true, on='line_id')

df_y = df.target
#df_X = df.drop('target', axis=1)
df_X = df.copy()

df_X = transform_datetime_features(df_X)
df_test = transform_datetime_features(df_test)

df_X_u = df_X[df_X['id_0'] == 102].copy()
df_test_u = df_test[df_test['id_0'] == 102].copy()

# one-hot for day of week, month and quarter
dropped_columns = ['number_{}'.format(i) for i in range(23)]
# drop uninformative columns
df_X_u.drop(dropped_columns, axis=1, inplace=True)

# number_22, 24, 25: shifted back by 3 days, i.e. the main column is number_25
# number_26: weekday or working day
df_X_u.drop('number_26', axis=1, inplace=True)
# number_32-38: one-hot every 5 days, starting from the 1st
dropped_columns = ['number_{}'.format(i) for i in range(32, 38 + 1)]
Example #8
    ONEHOT_MAX_UNIQUE_VALUES = model_config['ONEHOT_MAX_UNIQUE_VALUES']
    # read dataset
    df = pd.read_csv(args.test_csv)
    print('Dataset read, shape {}'.format(df.shape))

    # drop train constant values
    df.drop(model_config['constant_columns'], axis=1, inplace=True)
    # rename c_, d_, r_
    df = utils.add_prefix_to_colnames(df, ONEHOT_MAX_UNIQUE_VALUES)
    # missing values
    _, df = utils.replace_na_and_create_na_feature(df,
                                                   model_config['na_features'])

    if not model_config['is_big']:
        # features from datetime
        df = utils.transform_datetime_features(df)

        # categorical onehot encoding
        df = utils.onehot_encoding_test(df,
                                        model_config['categorical_to_onehot'])

        # real number feature extraction

    # filter columns
    used_columns = model_config['used_columns']

    # scale
    X_scaled = model_config['scaler'].transform(df[used_columns])

    model = model_config['model']
    if model_config['mode'] == 'regression':
Example #9
def preprocess(df, model_config, y=None, type='train', count_enc=True,
               likelihood_enc=False, scaling=False,
               use_groups=['number', 'count', 'likelihood']):
    """preprocessing and feature engineering for input data"""

    print('preprocess data..')

    # extract datetime features
    df = transform_datetime_features(df)
    print('datetime features extracted')

    # categorical count encoding
    if count_enc:
        if type == 'train':
            df, categorical_values = count_encoding(df)
            model_config['categorical_values'] = categorical_values
        elif type == 'test':
            df = count_encoding(df, model_config['categorical_values'])
        print('count encoding of categorical features added')

    # mean target encoding
    if likelihood_enc:
        if type == 'train':
            cat_cols = df.columns[df.columns.str.startswith('string')].tolist()
            df, categorical_values, global_avg = likelihood_encoding(df.join(y), cat_cols)
            model_config['mean_target'] = categorical_values
            model_config['global_avg'] = global_avg
            model_config['cat_cols'] = cat_cols
        elif type == 'test':
            df = likelihood_encoding(df, model_config['cat_cols'],
                                     categorical_values=model_config['mean_target'],
                                     global_avg=model_config['global_avg'])
        print('mean target encoding of categorical features added')

    # drop constant features
    if type == 'train':
        df = drop_const_cols(df)

    # filter columns
    if type == 'train':
        df, used_columns = filter_columns(df, groups=use_groups)
        model_config['used_columns'] = used_columns
    elif type == 'test':
        df_pred = df[['line_id']]
        df = df[model_config['used_columns']]

    # scaling
    if scaling:
        if type == 'train':
            df, scaler_mean, scaler_std = std_scaler(df)
            model_config['scaler_mean'] = scaler_mean
            model_config['scaler_std'] = scaler_std
        elif type == 'test':
            df = std_scaler(df, model_config['scaler_mean'], model_config['scaler_std'])


    if type == 'train':
        return df, model_config
    else:
        return df, df_pred
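A hedged sketch of calling this extended `preprocess` with mean target encoding instead of count encoding (assumptions: `train_y` is a named pandas Series aligned with `train_df`'s index so that `df.join(y)` works, and the 'likelihood' prefix produced by `likelihood_encoding` matches the group name passed to `filter_columns`; all variable names here are hypothetical):

# train pass: stores per-category target means, the global average and used_columns
model_config = {}
train_df, model_config = preprocess(train_df, model_config, y=train_y,
                                    type='train', count_enc=False,
                                    likelihood_enc=True,
                                    use_groups=['number', 'likelihood'])

# test pass: applies the stored means / global average, then keeps used_columns
test_df, df_pred = preprocess(test_df, model_config, type='test',
                              count_enc=False, likelihood_enc=True)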
Example #10
def main(args):
    start_time = time.time()

    df = pd.read_csv(args.train_csv)
    df_y = df.target
    df_X = df.drop('target', axis=1)
    is_big = df_X.memory_usage().sum() > BIG_DATASET_SIZE

    # dict with data necessary to make predictions
    model_config = {}
    model_config['categorical_values'] = {}
    model_config['is_big'] = is_big

    print('Dataset read, shape {}'.format(df_X.shape))

    ##
    # drop constant features
    constant_columns = [
        col_name for col_name in df_X.columns if df_X[col_name].nunique() == 1
    ]
    df_X.drop(constant_columns, axis=1, inplace=True)

    if is_big:

        reduce_mem_usage(df_X)
        ##
        # features from datetime
        transform_datetime_features(df_X)
        #
        # check which string columns contain fixed-length encoded strings (one-hot-like codes)
        string_columns = [
            col_name for col_name in df_X.columns
            if col_name.startswith('string')
        ]

        codestring_columns = [
            col_name for col_name in string_columns
            if df_X[col_name].dropna().str.len().unique().shape[0] == 1
        ]
        model_config['codestring_columns'] = codestring_columns
        for col_name in codestring_columns:
            l = df_X[col_name].dropna().str.len().unique()[0]
            for i in range(l):
                df_X['string_{}_{}'.format(col_name,
                                           i)] = df_X[col_name].str[i]
        ##
        # missing values
        if df_X.isnull().values.any():
            model_config['missing'] = True
            df_X.fillna(-1, inplace=True)

        # new_feature_count = min(df_X.shape[1],
        #                         int(df_X.shape[1] / (df_X.memory_usage().sum() / BIG_DATASET_SIZE)))
        # # take only high correlated features
        # correlations = np.abs([np.corrcoef(df_y, df_X[col_name])[0, 1]
        #                        for col_name in df_X.columns if col_name.startswith('number')
        #                        ])
        # new_columns = df_X.columns[np.argsort(correlations)[-new_feature_count:]]
        # df_X = df_X[new_columns]

    else:
        ##
        # features from datetime
        transform_datetime_features(df_X)

        #
        # categorical encoding
        categorical_values = {}
        for col_name in list(df_X.columns):
            col_unique_values = df_X[col_name].unique()
            if 2 < len(col_unique_values) <= ONEHOT_MAX_UNIQUE_VALUES:
                categorical_values[col_name] = col_unique_values
                for unique_value in col_unique_values:
                    df_X['onehot_{}={}'.format(col_name, unique_value)] = (
                        df_X[col_name] == unique_value).astype(int)
        model_config['categorical_values'] = categorical_values

        #
        # missing values
        if df_X.isnull().values.any():
            model_config['missing'] = True
            df_X.fillna(-1, inplace=True)

    #
    number_columns = [
        col_name for col_name in df_X.columns if col_name.startswith('number')
    ]
    model_config['number_columns'] = number_columns
    print('number_columns: {}'.format(number_columns))

    #
    id_columns = [
        col_name for col_name in df_X.columns if col_name.startswith('id')
    ]
    model_config['id_columns'] = id_columns
    print('id_columns: {}'.format(id_columns))

    #
    datetime_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('datetime')
    ]
    model_config['datetime_columns'] = datetime_columns
    print('datetime_columns: {}'.format(datetime_columns))

    ##
    # noise columns (numeric columns with almost all-unique values)
    def f_noise_columns(df, val):
        u = df.shape[0]
        return [
            col_name for col_name in df.columns
            if df[col_name].unique().shape[0] / u >= val
        ]

    number_columns = [
        col_name for col_name in df_X.columns if col_name.startswith('number')
    ]
    noise_columns = f_noise_columns(df_X[number_columns], 0.95)
    model_config['noise_columns'] = noise_columns
    print('noise_columns: {}'.format(noise_columns))
    df_X.drop(noise_columns, axis=1, inplace=True)

    ##
    if len(id_columns) > 0 and len(
            datetime_columns) > 0 and args.mode == MODE_REGRESSION:
        # # check_3
        def f_trans(x):
            for cn in number_columns:
                x['{}_s{}'.format(cn, -1)] = x[cn].shift(-1).fillna(0)

            return x

        df_X = df_X[id_columns + ['line_id'] +
                    number_columns].groupby(id_columns).apply(f_trans)

    ##
    if 3 <= len(datetime_columns) <= 10:
        # check_4
        print('Add delta datetime columns')
        # for cn in datetime_columns:
        #    df_X[cn] = pd.to_datetime(df_X[cn])
        import itertools
        for c1, c2 in list(itertools.combinations(datetime_columns, 2)):
            df_X['number_{}_{}'.format(c1, c2)] = (df_X[c1] - df_X[c2]).dt.days

    # use only numeric columns
    used_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('number') or col_name.startswith('onehot')
    ]
    model_config['used_columns'] = used_columns

    X_train = df_X[used_columns].values
    y_train = df_y.values
    # scaling
    # scaler = StandardScaler(copy=False)
    # df_X = scaler.fit_transform(df_X)
    # model_config['scaler'] = scaler

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective':
        'regression' if args.mode == MODE_REGRESSION else 'binary',
        # 'objective': 'binary',
        'metric': 'rmse',
        "learning_rate": 0.01,
        "num_leaves": 200,
        "feature_fraction": 0.70,
        "bagging_fraction": 0.70,
        'bagging_freq': 4,
        "max_depth": -1,
        "verbosity": -1,
        "reg_alpha": 0.3,
        "reg_lambda": 0.1,
        # "min_split_gain":0.2,
        "min_child_weight": 10,
        'zero_as_missing': True,
        'num_threads': 4,
    }

    model = lgb.train(params, lgb.Dataset(X_train, label=y_train), 600)

    # fitting
    model_config['mode'] = args.mode
    # if args.mode == MODE_REGRESSION:
    #     # model selection
    #     # check_2
    #     Scores = list()
    #     for model in [Ridge(), LGBMRegressor(n_estimators=70)]:
    #         model.fit(X_train, y_train)
    #         kfold = KFold(n_splits=3, shuffle=True, random_state=0)
    #         score = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=1, scoring='neg_mean_squared_error',
    #                                 verbose=0)
    #
    #         print('X {} y {} score: {} mean: {}'.format(X_train.shape, y_train.shape, score.round(2), score.mean()))
    #         Scores.append((abs(score.mean()), model))
    #     Scores.sort(key=lambda k: k[0])
    #
    #     model = Scores[0][1]
    #     print(Scores)
    #
    # else:
    #     # model = RidgeClassifier()
    #     # model = LGBMClassifier(n_estimators=70)
    #     # model.fit(X_train, y_train)
    #
    #     Scores = list()
    #     for model in [RidgeClassifier(), LGBMClassifier(n_estimators=70)]:
    #         model.fit(X_train, y_train)
    #         kfold = KFold(n_splits=3, shuffle=True, random_state=0)
    #         score = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=1, scoring='roc_auc',
    #                                 verbose=0)
    #
    #         print('X {} y {} score: {} mean: {}'.format(X_train.shape, y_train.shape, score.round(2), score.mean()))
    #         Scores.append((abs(score.mean()), model))
    #     Scores.sort(key=lambda k: k[0], reverse=True)
    #
    #     model = Scores[0][1]
    #     print(Scores)

    model_config['model'] = model

    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))