def initial_processing(df, mode):
    is_big = df.memory_usage().sum() > BIG_DATASET_SIZE

    with Profiler(' - features from datetime'):
        df, date_cols, orig_date_cols = transform_datetime_features(df)

    cat_cols = get_cat_freqs(df)
    numeric_cols = [c for c in df.columns if c.startswith('number')]

    with Profiler(' - reindex new cols'):
        used_cols = date_cols + list(cat_cols) + numeric_cols
        df = df.reindex(columns=used_cols)

    # if is_big:
    #     with Profiler(' - convert to float32'):
    #         df[numeric_cols] = df[numeric_cols].astype(np.float32)

    print(f' - Cat: {len(cat_cols)}, num: {len(numeric_cols)}, date: {len(date_cols)}, orig_dt: {len(orig_date_cols)}')
    print(f' - Used: {len(used_cols)}, memory: {get_mem(df)}')

    params = dict(
        cat_cols=cat_cols,
        numeric_cols=numeric_cols,
        date_cols=date_cols,
        used_cols=used_cols,
    )
    return df, params
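# `Profiler` and `get_mem` are used above but not defined in these snippets.
# A minimal sketch of what they could look like (assumed implementations,
# reconstructed from their call sites, not taken from the source):
import time
from contextlib import contextmanager

@contextmanager
def Profiler(label):
    # context manager that prints the wall-clock time spent inside the block
    start = time.time()
    yield
    print('{}: {:.2f}s'.format(label, time.time() - start))

def get_mem(df):
    # human-readable memory footprint of a DataFrame
    return '{:.2f} MB'.format(df.memory_usage().sum() / 1024 / 1024)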
def load_data(filename, datatype='train', cfg=None):
    # avoid a mutable default argument: a shared dict would leak state between calls
    model_config = cfg if cfg is not None else {}
    model_config['missing'] = True

    # read dataset
    df = pd.read_csv(filename, low_memory=False)
    if datatype == 'train':
        y = df.target
        df = df.drop('target', axis=1)
        if df.memory_usage().sum() > BIG_DATASET_SIZE:
            model_config['is_big'] = True
    else:
        y = None
    print('Dataset read, shape {}'.format(df.shape))

    # features from datetime
    df = transform_datetime_features(df)
    print('Transform datetime done, shape {}'.format(df.shape))

    # categorical encoding
    if datatype == 'train':
        df, categorical_values = transform_categorical_features(df)
        model_config['categorical_values'] = categorical_values
    else:
        df, categorical_values = transform_categorical_features(df, model_config['categorical_values'])
    print('Transform categorical done, shape {}'.format(df.shape))

    # drop constant features
    if datatype == 'train':
        constant_columns = [
            col_name for col_name in df.columns
            if df[col_name].nunique() == 1
        ]
        df.drop(constant_columns, axis=1, inplace=True)

    # filter columns
    if datatype == 'train':
        model_config['used_columns'] = [c for c in df.columns if check_column_name(c) or c in categorical_values]
    used_columns = model_config['used_columns']
    print('Used {} columns'.format(len(used_columns)))

    line_id = df[['line_id']]
    df = df[used_columns]

    # missing values
    if model_config['missing']:
        df.fillna(-1, inplace=True)

    # parentheses make the precedence explicit: only the first element is conditional
    return (df.values.astype(np.float16) if 'is_big' in model_config else df), y, model_config, line_id
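# `check_column_name` is referenced above but not shown. A plausible sketch,
# assuming it simply whitelists the engineered feature prefixes used throughout
# these snippets (the exact prefix list is an assumption):
def check_column_name(name):
    # keep numeric, one-hot and datetime-derived features; drop ids and raw strings
    return name.startswith(('number', 'onehot', 'dt_'))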
def preprocess(df, model_config, type='train'):
    """Preprocessing and feature engineering for input data."""
    print('preprocess data..')

    # extract datetime features
    df = transform_datetime_features(df)
    print('datetime features extracted')

    # categorical count encoding
    if type == 'train':
        df, categorical_values = count_encoding(df)
        model_config['categorical_values'] = categorical_values
    elif type == 'test':
        df = count_encoding(df, model_config['categorical_values'])
    print('count encoding of categorical features added')

    # drop constant features
    if type == 'train':
        df = drop_const_cols(df)

    # scaling
    # if type == 'train':
    #     df, scaler_mean, scaler_std = std_scaler(df)
    #     model_config['scaler_mean'] = scaler_mean
    #     model_config['scaler_std'] = scaler_std
    # elif type == 'test':
    #     df = std_scaler(df, model_config['scaler_mean'], model_config['scaler_std'])

    # filter columns
    if type == 'train':
        df, used_columns = filter_columns(df, groups=['number', 'count'])
        model_config['used_columns'] = used_columns
    elif type == 'test':
        df_pred = df[['line_id']]
        df = df[model_config['used_columns']]

    # missing values
    df.fillna(-1, inplace=True)

    # convert if dataframe is too big
    # if model_config['is_big']:
    #     df = df.astype(np.float16)

    if type == 'train':
        return df, model_config
    else:
        return df, df_pred
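# `count_encoding` is referenced above but not defined in these snippets.
# A minimal sketch, assuming it replaces each string category with its
# frequency learned on the training set:
def count_encoding(df, categorical_values=None):
    string_cols = [c for c in df.columns if c.startswith('string')]
    if categorical_values is None:
        # train: learn value counts and return them for reuse at test time
        categorical_values = {c: df[c].value_counts().to_dict() for c in string_cols}
        for c in string_cols:
            df['count_{}'.format(c)] = df[c].map(categorical_values[c])
        return df, categorical_values
    # test: apply the counts learned on train; unseen values become NaN
    for c in string_cols:
        df['count_{}'.format(c)] = df[c].map(categorical_values.get(c, {}))
    return df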
args = parser.parse_args()

start_time = time.time()

# load model
model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
with open(model_config_filename, 'rb') as fin:
    model_config = pickle.load(fin)

# read dataset
df = pd.read_csv(args.test_csv)
print('Dataset read, shape {}'.format(df.shape))

if not model_config['is_big']:
    # features from datetime
    df = transform_datetime_features(df)

    # categorical encoding
    for col_name, unique_values in model_config['categorical_values'].items():
        for unique_value in unique_values:
            df['onehot_{}={}'.format(col_name, unique_value)] = (
                df[col_name] == unique_value).astype(int)

# missing values
if model_config['missing']:
    df.fillna(-1, inplace=True)
elif df.isnull().values.any():
    # `any(df.isnull())` would only iterate over column names; test the values instead
    df.fillna(value=df.mean(axis=0), inplace=True)
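# `transform_datetime_features` is used by every snippet but never shown.
# A sketch in the spirit of the public SDSJ AutoML baseline (an assumption,
# not the exact source; one snippet above also unpacks date-column lists from
# its return, while this sketch matches the single-return call sites):
def transform_datetime_features(df):
    datetime_columns = [c for c in df.columns if c.startswith('datetime')]
    for col_name in datetime_columns:
        df[col_name] = pd.to_datetime(df[col_name])
        # expand each datetime column into simple numeric calendar features
        df['number_weekday_{}'.format(col_name)] = df[col_name].dt.weekday
        df['number_month_{}'.format(col_name)] = df[col_name].dt.month
        df['number_day_{}'.format(col_name)] = df[col_name].dt.day
        df['number_hour_{}'.format(col_name)] = df[col_name].dt.hour
    return df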
parser.add_argument('--train-csv', required=True)  # earlier parser setup truncated in this snippet
args = parser.parse_args()

start_time = time.time()

df = pd.read_csv(args.train_csv)
df_y = df.target
df_X = df.drop('target', axis=1)
print('Dataset read, shape {}'.format(df_X.shape))

# dict with data necessary to make predictions
model_config = {}

# features from datetime
df_X = transform_datetime_features(df_X)

# missing values
if df_X.isnull().values.any():
    model_config['missing'] = True
    df_X.fillna(-1, inplace=True)

# categorical encoding
categorical_values = {}
for col_name in list(df_X.columns):
    col_unique_values = df_X[col_name].unique()
    if 2 < len(col_unique_values) <= ONEHOT_MAX_UNIQUE_VALUES:
        categorical_values[col_name] = col_unique_values
        for unique_value in col_unique_values:
            df_X['onehot_{}={}'.format(col_name, unique_value)] = (
                df_X[col_name] == unique_value).astype(int)
def main(args):
    start_time = time.time()

    # load model
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    mode = model_config['mode']

    # read dataset
    df = pd.read_csv(args.test_csv)
    print('Test Dataset read, shape {}'.format(df.shape))

    if not model_config['is_big']:
        # features from datetime (this variant relies on in-place modification)
        transform_datetime_features(df)

        # categorical encoding
        for col_name, unique_values in model_config['categorical_values'].items():
            for unique_value in unique_values:
                df['onehot_{}={}'.format(col_name, unique_value)] = (
                    df[col_name] == unique_value).astype(int)
    else:
        # features from datetime
        transform_datetime_features(df)

        # expand fixed-length code strings into one feature per character
        codestring_columns = model_config['codestring_columns']
        for col_name in codestring_columns:
            str_len = df[col_name].dropna().str.len().unique()[0]
            for i in range(str_len):
                df['string_{}_{}'.format(col_name, i)] = df[col_name].str[i]

    # missing values
    if model_config['missing']:
        df.fillna(-1, inplace=True)
    elif df.isnull().values.any():
        df.fillna(value=df.mean(axis=0), inplace=True)

    number_columns = model_config['number_columns']
    datetime_columns = model_config['datetime_columns']
    id_columns = model_config['id_columns']

    if len(id_columns) > 0 and len(datetime_columns) > 0 and mode == MODE_REGRESSION:
        # check_3: add a one-step-ahead shift of each numeric column within every id group
        def f_trans(x):
            for cn in number_columns:
                x['{}_s{}'.format(cn, -1)] = x[cn].shift(-1).fillna(0)
            return x

        df = df[id_columns + ['line_id'] + number_columns].groupby(id_columns).apply(f_trans)

    if 3 <= len(datetime_columns) <= 10:
        # check_4: pairwise day differences between datetime columns
        print('Add delta datetime columns')
        # for cn in datetime_columns:
        #     df[cn] = pd.to_datetime(df[cn])
        import itertools
        for c1, c2 in itertools.combinations(datetime_columns, 2):
            df['number_{}_{}'.format(c1, c2)] = (df[c1] - df[c2]).dt.days

    # filter columns
    used_columns = model_config['used_columns']

    # scale
    # X_scaled = model_config['scaler'].transform(df[used_columns])
    X_scaled = df[used_columns]

    model = model_config['model']
    df['prediction'] = model.predict(X_scaled)
    # if mode == MODE_REGRESSION:
    #     df['prediction'] = model.predict(X_scaled)
    # elif mode == MODE_CLASSIFICATION:
    #     if hasattr(model, 'predict_proba'):
    #         df['prediction'] = model.predict_proba(X_scaled)[:, 1]
    #     else:
    #         df['prediction'] = model._predict_proba_lr(X_scaled)[:, 1]
    # else:
    #     raise Exception('Invalid mode {}'.format(mode))

    df[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))
    return 0
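# The argparse setup for main() is not shown. A minimal sketch matching the
# attributes actually used above (args.test_csv, args.prediction_csv,
# args.model_dir); the flag names are assumptions derived from those attributes:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--test-csv', required=True)
    parser.add_argument('--prediction-csv', required=True)
    parser.add_argument('--model-dir', required=True)
    main(parser.parse_args())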
df = pd.read_csv(train_csv)
print('Dataset read, shape {}'.format(df.shape))
print('Dataset memory usage {:.3} MB'.format(df.memory_usage().sum() / 1024 / 1024))

df_test = pd.read_csv(test_csv)
print('Test Dataset read, shape {}'.format(df_test.shape))

y_true = pd.read_csv(target_csv)
df_test = pd.merge(df_test, y_true, on='line_id')

df_y = df.target
# df_X = df.drop('target', axis=1)
df_X = df.copy()

df_X = transform_datetime_features(df_X)
df_test = transform_datetime_features(df_test)

df_X_u = df_X[df_X['id_0'] == 102].copy()
df_test_u = df_test[df_test['id_0'] == 102].copy()

# onehot for day of week, month and quarter
dropped_columns = ['number_{}'.format(i) for i in range(23)]
# drop the uninformative columns
df_X_u.drop(dropped_columns, axis=1, inplace=True)

# number_22, 24, 25 are shifted 3 days back, i.e. number_25 is the main column
# number_26 is a weekday / working-day flag
df_X_u.drop('number_26', axis=1, inplace=True)

# number_32-38: onehot for every 5 days, starting from the 1st of the month
dropped_columns = ['number_{}'.format(i) for i in range(32, 38 + 1)]
ONEHOT_MAX_UNIQUE_VALUES = model_config['ONEHOT_MAX_UNIQUE_VALUES']

# read dataset
df = pd.read_csv(args.test_csv)
print('Dataset read, shape {}'.format(df.shape))

# drop train constant values
df.drop(model_config['constant_columns'], axis=1, inplace=True)

# rename c_, d_, r_
df = utils.add_prefix_to_colnames(df, ONEHOT_MAX_UNIQUE_VALUES)

# missing values
_, df = utils.replace_na_and_create_na_feature(df, model_config['na_features'])

if not model_config['is_big']:
    # features from datetime
    df = utils.transform_datetime_features(df)

    # categorical onehot encoding
    df = utils.onehot_encoding_test(df, model_config['categorical_to_onehot'])

    # real number feature extraction

# filter columns
used_columns = model_config['used_columns']

# scale
X_scaled = model_config['scaler'].transform(df[used_columns])

model = model_config['model']
if model_config['mode'] == 'regression':
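# utils.replace_na_and_create_na_feature is not shown. A plausible sketch,
# assuming it fills missing values and records an indicator column for each
# feature that had gaps (the return order matches the call above):
def replace_na_and_create_na_feature(df, na_features=None):
    if na_features is None:
        # train: remember which columns contain missing values
        na_features = [c for c in df.columns if df[c].isnull().any()]
    for c in na_features:
        # binary flag marking the rows where the value was missing
        df['na_{}'.format(c)] = df[c].isnull().astype(int)
    df.fillna(-1, inplace=True)
    return na_features, df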
def preprocess(df, model_config, y=None, type='train', count_enc=True,
               likelihood_enc=False, scaling=False,
               use_groups=['number', 'count', 'likelihood']):
    """Preprocessing and feature engineering for input data."""
    print('preprocess data..')

    # extract datetime features
    df = transform_datetime_features(df)
    print('datetime features extracted')

    # categorical count encoding
    if count_enc:
        if type == 'train':
            df, categorical_values = count_encoding(df)
            model_config['categorical_values'] = categorical_values
        elif type == 'test':
            df = count_encoding(df, model_config['categorical_values'])
        print('count encoding of categorical features added')

    # mean target encoding
    if likelihood_enc:
        if type == 'train':
            cat_cols = df.columns[df.columns.str.startswith('string')].tolist()
            df, categorical_values, global_avg = likelihood_encoding(df.join(y), cat_cols)
            model_config['mean_target'] = categorical_values
            model_config['global_avg'] = global_avg
            model_config['cat_cols'] = cat_cols
        elif type == 'test':
            df = likelihood_encoding(df, model_config['cat_cols'],
                                     categorical_values=model_config['mean_target'],
                                     global_avg=model_config['global_avg'])
        print('mean target encoding of categorical features added')

    # drop constant features
    if type == 'train':
        df = drop_const_cols(df)

    # filter columns
    if type == 'train':
        df, used_columns = filter_columns(df, groups=use_groups)
        model_config['used_columns'] = used_columns
    elif type == 'test':
        df_pred = df[['line_id']]
        df = df[model_config['used_columns']]

    # scaling
    if scaling:
        if type == 'train':
            df, scaler_mean, scaler_std = std_scaler(df)
            model_config['scaler_mean'] = scaler_mean
            model_config['scaler_std'] = scaler_std
        elif type == 'test':
            df = std_scaler(df, model_config['scaler_mean'], model_config['scaler_std'])

    if type == 'train':
        return df, model_config
    else:
        return df, df_pred
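# likelihood_encoding is referenced above but not defined in these snippets.
# A minimal sketch of mean target encoding, assuming the target column is
# named 'target' after df.join(y) (call signatures match the usage above):
def likelihood_encoding(df, cat_cols, categorical_values=None, global_avg=None):
    if categorical_values is None:
        # train: per-category mean of the target, plus the global mean as a fallback
        global_avg = df['target'].mean()
        categorical_values = {c: df.groupby(c)['target'].mean().to_dict() for c in cat_cols}
        for c in cat_cols:
            df['likelihood_{}'.format(c)] = df[c].map(categorical_values[c])
        return df.drop('target', axis=1), categorical_values, global_avg
    # test: map the learned means; unseen categories fall back to the global average
    for c in cat_cols:
        df['likelihood_{}'.format(c)] = df[c].map(categorical_values[c]).fillna(global_avg)
    return df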
def main(args):
    start_time = time.time()

    df = pd.read_csv(args.train_csv)
    df_y = df.target
    df_X = df.drop('target', axis=1)
    is_big = df_X.memory_usage().sum() > BIG_DATASET_SIZE

    # dict with data necessary to make predictions
    model_config = {}
    model_config['categorical_values'] = {}
    model_config['is_big'] = is_big

    print('Dataset read, shape {}'.format(df_X.shape))

    # drop constant features
    constant_columns = [
        col_name for col_name in df_X.columns
        if df_X[col_name].nunique() == 1
    ]
    df_X.drop(constant_columns, axis=1, inplace=True)

    if is_big:
        reduce_mem_usage(df_X)

        # features from datetime (this variant relies on in-place modification)
        transform_datetime_features(df_X)

        # check which string columns hold fixed-length code strings (every
        # non-null value has the same length), then expand them per character
        string_columns = [
            col_name for col_name in df_X.columns
            if col_name.startswith('string')
        ]
        codestring_columns = [
            col_name for col_name in string_columns
            if df_X[col_name].dropna().str.len().unique().shape[0] == 1
        ]
        model_config['codestring_columns'] = codestring_columns
        for col_name in codestring_columns:
            str_len = df_X[col_name].dropna().str.len().unique()[0]
            for i in range(str_len):
                df_X['string_{}_{}'.format(col_name, i)] = df_X[col_name].str[i]

        # missing values
        if df_X.isnull().values.any():
            model_config['missing'] = True
            df_X.fillna(-1, inplace=True)

        # new_feature_count = min(df_X.shape[1],
        #     int(df_X.shape[1] / (df_X.memory_usage().sum() / BIG_DATASET_SIZE)))
        # # take only highly correlated features
        # correlations = np.abs([np.corrcoef(df_y, df_X[col_name])[0, 1]
        #     for col_name in df_X.columns if col_name.startswith('number')])
        # new_columns = df_X.columns[np.argsort(correlations)[-new_feature_count:]]
        # df_X = df_X[new_columns]
    else:
        # features from datetime
        transform_datetime_features(df_X)

        # categorical encoding
        categorical_values = {}
        for col_name in list(df_X.columns):
            col_unique_values = df_X[col_name].unique()
            if 2 < len(col_unique_values) <= ONEHOT_MAX_UNIQUE_VALUES:
                categorical_values[col_name] = col_unique_values
                for unique_value in col_unique_values:
                    df_X['onehot_{}={}'.format(col_name, unique_value)] = (
                        df_X[col_name] == unique_value).astype(int)
        model_config['categorical_values'] = categorical_values

        # missing values
        if df_X.isnull().values.any():
            model_config['missing'] = True
            df_X.fillna(-1, inplace=True)

    number_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('number')
    ]
    model_config['number_columns'] = number_columns
    print('number_columns: {}'.format(number_columns))

    id_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('id')
    ]
    model_config['id_columns'] = id_columns
    print('id_columns: {}'.format(id_columns))

    datetime_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('datetime')
    ]
    model_config['datetime_columns'] = datetime_columns
    print('datetime_columns: {}'.format(datetime_columns))

    # noise columns: near-unique numeric columns carry little usable signal
    def f_noise_columns(df, val):
        u = df.shape[0]
        return [
            col_name for col_name in df.columns
            if df[col_name].unique().shape[0] / u >= val
        ]

    noise_columns = f_noise_columns(df_X[number_columns], 0.95)
    model_config['noise_columns'] = noise_columns
    print('noise_columns: {}'.format(noise_columns))
    df_X.drop(noise_columns, axis=1, inplace=True)

    if len(id_columns) > 0 and len(datetime_columns) > 0 and args.mode == MODE_REGRESSION:
        # check_3: add a one-step-ahead shift of each numeric column within every id group
        def f_trans(x):
            for cn in number_columns:
                x['{}_s{}'.format(cn, -1)] = x[cn].shift(-1).fillna(0)
            return x

        df_X = df_X[id_columns + ['line_id'] + number_columns].groupby(id_columns).apply(f_trans)
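    # reduce_mem_usage is not shown. A common sketch of the idea, assuming it
    # downcasts numeric columns to the smallest dtype that fits their value
    # range; it mutates the frame (the call above ignores the return value):
    # def reduce_mem_usage(df):
    #     for col in df.columns:
    #         if pd.api.types.is_integer_dtype(df[col]):
    #             df[col] = pd.to_numeric(df[col], downcast='integer')
    #         elif pd.api.types.is_float_dtype(df[col]):
    #             df[col] = pd.to_numeric(df[col], downcast='float')
    #     return df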
    if 3 <= len(datetime_columns) <= 10:
        # check_4: pairwise day differences between datetime columns
        print('Add delta datetime columns')
        # for cn in datetime_columns:
        #     df_X[cn] = pd.to_datetime(df_X[cn])
        import itertools
        for c1, c2 in itertools.combinations(datetime_columns, 2):
            df_X['number_{}_{}'.format(c1, c2)] = (df_X[c1] - df_X[c2]).dt.days

    # use only numeric columns
    used_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('number') or col_name.startswith('onehot')
    ]
    model_config['used_columns'] = used_columns

    X_train = df_X[used_columns].values
    y_train = df_y.values

    # scaling
    # scaler = StandardScaler(copy=False)
    # df_X = scaler.fit_transform(df_X)
    # model_config['scaler'] = scaler

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression' if args.mode == MODE_REGRESSION else 'binary',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'num_leaves': 200,
        'feature_fraction': 0.70,
        'bagging_fraction': 0.70,
        'bagging_freq': 4,
        'max_depth': -1,
        'verbosity': -1,
        'reg_alpha': 0.3,
        'reg_lambda': 0.1,
        # 'min_split_gain': 0.2,
        'min_child_weight': 10,
        'zero_as_missing': True,
        'num_threads': 4,
    }

    # fitting
    model = lgb.train(params, lgb.Dataset(X_train, label=y_train), 600)

    model_config['mode'] = args.mode

    # if args.mode == MODE_REGRESSION:
    #     # model selection (check_2): pick the best of Ridge and LGBMRegressor by CV
    #     scores = list()
    #     for model in [Ridge(), LGBMRegressor(n_estimators=70)]:
    #         model.fit(X_train, y_train)
    #         kfold = KFold(n_splits=3, shuffle=True, random_state=0)
    #         score = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=1,
    #                                 scoring='neg_mean_squared_error', verbose=0)
    #         scores.append((abs(score.mean()), model))
    #     scores.sort(key=lambda k: k[0])
    #     model = scores[0][1]
    #     print(scores)
    # else:
    #     # pick the best of RidgeClassifier and LGBMClassifier by ROC AUC
    #     scores = list()
    #     for model in [RidgeClassifier(), LGBMClassifier(n_estimators=70)]:
    #         model.fit(X_train, y_train)
    #         kfold = KFold(n_splits=3, shuffle=True, random_state=0)
    #         score = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=1,
    #                                 scoring='roc_auc', verbose=0)
    #         scores.append((abs(score.mean()), model))
    #     scores.sort(key=lambda k: k[0], reverse=True)
    #     model = scores[0][1]
    #     print(scores)

    model_config['model'] = model

    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))