from sklearn.ensemble import RandomForestClassifier


def run_class(depth, num_est, train_transformed, labels, test_transformed,
              evaluate=True, file_name=None, jobs=1):
    # train_transformed, labels = transform_set('train.csv')
    categories = sorted(set(labels))
    mapping = {value: index for index, value in enumerate(categories)}
    label_transformed = [mapping[cat] for cat in labels]
    # test_transformed, _ = transform_set("test.csv", train=False)
    clf = RandomForestClassifier(max_depth=depth, verbose=2, n_jobs=jobs,
                                 n_estimators=num_est)
    # mkdirs(data_path('serialized/model1/'))
    # joblib.dump(clf, data_path('serialized/model1/model1.pkl'))
    print("=======================================")
    if not evaluate:
        clf.fit(train_transformed, label_transformed)
        del train_transformed
        del label_transformed
        test_prediction = clf.predict_proba(test_transformed)
        del test_transformed
        create_submission(test_prediction, file_name)
    if evaluate:
        cross = cross_validation(clf, train_transformed, label_transformed)
        print(cross, cross.mean())
        return cross, cross.mean(), ', '.join(
            sorted({feature.split('_')[0]
                    for feature in train_transformed.columns}))
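# The `cross_validation` helper called above is not defined in this snippet.
# A minimal sketch, assuming it is a thin wrapper around sklearn's
# cross_val_score; the 3-fold default and the neg_log_loss scoring (the
# Kaggle metric) are assumptions, not confirmed by this repository.
from sklearn.model_selection import cross_val_score


def cross_validation(clf, features, labels, folds=3):
    # cross_val_score returns the negated log loss; flip the sign so that
    # lower values are better, matching how run_class prints the result.
    return -cross_val_score(clf, features, labels,
                            scoring='neg_log_loss', cv=folds)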
# NOTE: this snippet begins mid-function; the signature of transform_set and
# the loading of `train_frame` are reconstructed from the call sites below.
# `transformer` is assumed to be built elsewhere in this module.
def transform_set(file_name, train=True):
    train_frame = pd.read_csv(data_path(file_name))
    result = transformer.transform_frame(train_frame)
    not_regex = "^Dates|^PdDistrict|^DayOfWeek|^Resolution|^X|^Y"
    train_transformed = result.filter(regex=not_regex)
    label_transformed = None
    if train:
        label_transformed = result.filter(regex="^Category")
    return train_transformed, label_transformed


train_transformed, label_transformed = transform_set('train.csv')
print(train_transformed.columns)
print(label_transformed.columns)

clf = OneVsRestClassifier(LogisticRegression(random_state=0))
train_prediction = clf.fit(train_transformed,
                           label_transformed).predict(train_transformed)
# mkdirs(data_path('serialized/model1/'))
# joblib.dump(clf, data_path('serialized/model1/model1.pkl'))
# print(cross_validation(clf, train_transformed, label_transformed))

test_transformed, _ = transform_set("test.csv", train=False)
print("=======================================")
test_prediction = clf.predict_proba(test_transformed)
create_submission(test_prediction, "submission1.csv")
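# `create_submission` is also external to this snippet. A minimal sketch,
# assuming `prediction` is an (n_samples, n_classes) probability array and
# the header follows the Kaggle San Francisco Crime format (an Id column
# followed by one column per category); the class_names parameter is an
# assumption introduced here for illustration.
import pandas as pd


def create_submission(prediction, file_name, class_names=None):
    sub = pd.DataFrame(prediction, columns=class_names)
    sub.insert(0, 'Id', range(len(sub)))
    sub.to_csv(file_name, index=False)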
# clf = OneVsRestClassifier(LogisticRegression(random_state=0))
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=10)
clf.fit(train_transformed, label_transformed)
# train_prediction = clf.predict(train_transformed)
# mkdirs(data_path('serialized/model1/'))
# joblib.dump(clf, data_path('serialized/model1/model1.pkl'))
# print(cross_validation(clf, train_transformed, label_transformed))

test_transformed, _ = transform_set("test.csv", train=False)
print("=======================================")
test_prediction = clf.predict_proba(test_transformed)
print(test_prediction[0])
print(sum([t[0][0] for t in test_prediction]))
print(sum([t[0][1] for t in test_prediction]))
reshaped = [[t[i][1] for t in test_prediction]
            for i in range(len(test_prediction[0]))]
print(reshaped)
print(len(test_prediction), test_prediction[0].shape)
create_submission(reshaped, "submission12.csv")
# score on kaggle = 2.60553
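# The `reshaped` comprehension above transposes predict_proba's output for a
# binarized multilabel target (a list of n_classes arrays, each of shape
# (n_samples, 2)) into an (n_samples, n_classes) matrix of P(label == 1).
# A vectorized equivalent, assuming that same input shape:
import numpy as np

reshaped = np.stack([t[:, 1] for t in test_prediction], axis=1)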
def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Get config
    config = json.load(open(args.config))
    config.update({'args': {'config': args.config, 'debug': args.debug}})
    if config["model"]["name"] == "lightgbm":
        config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (pathlib.Path(config['dataset']['output_directory'])
                        / model_no)
    if not model_output_dir.exists():
        model_output_dir.mkdir()
    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({'model_output_dir': str(model_output_dir)})

    # =========================================
    # === Loading data
    # =========================================
    logger.info('Loading data')

    # Get train and test
    input_dir = pathlib.Path(config['dataset']['input_directory'])
    train = pd.read_csv(input_dir / 'train.csv')
    test = pd.read_csv(input_dir / 'test.csv')

    # Get target values
    target_column = config['data_type']['target']
    y_train = train[target_column].values

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')

    # Get features
    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # =========================================
    # === Adversarial Validation
    # =========================================
    logger.info("adversarial validation")
    train_adv = x_train
    test_adv = x_test
    train_adv['target'] = 0
    test_adv['target'] = 1
    train_test_adv = pd.concat([train_adv, test_adv], axis=0,
                               sort=False).reset_index(drop=True)
    target = train_test_adv['target'].values

    train_set, val_set = train_test_split(train_test_adv, test_size=0.33,
                                          random_state=71, shuffle=True)
    x_train_adv = train_set[feature_name]
    y_train_adv = train_set['target']
    x_val_adv = val_set[feature_name]
    y_val_adv = val_set['target']
    logger.debug(f'the number of train set: {len(x_train_adv)}')
    logger.debug(f'the number of valid set: {len(x_val_adv)}')

    train_lgb = lgb.Dataset(x_train_adv, label=y_train_adv)
    val_lgb = lgb.Dataset(x_val_adv, label=y_val_adv)
    lgb_model_params = config["adversarial_validation"]["lgb_model_params"]
    lgb_train_params = config["adversarial_validation"]["lgb_train_params"]
    clf = lgb.train(lgb_model_params, train_lgb,
                    valid_sets=[train_lgb, val_lgb],
                    valid_names=['train', 'valid'],
                    **lgb_train_params)

    feature_imp = pd.DataFrame(
        sorted(zip(clf.feature_importance(importance_type='gain'),
                   feature_name)),
        columns=['value', 'feature'])
    plt.figure(figsize=(20, 10))
    sns.barplot(x='value', y='feature',
                data=feature_imp.sort_values(by='value',
                                             ascending=False).head(20))
    plt.title('LightGBM Features')
    plt.tight_layout()
    plt.savefig(model_output_dir / "feature_importance_adv.png")

    config.update({
        'adversarial_validation_result': {
            'score': clf.best_score,
            'feature_importances': feature_imp.set_index("feature").sort_values(
                by="value", ascending=False).head(20).to_dict()["value"]
        }
    })

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Get features (reloaded: the adversarial-validation step above added a
    # 'target' column to x_train/x_test in place)
    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # Get folds
    folds_ids = Fold(
        n_splits=config['cv']['n_splits'],
        shuffle=config['cv']['shuffle'],
        random_state=config['cv']['random_state']).get_stratifiedkfold(
            x_train, y_train)

    # Train and predict
    model_name = config['model']['name']
    model_cls = model_map[model_name]
    params = config['model']
    runner = Runner(model_cls, params, model_output_dir,
                    f'Train_{model_cls.__name__}')
    oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids)
    config.update(evals_result)
    test_preds = runner.predict_cv(x_test)

    # =========================================
    # === Make submission file
    # =========================================
    sub = create_submission(test, test_preds, target_column)
    sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True)

    # =========================================
    # === Save files
    # =========================================
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)
    pd.DataFrame(oof_preds, columns=["target"]).to_csv(
        model_output_dir / 'oof.csv', index=False, header=True)
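# `Fold` is imported from elsewhere in the repository. A minimal sketch of the
# interface used above, assuming get_stratifiedkfold returns one
# (train_idx, valid_idx) pair per split; the constructor arguments mirror the
# call site, everything else is an assumption.
from sklearn.model_selection import StratifiedKFold


class Fold:
    def __init__(self, n_splits, shuffle, random_state):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_stratifiedkfold(self, x, y):
        # StratifiedKFold preserves the class balance in every fold.
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle,
                              random_state=self.random_state)
        return list(skf.split(x, y))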
classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
           'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
           'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
           'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
           'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
           'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
           'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
           'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
           'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
           'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
           'WARRANTS', 'WEAPON LAWS']

category = train_frame['Category']
mapping = {clazz: num for (num, clazz) in enumerate(classes)}
most_freq_class = Counter(category).most_common()[0][0]

# Baseline: always predict the most frequent class.
predicted = category.apply(lambda cat: mapping[most_freq_class])
expected = category.apply(lambda cat: mapping[cat])

mlb = MultiLabelBinarizer()
expected_b = mlb.fit_transform(to_singleton(expected))
predicted_b = mlb.transform(to_singleton(predicted))

for (clazz, count) in Counter(category).most_common():
    print("{}\t{}".format(clazz, count))

# todo: use validation.py
print("Accuracy on training: {}".format(accuracy_score(expected_b,
                                                       predicted_b)))
print("Log loss on training: {}".format(log_loss(expected_b, predicted_b)))

test_prediction = np.full((submission_size, len(predicted_b[0])),
                          predicted_b[0])
create_submission(test_prediction, 'baseline_sub.csv')
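# `to_singleton` is not defined in this snippet. MultiLabelBinarizer expects
# an iterable of label collections, so a plausible one-line definition is the
# following sketch (an assumption, not the repository's own code):
def to_singleton(labels):
    # Wrap each scalar label in a one-element list: [1, 3] -> [[1], [3]].
    return [[label] for label in labels]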
def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_1dcnn_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Get config
    config = json.load(open(args.config))
    config.update({'args': {'config': args.config, 'debug': args.debug}})
    if config["model"]["name"] == "lightgbm":
        config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (pathlib.Path(config['dataset']['output_directory'])
                        / model_no)
    if not model_output_dir.exists():
        model_output_dir.mkdir()
    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({'model_output_dir': str(model_output_dir)})

    # =========================================
    # === Loading data
    # =========================================
    logger.info('Loading data')

    # Get train and test
    input_dir = pathlib.Path(config['dataset']['input_directory'])
    train = pd.read_csv(input_dir / 'train.csv')
    test = pd.read_csv(input_dir / 'test.csv')
    spectrum = pd.read_csv(input_dir / 'spectrum_stack.csv')
    spectrum_fitting = pd.read_csv(input_dir / 'spectrum_fitting_stack.csv')

    wv_cols = [f"wavelength_{i}" for i in range(512)]
    wv_fit_cols = [f"fitting_wavelength_{i}" for i in range(512)]
    train_spectrum = pd.merge(train, spectrum,
                              on="spectrum_filename", how="left")
    test_spectrum = pd.merge(test, spectrum,
                             on="spectrum_filename", how="left")
    train_spectrum = pd.merge(train_spectrum, spectrum_fitting,
                              on="spectrum_filename", how="left")
    test_spectrum = pd.merge(test_spectrum, spectrum_fitting,
                             on="spectrum_filename", how="left")

    # Scale each spectrum by its own standard deviation
    train_std = np.std(train_spectrum[wv_cols].values, axis=1, keepdims=True)
    test_std = np.std(test_spectrum[wv_cols].values, axis=1, keepdims=True)
    train_spectrum[wv_cols] = train_spectrum[wv_cols].values / train_std
    test_spectrum[wv_cols] = test_spectrum[wv_cols].values / test_std

    spectrum_cols = wv_cols + wv_fit_cols
    train_spectrum = train_spectrum[spectrum_cols]
    test_spectrum = test_spectrum[spectrum_cols]

    # Get target values
    target_column = config['data_type']['target']
    y_train = train[target_column].values

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')

    # Get features
    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # =========================================
    # === features preprocess
    # =========================================
    x_total = x_train.append(x_test).reset_index(drop=True)
    remove_features = [c for c in x_total.columns if c.find("layout_x") != -1]
    remove_features += [c for c in x_total.columns if c.find("layout_y") != -1]
    x_total.drop(columns=remove_features, inplace=True)
    x_total = pd.get_dummies(
        x_total, columns=["LabelEncoding_exc_wl", "LabelEncoding_layout_a"])
    x_total.fillna(0, inplace=True)

    from sklearn.preprocessing import StandardScaler
    numeric_features = [
        c for c in x_total.columns if c.find("LabelEncoding_") == -1
    ]
    sc = StandardScaler()
    x_total[numeric_features] = sc.fit_transform(x_total[numeric_features])
    x_train = x_total.iloc[:len(train)]
    x_test = x_total.iloc[len(train):].reset_index(drop=True)

    x_train = pd.concat([x_train, train_spectrum], axis=1)
    x_test = pd.concat([x_test, test_spectrum], axis=1)
    logger.debug(f'number of features with spec in train: {x_train.shape}')
    logger.debug(f'number of features with spec in test: {x_test.shape}')

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Get folds
    folds_ids = Fold(
        n_splits=config['cv']['n_splits'],
        shuffle=config['cv']['shuffle'],
        random_state=config['cv']['random_state']).get_stratifiedkfold(
            x_train, y_train)

    # Train and predict
    model_name = config['model']['name']
    model_cls = model_map[model_name]
    params = config['model']
    runner = Runner(model_cls, params, model_output_dir,
                    f'Train_{model_cls.__name__}')
    oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids)
    config.update(evals_result)
    test_preds = runner.predict_cv(x_test)

    # =========================================
    # === Make submission file
    # =========================================
    sub = create_submission(test, test_preds, target_column)
    sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True)

    # =========================================
    # === Save files
    # =========================================
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)
    pd.DataFrame(oof_preds, columns=["target"]).to_csv(
        model_output_dir / 'oof.csv', index=False, header=True)
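# `json_dump` is defined elsewhere in the repository. A minimal sketch,
# assuming its only job is to serialize the config dict to disk; the
# `default=str` fallback is an assumption to cope with numpy scalars (e.g.
# LightGBM scores) that the json module cannot encode natively.
import json


def json_dump(obj, path):
    with open(path, 'w') as f:
        json.dump(obj, f, indent=4, default=str)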