def load_data(data_path, processor_paths, fit=True):
    """
    Wrapper around LearningDataAdapter and PreProcess to load data in one line.

    data_path       : string, path to the data; currently only .csv files are
                      supported.
    processor_paths : dict of the form
                      {'imputer': 'imputer_save_path',
                       'scaler' : 'scaler_save_path',
                       'encoder': 'encoder_save_path'}
    fit             : boolean, default True. If True, fit the imputer, scaler,
                      and encoder and save them to processor_paths. Otherwise,
                      the fitted transformers already exist and are only loaded.
    """
    # adapter
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file(data_path)
    X_num, X_cat = adapter.X_num, adapter.X_cat
    w, y = adapter.w, adapter.y

    # preprocessor
    processor = PreProcess()
    if fit:
        processor.fit(X_num, X_cat, processor_paths)
    else:
        processor.load(processor_paths)
    X_trans = processor.transform(X_num, X_cat)

    return X_trans, y, w
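# A minimal usage sketch of load_data (the pickle paths below are assumptions
# modeled on this repo's other scripts): fit and save the preprocessors on the
# training file once, then reload them for the validation file.
if __name__ == '__main__':
    paths = {'imputer': 'models/imputer.pkl',
             'scaler' : 'models/scaler.pkl',
             'encoder': 'models/encoder.pkl'}
    X_train, y_train, w_train = load_data('data/train.csv', paths, fit=True)
    X_valid, y_valid, w_valid = load_data('data/validate.csv', paths, fit=False)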
print print "+ Connecting to {0} to populate column in table {1}".format(args.dbname, args.input_table_name) print print " Loading models." logre_evaluator = LoadModelEvaluator( "models/logre_imputer.pkl", "models/logre_scaler.pkl", "models/logre_encoder.pkl", "models/logre.pkl" ) gbdt300_evaluator = LoadModelEvaluator( "models/gbdt300_imputer.pkl", "models/gbdt300_scaler.pkl", "models/gbdt300_encoder.pkl", "models/gbdt300.pkl" ) adapter = LearningDataAdapter(for_learning=False) print f = open(args.output_fname, "w") f.write("eid,logre_signal_score,gbdt300\n") print " Predicting and updating." print " Started on {0}".format(time.ctime(time.time())) with PsqlReader( database=args.dbname, select_table_name=args.input_table_name, itersize=200000, arraysize=200000, rollback=False, debug=False,
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib

from LearningDataAdapter import LearningDataAdapter
from ModelEvaluator import ModelEvaluator

if __name__ == '__main__':

    # data preprocessing
    print 'Importing training data... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/train.csv')
    X_num, X_cat = adapter.X_num, adapter.X_cat
    w, y = adapter.w, adapter.y

    print 'Fitting preprocessors... '

    # Imputer: fill missing numerical values with the column mean
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_num)
    X_num_trans = imp.transform(X_num)
    joblib.dump(imp, 'models/gbdt300_imputer.pkl')

    # Scaler: standardize numerical features to zero mean and unit variance
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    scaler.fit(X_num_trans)
    X_num_trans = scaler.transform(X_num_trans)
    joblib.dump(scaler, 'models/gbdt300_scaler.pkl')
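    # A minimal sketch of the remaining training steps, assuming the encoder
    # and classifier follow the same fit/dump pattern as the imputer and
    # scaler above (the gbdt300 pickles are loaded by this repo's other
    # scripts). The encoder class and settings, and the assumption that X_cat
    # is already integer-coded, are guesses; n_estimators=300 is inferred
    # from the 'gbdt300' name.
    enc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_cat_trans = enc.fit_transform(X_cat)
    joblib.dump(enc, 'models/gbdt300_encoder.pkl')

    X_trans = np.hstack([X_num_trans, X_cat_trans])
    gbdt = GradientBoostingClassifier(n_estimators=300)
    gbdt.fit(X_trans, y, sample_weight=w)
    joblib.dump(gbdt, 'models/gbdt300.pkl')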
import glob

import matplotlib.pyplot as plt
from sklearn.externals import joblib
from sklearn.metrics import roc_curve

from LearningDataAdapter import LearningDataAdapter
from ModelEvaluator import ModelEvaluator
from LearningScore import learning_curve

if __name__ == '__main__':

    plt.switch_backend('agg')

    # obtain validation data
    print 'Importing test sample... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/validate.csv')
    X_num, X_cat = adapter.X_num, adapter.X_cat

    imp = joblib.load("./models/imputer.pkl")
    scaler = joblib.load("./models/scaler.pkl")
    enc = joblib.load("./models/encoder.pkl")
    evaluator = ModelEvaluator(imputer=imp, scaler=scaler, encoder=enc)
    pred_x = evaluator.preprocess(X_num, X_cat)
    pred_w, pred_y = adapter.w, adapter.y
    print
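    # A minimal sketch of the ROC evaluation this script builds toward,
    # assuming the model pickle name follows this repo's other scripts and
    # that pred_x is the fully preprocessed feature matrix; the output
    # filename is an assumption.
    rf = joblib.load("./models/rf.pkl")
    score = rf.predict_proba(pred_x)[:, 1]
    fpr, tpr, _ = roc_curve(pred_y, score, sample_weight=pred_w)
    plt.plot(fpr, tpr)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.savefig('roc_curve.png')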
print
print ' Adding new column to table.'
add_table_column(args.dbname, args.table_name)
print

print ' Loading models.'
imp = joblib.load('models/imputer.pkl')
scaler = joblib.load('models/scaler.pkl')
enc = joblib.load('models/encoder.pkl')
rf = joblib.load('models/rf.pkl')
evaluator = ModelEvaluator(imputer=imp, scaler=scaler, encoder=enc, model=rf)
adapter = LearningDataAdapter(for_learning=False)
print

print ' Predicting and updating.'
print ' Started on {0}'.format(time.ctime(time.time()))
with SqlDataLoader(database=args.dbname,
                   table_name=args.table_name,
                   itersize=200000,
                   arraysize=200000,
                   rollback=False,
                   debug=False) as sql_loader:
    sql_loader.start()
    while sql_loader.curr_records:
        sys.stdout.write('.')
        sys.stdout.flush()
        adapter.adapt_records(sql_loader.curr_records)
    print ' Number of events with successful choices: {0}'.format(n_success)
    print ' Ratio: {0}'.format(n_success / float(n_hot))
    print


if __name__ == '__main__':

    print '+ Loading pickled objects... '
    imp = joblib.load('models/imputer.pkl')
    scaler = joblib.load('models/scaler.pkl')
    enc = joblib.load('models/encoder.pkl')
    rf = joblib.load('models/rf.pkl')
    print

    print '+ Importing test sample... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/validate.csv')
    print

    print '+ Predicting test sample candidate scores... '
    evaluator = ModelEvaluator(imputer=imp, scaler=scaler, encoder=enc,
                               model=rf)
    score = evaluator.predict_proba(adapter.X_num, adapter.X_cat)[:, 1]
    print

    print '+ Assessing model results... '
    print

    print ' Selecting best candidate... '
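    # A minimal sketch of the best-candidate selection, assuming each event
    # contributes several candidate rows; the per-row event-id attribute
    # (adapter.eids here) is hypothetical, and n_hot / n_success follow the
    # reporting convention printed above.
    best = {}
    for eid, s, truth in zip(adapter.eids, score, adapter.y):
        if eid not in best or s > best[eid][0]:
            best[eid] = (s, truth)
    n_hot = len(best)
    n_success = sum(1 for s, t in best.itervalues() if t == 1)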