def main():
    """
    Read delay data from BigQuery, classify it, optionally balance the
    classes and write the result to the destination BigQuery table.

    The time range (2010-01-01 - 2019-01-01) is fixed. Source and destination
    tables, the train types to keep and the ``no_balance`` switch are read
    from the module-level ``options`` object.

    Balancing down-samples every class present in the data to the size of
    the smallest class. Classes are discovered from the data, so a missing
    class no longer makes ``sample`` fail and extra classes are not dropped.
    """
    bq = _bq.BQHandler()
    io = _io.IO()

    starttime = dt.datetime.strptime('2010-01-01', "%Y-%m-%d")
    endtime = dt.datetime.strptime('2019-01-01', "%Y-%m-%d")

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  project=options.project,
                  dataset=options.src_dataset,
                  table=options.src_table)
    data = bq.get_rows()
    logging.info('Data loaded.')

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=False,
                                train_type_column='train_type')

    data = io.calc_delay_avg(data)
    data = io.classify(data)
    log_class_dist(data.loc[:, 'class'])

    logging.info('Balancing disabled: {}'.format(options.no_balance))

    balanced_data = data
    # An empty frame has no classes to balance; sampling would only fail.
    if not options.no_balance and len(data) > 0:
        class_groups = data.groupby('class')
        count = class_groups.size().min()
        # groupby iterates classes in sorted order, so the row order matches
        # the previous hard-coded 0, 1, 2, 3 concatenation.
        balanced_data = pd.concat(
            [group.sample(n=count) for _, group in class_groups])

    logging.info('First rows of the result:\n{}'.format(balanced_data.head(5)))
    logging.info('Class sizes of the result:\n{}'.format(
        balanced_data.groupby('class').size()))

    balanced_data.set_index(['time', 'trainstation'], inplace=True)

    logging.info('Saving data...')
    bq.dataset_to_table(balanced_data, options.dst_dataset, options.dst_table)
def main():
    """
    Copy three fixed one-month time ranges from the source BigQuery table
    to the destination table.

    Each range is read with all columns (``parameters=['*']``), indexed by
    ``time`` and ``trainstation`` and written with ``dataset_to_table``.
    Project, datasets and tables come from the module-level ``options``.
    """
    date_format = "%Y-%m-%d"
    stamp_format = '%Y-%m-%d %H:%M'
    month_ranges = (
        ('2014-01-01', '2014-02-01'),
        ('2016-06-01', '2016-07-01'),
        ('2017-02-01', '2017-03-01'),
    )

    bq = _bq.BQHandler()
    io = _io.IO()

    times = [
        {
            'starttime': dt.datetime.strptime(first_day, date_format),
            'endtime': dt.datetime.strptime(last_day, date_format)
        }
        for first_day, last_day in month_ranges
    ]
    logging.info('Using times: {}'.format(times))

    for period in times:
        start, end = period['starttime'], period['endtime']

        range_message = 'Processing time range {} - {}'.format(
            start.strftime(stamp_format), end.strftime(stamp_format))
        logging.info(range_message)

        logging.info('Reading data...')
        data = bq.get_rows(start,
                           end,
                           parameters=['*'],
                           project=options.project,
                           dataset=options.src_dataset,
                           table=options.src_table)

        data.set_index(['time', 'trainstation'], inplace=True)
        bq.dataset_to_table(data, options.dst_dataset, options.dst_table)
def main():
    """
    Train an autoencoder on non-delayed observations and plot its
    reconstruction errors.

    Steps, all configured through the module-level ``options`` object:

    1. Read the feature table from BigQuery and filter/aggregate train types.
    2. Optionally average the delay, add the month as a feature, normalise
       features and labels (scalers are saved under ``options.save_path``)
       and run the PCA step.
    3. Rows with ``class >= options.class_limit`` are held out as the test
       set; the rest is split into train and validation sets.
    4. Fit the autoencoder to reproduce its input, save the model and its
       history, and plot reconstruction errors per class and overall.
    """
    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {}.{} and time range {} - {}'.format(
        options.feature_dataset, options.feature_table,
        starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d')))

    all_param_names = list(
        set(options.label_params + options.feature_params +
            options.meta_params))
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Reading data...')
    bq.set_params(batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  only_winters=options.only_winters)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=[],
                                aggs=aggs)

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    if options.y_avg:
        data = io.calc_delay_avg(data)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data['time'].map(lambda x: x.month)
        options.feature_params.append('month')

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()
        yscaler = StandardScaler()

        labels = data.loc[:, options.label_params].astype(
            np.float32).values.reshape((-1, 1))

        # The scaled frames must carry the index of `data`: after
        # sort_values the index is no longer 0..n-1, and concat(axis=1)
        # aligns on index labels, which would pair rows incorrectly.
        scaled_labels = pd.DataFrame(yscaler.fit_transform(labels),
                                     columns=['delay'],
                                     index=data.index)

        non_scaled_data = data.loc[:, options.meta_params + ['class']]
        scaled_features = pd.DataFrame(
            xscaler.fit_transform(data.loc[:, options.feature_params]),
            columns=options.feature_params,
            index=data.index)

        data = pd.concat([non_scaled_data, scaled_features, scaled_labels],
                         axis=1)

        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(xscaler, fname, fname)
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, fname, fname)

    if options.pca:
        logging.info('Doing PCA analyzis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)

        # NOTE(review): the PCA-transformed features were computed but never
        # used; the data set keeps the original feature columns and PCA only
        # feeds the explained-variance plot. Kept as is because the code
        # below selects columns by options.feature_params. This branch also
        # drops 'class' unless it is in meta/label params - confirm intent.
        data = pd.concat([non_processed_data, processed_data], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # Divide data to normal and delayed cases
    data_test = data[(data.loc[:, 'class'] >= options.class_limit)]
    data = data[(data.loc[:, 'class'] < options.class_limit)]

    data_train, data_val = train_test_split(data, test_size=0.33)
    data_train_x = data_train.loc[:, options.feature_params].values
    data_train_y = data_train.loc[:, options.label_params].values
    data_val_x = data_val.loc[:, options.feature_params].values
    data_val_y = data_val.loc[:, options.label_params].values

    # Initialization
    logging.info('Building model...')
    model = convlstm.Autoencoder(data_train_x.shape[1]).get_model()

    boardcb = TensorBoard(log_dir=options.log_dir,
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

    logging.info('Data shape: {}'.format(data_train_x.shape))

    # The autoencoder is trained to reproduce its input, so x is also the
    # target.
    history = model.fit(data_train_x,
                        data_train_x,
                        validation_data=(data_val_x, data_val_x),
                        epochs=3,
                        callbacks=[boardcb])

    history_fname = options.save_path + '/history.pkl'
    io.save_keras_model(options.save_file, history_fname, model,
                        history.history)

    # Reconstruction errors
    logging.info('Plotting reconstruction errors...')

    errors = {}
    logging.info('Train:')
    errors = get_reconst_error(model, data_train_x, data_train_y.ravel(),
                               errors, 'Train')

    logging.info('Validation:')
    errors = get_reconst_error(model, data_val_x, data_val_y.ravel(), errors,
                               'Validation')

    logging.info('Test:')
    data_test_x = data_test.loc[:, options.feature_params].values
    data_test_y = data_test.loc[:, options.label_params].values
    errors = get_reconst_error(model, data_test_x, data_test_y.ravel(),
                               errors, 'Test')

    for i in range(4):
        fname = options.output_path + '/reconstruction_error_{}.png'.format(i)
        viz.reconstruction_error(errors, desired_class=i, filename=fname)

    fname = options.output_path + '/reconstruction_error_all.png'
    viz.reconstruction_error(errors, filename=fname)
def main():
    """
    Train a binary delay classifier and a delay regressor, report their
    performance and optionally evaluate on fixed test periods.

    Everything is configured through the module-level ``options`` object:

    * MODELS - the classifier is loaded from ``options.classifier_file`` when
      that attribute exists, otherwise built according to
      ``options.classifier`` (svc, graphsvc, gaussiannb or lstm). The only
      supported regressor is ``rfr`` (random forest).
    * DATA - rows are read from BigQuery, train types are filtered and
      aggregated, a binary class is derived from ``options.delay_limit`` and
      the data is split 70/30 in time order. Balancing, month feature, SMOTE,
      saving the splits to BigQuery and normalisation are optional.
    * TRAIN - the classifier is fitted on all training rows, the regressor
      only on rows the classifier predicts as positive.
    * RESULTS / EVALUATE - metrics and plots are written under
      ``options.output_path``.

    NOTE(review): the CSV dump below is followed by an unconditional
    ``sys.exit()``, so as written everything from TRAIN onwards never runs.
    It looks like a debugging leftover, but it is kept because the intent
    cannot be confirmed from this file alone.

    Raises:
        ValueError: unknown classifier or regressor, or ``options.cv`` is set
            (no parameter grid is defined for this model).
    """
    # Helpers
    bq = bqhandler.BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)
    predictor = Predictor(io, ModelLoader(io), options)

    ### OPTIONS ###############################################################

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset,
        starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    ### MODELS ################################################################

    # Initialise classifier
    if hasattr(options, 'classifier_file'):
        classifier = io.load_scikit_model(options.classifier_file)
    else:
        if options.classifier == 'svc':
            params = {'kernel': options.kernel,
                      'gamma': options.gamma,
                      'C': options.penalty,
                      'probability': options.probability}
            classifier = SVCClassifier(params, limit=options.class_limit)
        elif options.classifier == 'graphsvc':
            classifier = GraphSVCClassifier()
            graph_data = pd.read_csv(
                options.graph_data,
                names=['date', 'start_hour', 'src', 'dst', 'type',
                       'sum_delay', 'sum_ahead', 'add_delay', 'add_ahead',
                       'train_count'])
            classifier.fetch_connections(graph_data)
        elif options.classifier == 'gaussiannb':
            classifier = GaussianNBClassifier()
        elif options.classifier == 'lstm':
            num_of_features = len(options.feature_params)
            if options.month:
                # The month column is appended to the features later on.
                num_of_features += 1
            class_weight = None
            if hasattr(options, 'class_weight'):
                # NOTE(review): eval() executes arbitrary code from the
                # configuration. Acceptable only while the config is trusted;
                # consider ast.literal_eval.
                class_weight = eval(options.class_weight)
            params = {'length': options.time_steps,
                      'batch_size': options.batch_size,
                      'epochs': options.epochs,
                      'num_of_features': num_of_features,
                      'log_dir': options.log_dir,
                      'class_weight': class_weight}
            classifier = LSTMClassifier(**params)
        else:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError('Model not specificied or wrong. Add "classifier: bgm" to config file.')

    # Initialise regression model
    if options.regression == 'rfr':
        regressor = RandomForestRegressor(
            n_estimators=options.n_estimators,
            n_jobs=-1,
            min_samples_leaf=options.min_samples_leaf,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            bootstrap=options.bootstrap)
    else:
        raise ValueError('Model not specificied or wrong. Add "classifier: bgm" to config file.')

    ### DATA ##################################################################

    sum_columns = ['delay']
    if 'train_count' in options.meta_params:
        sum_columns.append('train_count')

    # Pick only selected month
    where = {}
    if options.pick_month is not None:
        where = {'EXTRACT(MONTH from time)': options.pick_month}

    logging.info('Reading data...')
    bq.set_params(loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  locations=options.locations,
                  only_winters=options.only_winters,
                  reason_code_table=options.reason_code_table,
                  where=where)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    # -99 marks a missing delay
    data['delay'] = data.loc[:, 'delay'].replace(-99, np.nan)
    data.sort_values(by=['trainstation', 'time'], inplace=True)

    logging.info('Processing {} rows...'.format(len(data)))

    # Filter only timesteps with large distribution in the whole network
    if options.filter_delay_limit is not None:
        data = io.filter_delay_with_limit(data, options.filter_delay_limit)

    # Binary class
    logging.info('Adding binary class to the dataset with limit {}...'.format(options.delay_limit))

    def set_class(x):
        """Return the binary label of delay x, or NaN if x is missing."""
        if x > options.delay_limit:
            return binary_labels[1]
        # `<=` so that a delay exactly at the limit gets the negative label,
        # as in the evaluation branch below; previously it became NaN.
        # NaN fails both comparisons and stays NaN.
        elif x <= options.delay_limit:
            return binary_labels[0]
        return np.nan

    data['class'] = data['delay'].map(set_class)

    # Separate train and validation sets
    data_train, data_test = train_test_split(data, test_size=0.3,
                                             shuffle=False)

    # Balance
    if options.balance:
        logging.info('Balancing training data...')
        count = data_train.groupby('class').size().min()
        # SVC can't handle more than 50 000 samples
        if options.classifier == 'svc':
            count = min(count, 50000)
        data_train = pd.concat(
            [data_train.loc[data_train['class'] == 0].sample(n=count),
             data_train.loc[data_train['class'] == 1].sample(n=count)])

    logging.info('Train data:')
    io.log_class_dist(data_train.loc[:, 'class'].values, labels=binary_labels)
    logging.info('Test data:')
    io.log_class_dist(data_test.loc[:, 'class'].values, labels=binary_labels)

    # Adding month
    if options.month:
        logging.info('Adding month to the datasets...')
        data_train['month'] = data_train.loc[:, 'time'].map(lambda x: x.month)
        data_test['month'] = data_test.loc[:, 'time'].map(lambda x: x.month)
        options.feature_params.append('month')

    y_train_class = data_train.loc[:, ['class']].values.ravel()
    y_train_delay = data_train.loc[:, ['delay']].values.ravel()
    y_test_class = data_test.loc[:, ['class']].values.ravel()
    y_test_delay = data_test.loc[:, ['delay']].values.ravel()

    X_train = data_train.loc[:, options.feature_params].astype(np.float32).values
    X_test = data_test.loc[:, options.feature_params].astype(np.float32).values

    if options.smote:
        logging.info('Smoting...')
        sm = SMOTE()
        # NOTE(review): the resampled set is only logged; training below
        # still uses the original X_train / y_train_class.
        X_train_class, y_class = sm.fit_resample(X_train, y_train_class)
        io.log_class_dist(y_class, labels=binary_labels)

    # If asked, save used train and test splits into big query
    if options.save_data:
        # Arrays are passed in the same order as the column groups; earlier
        # the class and delay arrays were swapped relative to the names.
        columns = [options.feature_params, ['delay'], ['class']]
        tname = options.model+'_'+options.feature_dataset+'_'+options.config_name+'_train'
        bq.nparray_to_table([X_train, y_train_delay, y_train_class],
                            columns,
                            options.project,
                            options.feature_dataset,
                            tname)

        tname = options.model+'_'+options.feature_dataset+'_'+options.config_name+'_test'
        bq.nparray_to_table([X_test, y_test_delay, y_test_class],
                            columns,
                            options.project,
                            options.feature_dataset,
                            tname)

    if options.normalize:
        logging.info('Normalizing data...')
        if hasattr(options, 'xscaler_file'):
            xscaler = io.load_scikit_model(options.xscaler_file)
            X_train = xscaler.transform(X_train)
            X_test = xscaler.transform(X_test)
        else:
            xscaler = MinMaxScaler(feature_range=(-1, 1))
            X_train = xscaler.fit_transform(X_train)
            X_test = xscaler.transform(X_test)
            fname = options.save_path+'/xscaler.pkl'
            io.save_scikit_model(xscaler, fname, fname)

        if hasattr(options, 'yscaler_file'):
            yscaler = io.load_scikit_model(options.yscaler_file)
            # Scalers need 2-D input; reshape as in the fitting branch.
            y_train_delay = yscaler.transform(y_train_delay.reshape(-1, 1)).ravel()
            y_test_delay = yscaler.transform(y_test_delay.reshape(-1, 1)).ravel()
        else:
            yscaler = StandardScaler()
            y_train_delay = yscaler.fit_transform(y_train_delay.reshape(-1, 1)).ravel()
            y_test_delay = yscaler.transform(y_test_delay.reshape(-1, 1)).ravel()
            fname = options.save_path+'/yscaler.pkl'
            io.save_scikit_model(yscaler, fname, fname)

    # Dump the (non-normalised) splits as csv. `int` replaces the np.int
    # alias, which no longer exists in current NumPy.
    data_train.loc[:, options.feature_params].to_csv('data/x_train.csv', index=False)
    data_test.loc[:, options.feature_params].to_csv('data/x_test.csv', index=False)
    data_train.loc[:, ['class']].fillna(-99).astype(int).to_csv('data/y_train.csv', index=False)
    data_test.loc[:, ['class']].fillna(-99).astype(int).to_csv('data/y_test.csv', index=False)
    # NOTE(review): this exit makes everything below unreachable - confirm
    # whether it is a debugging leftover before removing it.
    sys.exit()

    ### TRAIN #################################################################

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        # No parameter grid exists for this model. The random-search code
        # that used to follow was unreachable and used undefined names.
        raise ValueError("No param_grid set for given model ({})".format(options.regression))

    logging.info('Training classifier...')
    if options.classifier == 'graphsvc':
        classifier.fit(X_train,
                       y_train_class,
                       stations=data_train.loc[:, 'trainstation'].values)
    else:
        history = classifier.fit(X_train, y_train_class, X_test, y_test_class)

    # Save classifier
    if options.classifier == 'lstm':
        history_fname = options.save_path+'/history.pkl'
        fname = options.save_path+'/classifier.h5'
        io.save_keras_model(fname, history_fname, classifier, history.history)
    else:
        fname = options.save_path+'/classifier.pkl'
        io.save_scikit_model(classifier, filename=fname, ext_filename=fname)

    # Drop data with no delay information
    has_train_delay = ~np.isnan(y_train_delay)
    X_train = X_train[has_train_delay]
    y_train_delay = y_train_delay[has_train_delay]
    y_train_class = y_train_class[~np.isnan(y_train_class)]

    y_pred_train_bin = classifier.predict(X_train, type='bool')

    # Pick only severe values. The prediction may be shorter than the input
    # (see the LSTM note below), so align on the tail.
    offset = len(y_train_delay) - len(y_pred_train_bin)
    y_train_delay_ = y_train_delay[offset:]
    X_train_ = X_train[offset:]

    y_train_severe = y_train_delay_[y_pred_train_bin]
    X_train_severe = X_train_[y_pred_train_bin]

    logging.info('Training regressor...')
    regressor.fit(X_train_severe, y_train_severe)

    # Save regressor
    io.save_scikit_model(regressor,
                         filename=options.save_file,
                         ext_filename=options.save_file)

    ### RESULTS FOR VALIDATION SET ############################################

    # Drop data with missing delay
    has_test_class = ~np.isnan(y_test_class)
    X_test = X_test[has_test_class]
    y_test_class = y_test_class[has_test_class]
    data_test = data_test[~np.isnan(data_test.delay)]

    # Metrics
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.y_pred_proba

    # Classification performance

    # LSTM don't have first timesteps
    y_test_class = y_test_class[(len(X_test)-len(y_pred)):]

    acc = accuracy_score(y_test_class, y_pred)
    precision = precision_score(y_test_class, y_pred, average='micro')
    recall = recall_score(y_test_class, y_pred, average='micro')
    f1 = f1_score(y_test_class, y_pred, average='micro')

    logging.info('Classification accuracy: {}'.format(acc))
    logging.info('Classification precision: {}'.format(precision))
    logging.info('Classification recall: {}'.format(recall))
    logging.info('Classification F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, binary_labels)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), True, filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve_validation.png'.format(options.output_path)
    viz.prec_rec_curve(y_test_class, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc_validation.png'.format(options.output_path)
    viz.plot_binary_roc(y_test_class, y_pred_proba, filename=fname)

    if options.regression == 'rfr':
        fname = options.output_path+'/rfc_feature_importance.png'
        viz.rfc_feature_importance(regressor.feature_importances_,
                                   fname,
                                   feature_names=options.feature_params)

    # Regression performance. Metrics compare the regression prediction with
    # the regression target; earlier the classifier output y_pred was scored.
    y_pred_reg, y_test_reg = predictor.pred(data=data_test)
    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)

    logging.info('Regression RMSE: {}'.format(rmse))
    logging.info('Regression MAE: {}'.format(mae))
    logging.info('Regression R2 score: {}'.format(r2))

    error_data = {'acc': [acc],
                  'precision': [precision],
                  'recall': [recall],
                  'f1': [f1],
                  'rmse': [rmse],
                  'mae': [mae],
                  'r2': [r2]}
    fname = '{}/training_time_classification_validation_errors.csv'.format(options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    ###########################################################################
    # EVALUATE
    ###########################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                reason_code_table=options.reason_code_table,
                                locations=options.locations,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Filter only timesteps with large distribution in the whole network
        if options.filter_delay_limit is not None:
            test_data = io.filter_delay_with_limit(test_data, options.filter_delay_limit)

        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info('Adding binary class to the test dataset with limit {}...'.format(options.delay_limit))
        test_data['class'] = test_data['delay'].map(
            lambda x: binary_labels[1] if x > options.delay_limit else binary_labels[0])
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=binary_labels)

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        times = [('2014-01-01', '2014-02-01'),
                 ('2016-06-01', '2016-07-01'),
                 ('2017-02-01', '2017-03-01'),
                 ('2011-02-01', '2011-03-01')]
        # NOTE(review): xscaler only exists when options.normalize is set.
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(
                    test_data, options.feature_params, classifier, xscaler,
                    start, end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
def _iter_time_windows(starttime, endtime, step):
    """Yield consecutive (start, end) windows of at most `step` that cover
    the range from `starttime` to `endtime`; the last window ends at
    `endtime`. Nothing is yielded when `starttime >= endtime`."""
    start = starttime
    while start < endtime:
        end = min(start + step, endtime)
        yield start, end
        start = end


def main():
    """
    Read the source table in 90-day windows, impute missing values and
    append each window to the destination table.

    The processed period (2009-11-29 - 2014-06-02) is fixed. Source and
    destination tables come from the module-level ``options``. A
    ``ValueError`` raised while handling a window is logged as a warning and
    the next window is processed.

    Every window is now visited: earlier, a window that turned out empty
    advanced the range without clamping it to the end time, which could end
    the loop before the final partial window.
    """
    bq = _bq.BQHandler()

    times = [{
        'starttime': dt.datetime.strptime('2009-11-29', "%Y-%m-%d"),
        'endtime': dt.datetime.strptime('2014-06-02', "%Y-%m-%d")
    }]

    logging.info('Using times: {}'.format(times))

    daystep = 90
    for t in times:
        windows = _iter_time_windows(t['starttime'], t['endtime'],
                                     timedelta(days=daystep))
        for start, end in windows:
            logging.info('Processing time range {} - {}'.format(
                start.strftime('%Y-%m-%d %H:%M'),
                end.strftime('%Y-%m-%d %H:%M')))

            logging.info('Reading data...')
            try:
                data = bq.get_rows(start,
                                   end,
                                   project=options.project,
                                   dataset=options.src_dataset,
                                   table=options.src_table)

                logging.info('Imputing missing values...')
                data = imputer.fit_transform(data)
                data.set_index(['time', 'trainstation'], inplace=True)

                # Nothing to store for this window
                if len(data) < 1:
                    continue

                bq.dataset_to_table(data, options.dst_dataset,
                                    options.dst_table)
            except ValueError as e:
                # Best effort: a failing window must not stop the others.
                logging.warning(e)
def main():
    """
    Train the conv-LSTM classifier on 24-step windows and report its
    performance on the held-out part of the data.

    Everything is configured through the module-level ``options`` object.
    The label is always the ``class`` column. Data is read from BigQuery,
    train types are filtered and aggregated, features are optionally
    normalised, and the data is split 67/33 into train and test sets. The
    model is fitted with balanced class weights, saved under
    ``options.save_path``, and its scores and learning curves are written
    under ``options.output_path``.
    """
    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    # In classification use always class as label param. It has to be a
    # list: it is concatenated with the other parameter lists below.
    options.label_params = ['class']

    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Building model...')
    model = convlstm.Classifier().get_model()

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names)

    data = bq.get_rows()

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['train_count', 'delay'],
                                aggs=aggs)

    log_class_dist(data.loc[:, 'class'])

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()

        non_scaled_data = data.loc[:, options.meta_params + ['class']]
        # Keep the index of `data`: concat(axis=1) aligns on index labels
        # and the index is no longer 0..n-1 after sorting.
        scaled_features = pd.DataFrame(
            xscaler.fit_transform(data.loc[:, options.feature_params]),
            columns=options.feature_params,
            index=data.index)

        data = pd.concat([non_scaled_data, scaled_features], axis=1)

    if options.pca:
        logging.info('Doing PCA analyzis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)

        # NOTE(review): the PCA-transformed features were computed but never
        # used; the original feature columns are kept and PCA only feeds the
        # explained-variance plot. Kept as is because the code below selects
        # columns by options.feature_params - confirm intent.
        data = pd.concat([non_processed_data, processed_data], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # NOTE(review): the default split shuffles rows, yet the generators
    # below build windows of 24 consecutive rows - confirm whether
    # shuffle=False was intended.
    data_train, data_test = train_test_split(data, test_size=0.33)

    # NOTE(review): the batch size from io.get_batch_size() was always
    # overwritten by this constant, so that call has been removed.
    batch_size = 512
    logging.info('Batch size: {}'.format(batch_size))

    # Initialization
    boardcb = TensorBoard(log_dir=options.log_dir,
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

    logging.info('Data shape: {}'.format(
        data_train.loc[:, options.feature_params].values.shape))

    train_classes = data_train.loc[:, 'class'].values
    class_labels = np.unique(train_classes)
    class_weights = class_weight.compute_class_weight('balanced',
                                                      classes=class_labels,
                                                      y=train_classes)
    # Keyed by class label so a weight goes with its class even when the
    # labels are not exactly 0..k-1.
    weights = {int(label): weight
               for label, weight in zip(class_labels, class_weights)}

    logging.info('Class weights: {}'.format(weights))

    data_gen = TimeseriesGenerator(
        data_train.loc[:, options.feature_params].values,
        to_categorical(data_train.loc[:, 'class'].values),
        length=24,
        sampling_rate=1,
        batch_size=batch_size)

    data_test_gen = TimeseriesGenerator(
        data_test.loc[:, options.feature_params].values,
        to_categorical(data_test.loc[:, 'class'].values),
        length=24,
        sampling_rate=1,
        batch_size=batch_size)

    # A batch is an (x, y) pair; both shapes come from the first batch.
    logging.info('X batch size: {}'.format(data_gen[0][0].shape))
    logging.info('Y batch size: {}'.format(data_gen[0][1].shape))

    # Pass the weight dict, not the raw weight array.
    history = model.fit_generator(data_gen,
                                  validation_data=data_test_gen,
                                  epochs=3,
                                  class_weight=weights,
                                  callbacks=[boardcb])

    model_fname = options.save_path + '/model.json'
    weights_fname = options.save_path + '/weights.h5'
    history_fname = options.save_path + '/history.pkl'
    io.save_model(model_fname, weights_fname, history_fname, model,
                  history.history)

    scores = model.evaluate_generator(data_test_gen)
    error_data = {}
    for name, score in zip(model.metrics_names, scores):
        logging.info('{}: {:.4f}'.format(name, score))
        error_data[name] = [score]

    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    pred_proba = model.predict_generator(data_test_gen)
    pred = np.argmax(pred_proba, axis=1)
    log_class_dist(pred)

    fname = options.output_path + '/learning_over_time.png'
    viz.plot_nn_perf(history.history,
                     metrics={
                         '[%]': {
                             'acc': 'Accuracy',
                             'F1': 'F1 Score',
                             'Precision': 'Precision',
                             'Recall': 'Recall'
                         }
                     },
                     filename=fname)
'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR, 'CRITICAL': logging.CRITICAL } logging.basicConfig(format=( "[%(levelname)s] %(asctime)s %(filename)s:%(funcName)s:%(lineno)s %(message)s" ), level=logging_level[options.logging_level]) logging.info('Using configuration: {} | {}'.format(options.config_filename, options.config_name)) # Helpers bq = bqhandler.BQHandler() io = IO(gs_bucket=options.gs_bucket) viz = Viz(io) state = State() starttime, endtime = io.get_dates(options) logging.info('Using dataset {} and time range {} - {}'.format( options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) if options.save_data: tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_train' tname = tname.replace('-', '_') bq.delete_table(options.project, options.feature_dataset, tname) tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_test' tname = tname.replace('-', '_')
def main():
    """Train the project's ``GP`` regressor on delay features and report errors.

    Steps, all driven by the module-level ``options`` object:

    1. Read rows for the configured time range from BigQuery and aggregate
       them per train type; optionally apply a running delay average,
       imputation and (for ``options.model == 'ard'``) random sub-sampling.
    2. Split features/labels 67/33 into train and test sets.
    3. Optionally standardise and PCA-project the features. Both
       preprocessors are fitted on the training split only and then applied
       to the test split.
    4. If ``options.cv`` is set, run a randomised hyper-parameter search and
       exit the process; otherwise fit ``GP`` and save it to
       ``options.save_file``.
    5. Compute RMSE, MAE, R2 and predictive variance on the test split, plot
       them and write them to a CSV under ``options.output_path``.

    Returns:
        None

    Raises:
        ValueError: re-raised (after logging) when loading or pre-processing
            the data fails.
        SystemExit: after the hyper-parameter search when ``options.cv`` is set.
    """
    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    print('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    # Metric series. Only one time range is processed here, so every list
    # ends up with exactly one entry; the list form is what the plotting and
    # CSV helpers below are called with.
    rmses, maes, r2s, variances = [], [], [], []
    start_times, end_times, end_times_obj = [], [], []

    start, end = starttime, endtime
    print('Processing time range {} - {}'.format(
        start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

    try:
        print('Reading data...')
        data = bq.get_rows(start,
                           end,
                           loc_col='trainstation',
                           project=options.project,
                           dataset=options.feature_dataset,
                           table=options.feature_table,
                           parameters=all_param_names)
        data = io.filter_train_type(labels_df=data,
                                    train_types=options.train_types,
                                    sum_types=True,
                                    train_type_column='train_type',
                                    location_column='trainstation',
                                    time_column='time',
                                    sum_columns=['delay'],
                                    aggs=aggs)

        if options.y_avg_hours is not None:
            data = io.calc_running_delay_avg(data, options.y_avg_hours)

        data.sort_values(by=['time', 'trainstation'], inplace=True)

        if options.impute:
            print('Imputing missing values...')
            # The train type column is dropped before imputing and restored
            # as an empty column afterwards.
            data.drop(columns=['train_type'], inplace=True)
            data = imputer.fit_transform(data)
            data.loc[:, 'train_type'] = None

        if options.model == 'ard' and len(data) > options.n_samples:
            print('Sampling {} values from data...'.format(options.n_samples))
            data = data.sample(options.n_samples)
    except ValueError as e:
        # Previously the error was swallowed, which left ``data`` unbound or
        # half-processed and made the run fail later with an unrelated error
        # (or train on inconsistent data). Fail here, with the real cause.
        logging.error('Loading or pre-processing data failed: {}'.format(e))
        raise

    print('Processing {} rows...'.format(len(data)))

    target = data.loc[:, options.label_params].astype(np.float32).values
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.33)
    logging.debug('Features shape: {}'.format(X_train.shape))

    if options.normalize:
        print('Normalizing data...')
        print(X_train)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Apply the statistics learned from the training split; re-fitting on
        # the test split would scale the two sets differently.
        X_test = scaler.transform(X_test)

    if options.pca:
        print('Doing PCA analyzis for the data...')
        X_train = ipca.fit_transform(X_train)
        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        # Project the test split onto the components fitted on the training
        # split; re-fitting would give an unrelated basis.
        X_test = ipca.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    print('Training...')
    print(X_train.shape)
    input_dim = X_train.shape[1]

    # NOTE(review): this kernel is constructed but not passed to GP below;
    # kept because its construction was part of the original flow.
    k = gpflow.kernels.RBF(input_dim=input_dim, ARD=True)

    # Random matrix handed to GP as ``Z``.
    Z = np.random.rand(150, input_dim)

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        param_grid = {"length_scale": [0.1, 1, 2], "whiten": [True, False]}
        model = GP(dim=input_dim, Z=Z)
        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)
        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")
        sys.exit()

    model = GP(dim=input_dim, Z=Z)
    model.fit(X_train.astype(np.float64),
              y_train.reshape((-1, 1)).astype(np.float64))
    model.save(options.save_file)

    print('Training finished')
    print(model.model)

    # Metrics
    y_pred, var = model.predict_f(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    rmses.append(rmse)
    maes.append(mae)
    r2s.append(r2)
    variances.append(var.mean())
    start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S'))
    end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S'))
    end_times_obj.append(end)

    print('RMSE: {:.2f}'.format(rmse))
    print('MAE: {:.2f}'.format(mae))
    print('Variance: {:.2f}-{:.2f}'.format(var.min(), var.max()))
    print('R2 score: {:.2f}'.format(r2))

    if options.model == 'rf':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # Plotting is best effort: a failure here must not lose the error CSV.
    try:
        fname = options.output_path + '/learning_over_time.png'
        viz.plot_learning_over_time(end_times_obj,
                                    rmses,
                                    maes,
                                    r2s,
                                    filename=fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'start_times': start_times,
        'end_times': end_times,
        'rmse': rmses,
        'mae': maes,
        'var': variances,
        'r2': r2s
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)
def main():
    """Predict delays with a saved model and write per-station error reports.

    Downloads the model stored at ``options.save_file``, loads feature rows
    for the configured time range and stations from BigQuery, predicts the
    delay for every row and then:

    * per station: computes RMSE, median absolute error and R2, writes the
      delay/prediction time series to CSV and draws a plot;
    * over all stations: writes the per-station error tables, the errors of
      the network-wide average delay and its time series, and draws one plot
      per station group plus one for all stations.

    All output goes under ``options.vis_path``; plots are uploaded to the
    bucket. Everything is driven by the module-level ``options`` object.

    Returns:
        None

    Raises:
        AssertionError: if no rows are left after train type filtering.
    """
    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    # Get params
    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    io._download_from_bucket(options.save_file, options.save_file)
    logging.info('Loadung model from {}...'.format(options.save_file))
    predictor = io.load_scikit_model(options.save_file)

    # Per-station error tables, keyed by station code
    station_rmse = {}
    station_median_abs_err = {}
    station_r2 = {}

    # If stations are given as argument use them, else use all stations
    logging.info('Loading stations from {}...'.format(options.stations_file))
    stationList = io.get_train_stations(options.stations_file)
    if options.stations is not None:
        stations = options.stations.split(',')
    else:
        stations = stationList.keys()

    # Read data and filter desired train types (ic and commuter)
    logging.info('Loading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset='trains_testset',
                       table='features_1',
                       parameters=all_param_names,
                       locations=stations)

    data = io.filter_train_type(labels_df=data,
                                train_types=['K', 'L'],
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['delay'],
                                aggs=aggs)

    assert len(data) > 0, "Empty data"

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    data.sort_values(by=['time', 'trainstation'], inplace=True)
    logging.info('Processing {} rows...'.format(len(data)))

    # Pick times for creating error time series
    all_times = data.loc[:, 'time'].unique()

    # Pick feature and label data from all data
    l_data = data.loc[:, options.meta_params + options.label_params]
    f_data = data.loc[:, options.meta_params + options.feature_params]

    target = l_data['delay'].astype(np.float64).values.ravel()
    features = f_data.drop(columns=['trainstation', 'time']).astype(
        np.float64).values

    logging.info('Predicting...')
    y_pred = predictor.predict(features)

    # Calculate quantiles
    logging.info('Calculating fractiles...')
    groups, avg, pred = io.pred_fractiles(l_data, y_pred, stationList)

    # Go through stations
    for station in stations:
        station_data = pred.loc[pred['trainstation'] == station, :]
        if len(station_data) < 1:
            continue

        times = station_data.loc[:, 'time']
        actual = station_data.loc[:, 'delay']
        predicted = station_data.loc[:, 'pred_delay']
        predicted_low = station_data.loc[:, 'pred_delay_low'].values
        predicted_high = station_data.loc[:, 'pred_delay_high'].values

        # The group label is read from the station's first row.
        group = station_data.loc[:, 'group'].values[0]
        stationName = '{} ({} | Group {})'.format(stationList[station]['name'],
                                                  station, group)

        logging.info('Processing station {} (having {} rows)...'.format(
            station, len(station_data)))

        logging.info('Calculating errors for given station...')
        rmse = math.sqrt(metrics.mean_squared_error(actual, predicted))
        median_abs_err = metrics.median_absolute_error(actual, predicted)
        r2 = metrics.r2_score(actual, predicted)

        # Put errors to timeseries
        station_rmse[station] = rmse
        station_median_abs_err[station] = median_abs_err
        station_r2[station] = r2

        logging.info('RMSE for station {}: {}'.format(stationName, rmse))
        # The value is a *median* absolute error; the message used to say
        # "Mean".
        logging.info('Median absolute error for station {}: {}'.format(
            stationName, median_abs_err))
        logging.info('R2 score for station {}: {}'.format(stationName, r2))

        # Create csv and upload it to bucket
        times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in times]
        delay_data = {
            'times': times_formatted,
            'delay': actual.values,
            'predicted delay': predicted.values,
            'low': predicted_low,
            'high': predicted_high
        }
        fname = '{}/delays_{}.csv'.format(options.vis_path, station)
        io.write_csv(delay_data, fname, fname)

        # Draw visualisation
        fname = '{}/{}.png'.format(options.vis_path, station)
        viz.plot_delay(times, actual.values, predicted.values,
                       'Delay for station {}'.format(stationName), fname,
                       predicted_low, predicted_high)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # Save all station related results to csv and upload them to bucket
    fname = '{}/station_rmse.csv'.format(options.vis_path)
    io.dict_to_csv(station_rmse, fname, fname)
    fname = '{}/station_median_absolute_error.csv'.format(options.vis_path)
    io.dict_to_csv(station_median_abs_err, fname, fname)
    fname = '{}/station_r2.csv'.format(options.vis_path)
    io.dict_to_csv(station_r2, fname, fname)

    # Create timeseries of avg actual delay and predicted delay
    all_times = sorted(all_times)
    avg_delay = avg.loc[:, 'avg_delay'].dropna().values.ravel()
    avg_pred_delay = avg.loc[:, 'avg_pred_delay'].dropna().values.ravel()

    # Calculate average over all times and stations
    rmse = math.sqrt(metrics.mean_squared_error(avg_delay, avg_pred_delay))
    median_abs_err = metrics.median_absolute_error(avg_delay, avg_pred_delay)
    r2 = metrics.r2_score(avg_delay, avg_pred_delay)

    logging.info('RMSE for average delay over all stations: {}'.format(rmse))
    logging.info(
        'Median absolute error for average delay over all stations: {}'.format(
            median_abs_err))
    logging.info('R2 score for average delay over all stations: {}'.format(r2))

    # Write average data into file. The 'mae' key is kept for compatibility
    # with existing consumers of the CSV although the value is the median
    # absolute error.
    avg_errors = {
        'rmse': rmse,
        'mae': median_abs_err,
        'r2': r2,
        'nro_of_samples': len(avg_delay)
    }
    fname = '{}/avg_erros.csv'.format(options.vis_path)
    io.dict_to_csv(avg_errors, fname, fname)

    # Create timeseries of average delay and predicted delays over all stations
    all_times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in all_times]
    delay_data = {
        'times': all_times_formatted,
        'delay': avg_delay,
        'predicted delay': avg_pred_delay
    }

    # write csv
    fname = '{}/avg_delays_all_stations.csv'.format(options.vis_path)
    io.write_csv(delay_data, fname, fname)

    # One plot per station group (groups 1..3); groups with fewer than two
    # time steps are skipped.
    for i in range(3):
        group_frame = groups[i]
        fname = '{}/avg_group_{}.png'.format(options.vis_path, (i + 1))

        times = group_frame.index.values
        if len(times) < 2:
            continue

        g_avg_delay = group_frame.loc[:, 'avg_delay'].values.ravel()
        g_avg_pred_delay = group_frame.loc[:, 'avg_pred_delay'].values.ravel()
        g_avg_pred_delay_low = group_frame.loc[:, 'avg_pred_delay_low'].values.ravel()
        g_avg_pred_delay_high = group_frame.loc[:, 'avg_pred_delay_high'].values.ravel()

        viz.plot_delay(times, g_avg_delay, g_avg_pred_delay,
                       'Average delay for group {}'.format(i + 1), fname,
                       g_avg_pred_delay_low, g_avg_pred_delay_high)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # visualise
    fname = '{}/avg_all_stations.png'.format(options.vis_path)
    viz.plot_delay(all_times, avg_delay, avg_pred_delay,
                   'Average delay for all station', fname)
    io._upload_to_bucket(filename=fname, ext_filename=fname)
class Options():
    """Plain attribute container that stands in for parsed command-line options."""
    pass


# Hard-coded settings used in place of command-line arguments; the config
# reader below is then called with the same object.
options = Options()
options.__dict__.update({
    'starttime': '2010-01-01',
    'endtime': '2018-01-01',
    'config_filename': 'cnf/rf.ini',
    'config_name': 'all_params_1',
    'stations_file': 'cnf/stations.json',
    'stations': None,  # e.g. 'PSL,OL,TPE,OV,PM,II,KEM,HKI'
    'gs_bucket': 'trains-data',
})

_config.read(options)

# Helpers
bq = _bq.BQHandler()
io = _io.IO(gs_bucket=options.gs_bucket)
viz = _viz.Viz()

starttime, endtime = io.get_dates(options)

# Get params
all_param_names = options.label_params + options.feature_params + options.meta_params
aggs = io.get_aggs_from_param_names(options.feature_params)

print('Loading stations from {}...'.format(options.stations_file))
stationList = io.get_train_stations(options.stations_file)

# Use the explicitly listed stations when given, otherwise every known station.
stations = (stationList.keys()
            if options.stations is None else options.stations.split(','))
def main():
    """Train and validate a delay-class classifier (``bgm`` or ``rfc``).

    Reads feature rows for the configured time range from BigQuery,
    aggregates them per train type, optionally adds a ``month`` feature and
    balances the four classes, then splits the data 90/10 into train and
    validation sets. Features are optionally standardised and PCA-projected;
    both preprocessors are fitted on the training split only.

    With ``options.cv`` set, a randomised hyper-parameter search is run, its
    report uploaded and the process exits. Otherwise the model is fitted,
    saved, scored on the validation split (accuracy and micro-averaged
    precision/recall/F1) and the results, confusion matrices and, for
    ``rfc``, feature importances are written under ``options.output_path``.

    Everything is driven by the module-level ``options`` object. Note that
    ``options.feature_params`` is extended in place when ``options.month`` is
    set.

    Returns:
        None

    Raises:
        ValueError: if ``options.model`` is not ``'bgm'`` or ``'rfc'``.
        NotImplementedError: if a hyper-parameter search is requested for
            ``rfc``.
        SystemExit: after the hyper-parameter search when ``options.cv`` is set.
    """
    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'bgm':
        model = BayesianGaussianMixture(
            weight_concentration_prior_type="dirichlet_process",
            n_components=options.n_components)
    elif options.model == 'rfc':
        model = RandomForestClassifier(n_jobs=-1)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception
        # so the message reaches the user.
        raise ValueError(
            'Model not specificied or wrong. Add "model: bgm" to config file.')

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    sum_columns = []
    if options.reason_code_table is not None:
        sum_columns = ['count']

    logging.info('Processing time range {} - {}'.format(
        starttime.strftime('%Y-%m-%d %H:%M'),
        endtime.strftime('%Y-%m-%d %H:%M')))

    logging.info('Reading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset=options.feature_dataset,
                       table=options.feature_table,
                       parameters=all_param_names,
                       reason_code_table=options.reason_code_table)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    data.sort_values(by=['time', 'trainstation'], inplace=True)
    logging.info('Processing {} rows...'.format(len(data)))

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data['time'].map(lambda x: x.month)
        options.feature_params.append('month')

    if options.balance:
        logging.info('Balancing data...')
        # Down-sample every class (0..3) to the size of the smallest one.
        count = data.groupby('class').size().min()
        data = pd.concat([
            data.loc[data['class'] == label].sample(n=count)
            for label in range(4)
        ])

    target = data.loc[:, options.label_params].astype(np.int32).values.ravel()
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.10)

    logging.debug('Features shape: {}'.format(X_train.shape))
    io.log_class_dist(y_train, np.arange(4))

    if options.normalize:
        logging.info('Normalizing data...')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Use the training statistics for the validation split as well; the
        # saved scaler below is the one later applied to unseen data.
        X_test = scaler.transform(X_test)

    if options.pca:
        logging.info('Doing PCA analyzis for the data...')
        X_train = ipca.fit_transform(X_train)
        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        # Project onto the components fitted on the training split.
        X_test = ipca.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    if options.cv:
        logging.info('Doing random search for hyper parameters...')

        if options.model == 'bgm':
            param_grid = {
                "n_components": [1, 2, 4, 8, 16],
                "covariance_type": ['full', 'tied', 'diag', 'spherical'],
                "init_params": ['kmeans', 'random']
            }
        elif options.model == 'rfc':
            raise NotImplementedError("Not implemented. Get back to work!")
        else:
            raise ValueError("No param_grid set for given model ({})".format(
                options.model))

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)
        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")

        fname = options.output_path + '/random_search_cv_results.txt'
        report_cv_results(random_search.cv_results_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        sys.exit()

    logging.info('Training...')
    model.fit(X_train, y_train)

    # Save the model right after training so it survives a failure in the
    # reporting steps below.
    io.save_scikit_model(model,
                         filename=options.save_file,
                         ext_filename=options.save_file)

    # Performance
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    logging.info('Accuracy: {}'.format(acc))
    logging.info('Precision: {}'.format(precision))
    logging.info('Recall: {}'.format(recall))
    logging.info('F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, labels=np.arange(4))

    error_data = {
        'acc': [acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1]
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(4), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(
        options.output_path)
    viz.plot_confusion_matrix(y_test,
                              y_pred,
                              np.arange(4),
                              True,
                              filename=fname)

    # Save the fitted scaler next to the model
    if options.normalize:
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    if options.model == 'rfc':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_,
                                   fname,
                                   feature_names=options.feature_params)
def main():
    """Train and validate the two-stage classifier + regressor pipeline.

    A classifier (loaded from ``options.classifier_file`` when that attribute
    exists, otherwise a fresh ``svc`` or ``rfc``) is wrapped in a selector
    step, followed by a random forest regressor. The function:

    1. Reads feature rows from BigQuery (optionally restricted to one month),
       aggregates them per train type and optionally filters time steps by
       ``options.filter_delay_limit``.
    2. Adds a binary ``class`` column: ``1`` when ``delay`` exceeds
       ``options.delay_limit``, else ``-1``.
    3. Splits 70/30 into train and validation sets, optionally balances the
       training set, adds a ``month`` feature and stores the splits in
       BigQuery.
    4. Optionally standardises features (with a stored scaler when
       ``options.xscaler_file`` exists), fits the pipeline and saves it.
    5. Reports classification and regression metrics and plots under
       ``options.output_path``.
    6. With ``options.evaluate`` set, loads the separate test table and
       reports metrics for a fixed list of time ranges.

    Everything is driven by the module-level ``options`` object. Note that
    ``options.feature_params`` is extended in place when ``options.month`` is
    set.

    Returns:
        None

    Raises:
        ValueError: if ``options.classifier`` or ``options.regression`` names
            an unsupported model.
        NotImplementedError: if ``options.cv`` is set; no parameter grid is
            defined for this pipeline.
    """
    # Helpers
    bq = bqhandler.BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)

    # Configuration
    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    # Initialise classifier
    if hasattr(options, 'classifier_file'):
        classifier = io.load_scikit_model(options.classifier_file)
    elif options.classifier == 'svc':
        params = {
            'kernel': options.kernel,
            'gamma': options.gamma,
            'C': options.penalty,
            'probability': options.probability
        }
        classifier = SVC(**params)
    elif options.classifier == 'rfc':
        classifier = RandomForestClassifier(n_jobs=-1)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError(
            'Model not specificied or wrong. Add "classifier: bgm" to config file.'
        )

    # Initialise regression model
    if options.regression == 'rfr':
        model = RandomForestRegressor(
            n_estimators=options.n_estimators,
            n_jobs=-1,
            min_samples_leaf=options.min_samples_leaf,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            bootstrap=options.bootstrap)
        regressor = _trans.Regressor(model=model)
    else:
        # The message used to be a copy of the classifier one and pointed at
        # the wrong configuration key.
        raise ValueError(
            'Regression model not specified or wrong. Add "regression: rfr" to config file.'
        )

    # Initialise transformer
    transformer = _trans.Selector(classifier=classifier)

    # Initialise pipeline
    pipe = Pipeline([('selector', transformer), ('regression', regressor)])

    sum_columns = ['delay']
    if options.reason_code_table is not None:
        sum_columns = ['count']

    # Pick only selected month
    where = {}
    if options.pick_month is not None:
        where = {'EXTRACT(MONTH from time)': options.pick_month}

    logging.info('Reading data...')
    bq.set_params(loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  reason_code_table=options.reason_code_table,
                  where=where)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    data.sort_values(by=['time', 'trainstation'], inplace=True)
    logging.info('Processing {} rows...'.format(len(data)))

    # Filter only timesteps with large distribution in the whole network
    if options.filter_delay_limit is not None:
        data = io.filter_delay_with_limit(data, options.filter_delay_limit)

    # Binary class
    logging.info('Adding binary class to the dataset with limit {}...'.format(
        options.delay_limit))
    data['class'] = data['delay'].map(lambda x: 1
                                      if x > options.delay_limit else -1)

    # Separate train and validation sets
    data_train, data_test = train_test_split(data, test_size=0.3)

    # Balance
    if options.balance:
        logging.info('Balancing training data...')
        count = data_train.groupby('class').size().min()
        # SVC can't handle more than 50 000 samples
        if options.classifier == 'svc':
            count = min(count, 50000)
        data_train = pd.concat([
            data_train.loc[data_train['class'] == -1].sample(n=count),
            data_train.loc[data_train['class'] == 1].sample(n=count)
        ])

    logging.info('Train data:')
    io.log_class_dist(data_train.loc[:, 'class'].values, labels=[-1, 1])
    logging.info('Test data:')
    io.log_class_dist(data_test.loc[:, 'class'].values, labels=[-1, 1])

    # Adding month
    if options.month:
        logging.info('Adding month to the datasets...')
        data_train['month'] = data_train.loc[:, 'time'].map(lambda x: x.month)
        data_test['month'] = data_test.loc[:, 'time'].map(lambda x: x.month)
        options.feature_params.append('month')

    data_train.set_index('time', inplace=True)

    # Column 0 of the label arrays is the delay (regression target),
    # column 1 the binary class (classification target).
    y_train = data_train.loc[:, ['delay', 'class']].astype(np.int32).values
    y_test = data_test.loc[:, ['delay', 'class']].astype(np.int32).values

    X_train = data_train.loc[:, options.feature_params].astype(
        np.float32).values
    X_test = data_test.loc[:, options.feature_params].astype(np.float32).values

    # If asked, save used train and test splits into big query
    if options.save_data:
        tname_prefix = (options.model + '_' + options.feature_dataset + '_' +
                        options.config_name)
        column_names = [options.feature_params, ['delay', 'class']]

        bq.nparray_to_table([X_train, y_train], column_names, options.project,
                            options.feature_dataset, tname_prefix + '_train')
        bq.nparray_to_table([X_test, y_test], column_names, options.project,
                            options.feature_dataset, tname_prefix + '_test')

    # Bound up front so the evaluation section can pass it on even when
    # normalisation is switched off (it used to hit a NameError there).
    # NOTE(review): assumes predict_timerange accepts scaler=None - confirm.
    scaler = None
    if options.normalize:
        logging.info('Normalizing data...')
        if hasattr(options, 'xscaler_file'):
            scaler = io.load_scikit_model(options.xscaler_file)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        else:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        # No parameter grid exists for this pipeline. The search code that
        # used to follow the raise was unreachable and referred to an
        # undefined ``param_grid``.
        raise NotImplementedError(
            "No param_grid set for given model ({})".format(
                options.regression))

    logging.info('Training...')
    transformer.set_y(y_train[:, 0])
    regressor.set_classifier(transformer)
    pipe.fit(X_train, y_train[:, 1])

    # Metrics
    y_pred_proba = pipe.steps[0][1].predict_proba(X_test)
    y_pred = pipe.steps[0][1].predict(X_test, type='int')

    io.save_scikit_model(pipe,
                         filename=options.save_file,
                         ext_filename=options.save_file)

    # Classification performance
    y_test_class = y_test[:, 1]

    acc = accuracy_score(y_test_class, y_pred)
    precision = precision_score(y_test_class, y_pred, average='micro')
    recall = recall_score(y_test_class, y_pred, average='micro')
    f1 = f1_score(y_test_class, y_pred, average='micro')

    logging.info('Classification accuracy: {}'.format(acc))
    logging.info('Classification precision: {}'.format(precision))
    logging.info('Classification recall: {}'.format(recall))
    logging.info('Classification F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, [-1, 1])

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class,
                              y_pred,
                              np.arange(2),
                              filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(
        options.output_path)
    viz.plot_confusion_matrix(y_test_class,
                              y_pred,
                              np.arange(2),
                              True,
                              filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve.png'.format(options.output_path)
    viz.prec_rec_curve(y_test_class, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc.png'.format(options.output_path)
    viz.plot_binary_roc(y_test_class, y_pred_proba, filename=fname)

    if options.normalize:
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    if options.regression == 'rfr':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(pipe.steps[1][1].get_feature_importances(),
                                   fname,
                                   feature_names=options.feature_params)

    # Regression performance
    y_test_reg = y_test[:, 0]
    pipe.set_params(selector__full=True)
    y_pred = pipe.predict(X_test, full=True)

    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred))
    mae = mean_absolute_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)

    logging.info('Regression RMSE: {}'.format(rmse))
    logging.info('Regression MAE: {}'.format(mae))
    logging.info('Regression R2 score: {}'.format(r2))

    error_data = {
        'acc': [acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1],
        'rmse': [rmse],
        'mae': [mae],
        'r2': [r2]
    }
    fname = '{}/training_time_classification_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Sorting is actually not necessary. It's been useful for debugging.
        test_data.sort_values(by=['time', 'trainstation'], inplace=True)
        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info(
            'Adding binary class to the test dataset with limit {}...'.format(
                options.delay_limit))
        test_data['class'] = test_data['delay'].map(
            lambda x: 1 if x > options.delay_limit else -1)
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=[-1, 1])

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        # Time ranges reported separately
        times = [('2014-01-01', '2014-02-01'), ('2016-06-01', '2016-07-01'),
                 ('2017-02-01', '2017-03-01'), ('2011-02-01', '2017-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(
                    test_data, options.feature_params, pipe.steps[0][1],
                    scaler, start, end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
def main():
    """
    Visualise the delay dataset.

    Rows for the time range returned by ``io.get_dates(options)`` are read
    with ``BQHandler.get_rows``. ``options.what`` is a comma separated list
    selecting which figures are produced under ``options.save_path``:

    * ``histograms``       -- delay histograms (all rows, per train type,
                              summed passenger trains, comparison figure)
    * ``history``          -- mean and median delays grouped by index
    * ``heatmap``          -- one ``heatmap_year`` call per year 2010-2018
    * ``detailed_heatmap`` -- one ``heatmap_day`` call per day of the range
    """
    a = _bq.BQHandler()
    io = _io.IO(gs_bucket='trains-data')
    viz = _viz.Viz()

    # exist_ok replaces the racy "check, then create" sequence
    os.makedirs(options.save_path, exist_ok=True)

    starttime, endtime = io.get_dates(options)
    logging.debug(options.what)
    what = options.what.split(',')
    logging.debug(what)

    all_param_names = [
        'time', 'trainstation', 'train_type', 'train_count', 'total_delay',
        'delay', 'name', 'lat', 'lon'
    ]

    logging.info('Loading classification dataset from db')
    logging.info('Using time range {} - {}'.format(
        starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d')))

    # Read the raw rows; source project/dataset/table are fixed here
    l_data = a.get_rows(starttime,
                        endtime,
                        loc_col='trainstation',
                        project='trains-197305',
                        dataset='trains_2009_18',
                        table='features',
                        parameters=all_param_names)

    # Train types 'L' and 'K' summed together. This runs before l_data is
    # re-indexed below, exactly as in the original statement order.
    passangers = io.filter_train_type(labels_df=l_data,
                                      train_types=['L', 'K'],
                                      sum_types=True)

    l_data.set_index(pd.to_datetime(l_data.loc[:, 'time']), inplace=True)

    ############################################################################
    if 'histograms' in what:
        # All delays
        filename = options.save_path + '/hist_all_delays_all.png'
        viz.hist_all_delays(
            l_data.loc[:, ['train_type', 'train_count', 'delay',
                           'total_delay']],
            filename)

        # Different train types
        for name, t in train_types.items():
            filename = options.save_path + '/hist_all_delays_{}.png'.format(
                name)
            df = _rows_of_train_type(l_data, t)
            viz.hist_all_delays(df.loc[:, statlist], filename)

        # All passanger trains
        filename = options.save_path + '/hist_all_delays_passanger.png'
        viz.hist_all_delays(passangers.loc[:, statlist], filename)

        # All parameters: every row (A) next to rows with delay > 50 (B).
        # -99 is replaced with NaN in place, so the sections below see the
        # replaced values as well whenever 'histograms' is selected.
        passangers.replace(-99, np.nan, inplace=True)
        delayed_data = passangers[passangers.loc[:, 'delay'] > 50]
        comp_data = pd.concat([passangers, delayed_data],
                              axis=1,
                              keys=['A', 'B'])
        filename = options.save_path + '/histograms_compare.png'
        viz.all_hist(comp_data, filename=filename)

    ############################################################################
    if 'history' in what:
        # NOTE(review): only l_data is re-indexed by time above; passangers
        # keeps whatever index filter_train_type returned, so its "over
        # time" grouping may not be per timestamp -- confirm.
        for agg in ('mean', 'median'):
            prefix = options.save_path + '/{}_delays_over_time_'.format(agg)

            # All trains
            _plot_delay_history(viz, l_data, statlist, agg,
                                prefix + 'all.png')

            # Passanger trains
            _plot_delay_history(viz, passangers, statlist, agg,
                                prefix + 'passanger.png')

            # Different train types
            for name, t in train_types.items():
                _plot_delay_history(viz, _rows_of_train_type(l_data, t),
                                    statlist, agg,
                                    prefix + '{}.png'.format(name))

    ############################################################################
    if 'heatmap' in what:
        locs = io.get_train_stations('cnf/stations.json')

        os.makedirs(options.save_path + '/heatmap', exist_ok=True)

        # NOTE(review): 2018 is also covered by the loop below, so this call
        # looks redundant; kept because heatmap_year is not visible here.
        heatmap_year(l_data, passangers, 2018, locs)

        for year in np.arange(2010, 2019, 1):
            heatmap_year(l_data, passangers, year, locs)

    ############################################################################
    if 'detailed_heatmap' in what:
        locs = a.get_locations_by_dataset(options.dataset,
                                          starttime=starttime,
                                          endtime=endtime,
                                          rettype='dict')

        os.makedirs(options.save_path + '/detailed_heatmap', exist_ok=True)

        # One heatmap per day, endtime exclusive
        d = starttime
        while d < endtime:
            heatmap_day(l_data, passangers, d, locs)
            d += dt.timedelta(days=1)


def _rows_of_train_type(frame, train_type):
    """Return the rows of ``frame`` whose 'train_type' equals ``train_type``."""
    return frame[frame.loc[:, 'train_type'].isin([train_type])]


def _plot_delay_history(viz, frame, columns, agg, filename):
    """Group ``frame`` by its index, aggregate ``columns`` with the groupby
    method named ``agg`` ('mean' or 'median') and plot it to ``filename``."""
    grouped = frame.groupby(frame.index)[columns]
    viz.plot_delays(getattr(grouped, agg)(), filename)
def main():
    """
    Train the LSTM delay model and store the model with its error metrics.

    Steps:
      1. read label, feature and meta columns with ``BQHandler`` and sum the
         configured train types with ``io.filter_train_type``
      2. optionally average the labels, standardise labels/features and fit
         an incremental PCA
      3. split into train and test sets and train batch by batch, evaluating
         on the whole test set after every step
      4. save the checkpoint, the label scaler (when normalising), the
         learning curve figure and a csv of the per-step errors

    With ``options.cv`` set, a randomized hyper parameter search is run on
    the first batch instead and the process exits.
    """
    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = (options.label_params + options.feature_params +
                       options.meta_params)
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  only_winters=options.only_winters)

    data = bq.get_rows()

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['train_count', 'delay'],
                                aggs=aggs)

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    if options.y_avg:
        data = io.calc_delay_avg(data)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()
        yscaler = StandardScaler()

        non_scaled_data = data.loc[:, options.meta_params]
        labels = data.loc[:, options.label_params].astype(
            np.float32).values.reshape((-1, 1))

        yscaler.fit(labels)

        # The sorted frame keeps its pre-sort index labels and pd.concat
        # aligns on labels, so the scaled frames must carry data.index;
        # with a default RangeIndex the meta columns could end up next to
        # the features and labels of a different row.
        scaled_labels = pd.DataFrame(yscaler.transform(labels),
                                     columns=['delay'],
                                     index=data.index)
        scaled_features = pd.DataFrame(xscaler.fit_transform(
            data.loc[:, options.feature_params].astype(np.float32)),
                                       columns=options.feature_params,
                                       index=data.index)

        data = pd.concat([non_scaled_data, scaled_features, scaled_labels],
                         axis=1)

    if options.pca:
        logging.info('Doing PCA analyzis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)
        # NOTE(review): the transformed features are computed but the
        # untransformed processed_data is what gets concatenated below, so
        # the model still trains on the original features. Switching to
        # processed_features also needs new feature names and input size
        # downstream -- confirm the intent before changing.
        processed_features = pd.DataFrame(ipca.transform(processed_data))

        data = pd.concat([non_processed_data, processed_data], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    data_train, data_test = train_test_split(data, test_size=0.33)
    X_test, y_test = io.extract_batch(data_test,
                                      options.time_steps,
                                      batch_size=None,
                                      pad_strategy=options.pad_strategy,
                                      quantile=options.quantile,
                                      label_params=options.label_params,
                                      feature_params=options.feature_params)

    # Define model
    batch_size = io.get_batch_size(data_train,
                                   options.pad_strategy,
                                   quantile=options.quantile)
    logging.info('Batch size: {}'.format(batch_size))
    model = LSTM.LSTM(options.time_steps,
                      len(options.feature_params),
                      1,
                      options.n_hidden,
                      options.lr,
                      options.p_drop,
                      batch_size=batch_size)

    # Initialization
    rmses, mses, maes, steps, train_mse = [], [], [], [], []
    saver = tf.train.Saver()
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    summary_writer = tf.summary.FileWriter(options.log_dir,
                                           graph=tf.get_default_graph())

    tf.summary.scalar('Validation_MSE', model.mse)
    tf.summary.scalar('Validation_RMSE', model.rmse)
    tf.summary.scalar('Validation_MAE', model.mae)
    tf.summary.histogram('y_pred_hist', model.y_pred)
    merged_summary_op = tf.summary.merge_all()
    # Created after merge_all() so it is not part of the merged op
    train_summary_op = tf.summary.scalar('Training_MSE', model.loss)

    train_step = 0
    start = 0
    while True:
        # If slow is set, go forward one time step at time,
        # else proceed whole batch size
        if options.slow:
            X_train, y_train = io.extract_batch(
                data_train,
                options.time_steps,
                start=start,
                pad_strategy=options.pad_strategy,
                quantile=options.quantile,
                label_params=options.label_params,
                feature_params=options.feature_params)
        else:
            X_train, y_train = io.extract_batch(
                data_train,
                options.time_steps,
                train_step,
                pad_strategy=options.pad_strategy,
                quantile=options.quantile,
                label_params=options.label_params,
                feature_params=options.feature_params)

        # Stop once the extracted batch is shorter than time_steps
        if len(X_train) < options.time_steps:
            break

        if options.cv:
            logging.info('Doing random search for hyper parameters...')

            param_grid = {
                "C": [0.001, 0.01, 0.1, 1, 10],
                "epsilon": [0.01, 0.1, 0.5],
                "kernel": ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
                "degree": [2, 3, 4],
                "shrinking": [True, False],
                "gamma": [0.001, 0.01, 0.1],
                "coef0": [0, 0.1, 1]
            }

            random_search = RandomizedSearchCV(
                model,
                param_distributions=param_grid,
                n_iter=int(options.n_iter_search),
                n_jobs=-1)

            random_search.fit(X_train, y_train)
            logging.info("RandomizedSearchCV done.")
            fname = options.output_path + '/random_search_cv_results.txt'
            report_cv_results(random_search.cv_results_, fname)
            io._upload_to_bucket(filename=fname, ext_filename=fname)
            # The search replaces training entirely
            sys.exit()

        if train_step == 0:
            logging.info('Training...')

        feed_dict = {model.X: X_train, model.y: y_train}
        _, loss, train_summary = sess.run(
            [model.train_op, model.loss, train_summary_op],
            feed_dict=feed_dict)

        summary_writer.add_summary(train_summary, train_step * batch_size)

        # Metrics on the whole test set
        feed_dict = {model.X: X_test, model.y: y_test}
        val_loss, rmse, mse, mae, y_pred, summary = sess.run(
            [
                model.loss, model.rmse, model.mse, model.mae, model.y_pred,
                merged_summary_op
            ],
            feed_dict=feed_dict)

        train_mse.append(loss)
        mses.append(mse)
        rmses.append(rmse)
        maes.append(mae)
        steps.append(train_step)

        summary_writer.add_summary(summary, train_step * batch_size)
        if train_step % 50 == 0:
            logging.info("Step {}:".format(train_step))
            logging.info("Training loss: {:.4f}".format(loss))
            logging.info("Validation MSE: {:.4f}".format(val_loss))
            logging.info('Validation RMSE: {}'.format(rmse))
            logging.info('Validation MAE: {}'.format(mae))
            logging.info('................')
            # Intermediate checkpoint
            saver.save(sess, options.save_file)

        train_step += 1
        start += 1
    # <-- while True:

    saver.save(sess, options.save_file)

    # Flush buffered summary events and release the session; neither is
    # used after the final checkpoint.
    summary_writer.close()
    sess.close()

    if options.normalize:
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, fname, fname)
    io._upload_dir_to_bucket(options.save_path, options.save_path)

    # Plotting is best effort: a failure is logged and the csv below is
    # still written.
    try:
        fname = options.output_path + '/learning_over_time.png'
        metrics = _learning_curve_spec(mses, train_mse, rmses, maes)
        viz.plot_learning(metrics, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'steps': steps,
        'mse': mses,
        'rmse': rmses,
        'mae': maes,
        'train_mse': train_mse
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)


def _learning_curve_spec(mses, train_mse, rmses, maes):
    """Build the list of panels (MSE, RMSE, MAE) passed to viz.plot_learning."""
    return [{
        'metrics': [{
            'values': mses,
            'label': 'Validation MSE'
        }, {
            'values': train_mse,
            'label': 'Train MSE'
        }],
        'y_label': 'MSE'
    }, {
        'metrics': [{
            'values': rmses,
            'label': 'Validation RMSE'
        }],
        'y_label': 'RMSE'
    }, {
        'metrics': [{
            'values': maes,
            'label': 'Validation MAE'
        }],
        'y_label': 'MAE'
    }]