def fit_model(io: IO, model: Sequential, preprocessed: List[Preprocessed]):
    epochs = io.get("epochs")
    model.reset_states()
    logline("splitting into training set and testing set ({}%)".format(io.get("split")))
    split = gen_split(preprocessed, io)
    log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    for i in range(epochs):
        logline("generating input and expected data for epoch {}/{}".format(i + 1, epochs))
        train_x, train_y = trim_params(gen_fit_params(split), io)
        logline("training epoch {}/{}".format(i + 1, epochs))
        callbacks = []
        if io.get("profile"):
            debug("profiling")
            callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1))
        model.fit(train_x, train_y,
                  batch_size=io.get("batch_size"),
                  epochs=1,
                  shuffle=False,
                  callbacks=callbacks)
        model.reset_states()

def output_split(all: List[Preprocessed], train: List[Preprocessed], io: IO):
    obj = {
        "training_set": list(map(lambda x: x.file_name, train)),
        "test_set": list(map(lambda x: x.file_name, filter(lambda x: x not in train, all))),
    }
    with open(io.get("output_train"), "w+") as out_file:
        json.dump(obj, out_file)
    logline("wrote training/testing config to {}".format(io.get("output_train")))

def read_test_files(io: IO) -> List[Preprocessed]:
    with open(io.get("input_preprocessed"), "rb") as preprocessed_file:
        file_configs = pickle.load(preprocessed_file)
    with open(io.get("input_train"), "r") as train_config_file:
        train_config = json.load(train_config_file)
    test_files_names = train_config["test_set"]
    preprocessed = map(lambda x: Preprocessed(x), file_configs)
    test_files = list(filter(lambda x: x.file_name in test_files_names, preprocessed))
    return test_files

def main(screen):
    vm = VirtualMachine()
    vm.initialize(program_path='/home/facetoe/Downloads/chio/INVADERS')
    io = IO(screen)
    io.initialize(screen)
    while True:
        vm.tick()
        if vm.needs_refresh:
            io.draw(vm.gfx_buffer)
            vm.needs_refresh = False
        sleep(0.01)

def __init__(self):
    self.io = IO()
    self.core = Core()
    banner = self.io.readBanner()
    args = self.io.getArguments()
    salt = args.salt
    if salt is None:
        salt = DEFAULT_SALT
    saltyBanner = self.core.addSalt(banner, salt)
    print(saltyBanner)

def fit_model(io: IO, model: Sequential, preprocessed: List[Preprocessed]):
    epochs = io.get("epochs")
    model.reset_states()
    logline("splitting into training set and testing set ({}%)".format(io.get("split")))
    split = gen_split(preprocessed, io)
    for i in range(epochs):
        logline("generating input and expected data for epoch {}/{}".format(i + 1, epochs))
        train_x, train_y = trim_params(gen_fit_params(split), io)
        logline("training epoch {}/{}".format(i + 1, epochs))
        model.fit(train_x, train_y, batch_size=io.get("batch_size"), epochs=1, shuffle=False)
        model.reset_states()

def output_split(all: List[Preprocessed], train: List[Preprocessed], io: IO):
    obj = {
        "training_set": list(map(lambda x: x.file_name, train)),
        "test_set": list(map(lambda x: x.file_name, filter(lambda x: x not in train, all))),
    }
    pathlib.Path(os.path.dirname(io.get("output_train"))).mkdir(parents=True, exist_ok=True)
    with open(io.get("output_train"), "w+") as out_file:
        json.dump(obj, out_file)
    logline("wrote training/testing config to {}".format(io.get("output_train")))

def gen_split(preprocessed: List[Preprocessed], io: IO) -> List[Preprocessed]:
    split = io.get("split")
    if split == 100:
        output_split(preprocessed, preprocessed, io)
        return preprocessed
    shuffled = random.sample(preprocessed, len(preprocessed))
    total_len = sum(map(lambda x: len(x.features), preprocessed))
    train_len = (total_len / 100.0) * split
    train_items = list()
    current_len = 0
    # Iterate over all but the last shuffled file so at least one file
    # is always left over for the test set
    for i in range(len(preprocessed) - 1):
        new_len = current_len + len(shuffled[i].features)
        if new_len >= train_len:
            output_split(preprocessed, train_items, io)
            return train_items
        current_len = new_len
        train_items.append(shuffled[i])
    output_split(preprocessed, train_items, io)
    return train_items

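# A minimal usage sketch of gen_split, assuming it runs next to the functions
# above with an IO instance whose "split" option is 80 (the default in the
# get_io() config further down). The _FakePreprocessed stub is hypothetical
# and only mimics the two attributes gen_split touches.
class _FakePreprocessed:
    def __init__(self, name: str, n_rows: int):
        self.file_name = name
        self.features = [0.0] * n_rows


def _demo_gen_split():
    io = get_io()  # assumed defaults: split=80
    files = [_FakePreprocessed("a.wav", 600),
             _FakePreprocessed("b.wav", 300),
             _FakePreprocessed("c.wav", 100)]
    train = gen_split(files, io)
    # train grows file by file (in shuffled order) and stops before the
    # running feature count would reach 80% of the 1000 rows; the files
    # left over become the test set that output_split writes to disk.
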
def predictions_to_out_file(predictions: np.array, io: IO):
    obj = {"items": [], "genre": {"hard": 0.5, "uptempo": 0.5}}
    interval = io.get("interval")
    melodies = list()
    cur_time = 0
    for i in range(len(predictions)):
        prediction = predictions[i]
        beat, melody = prediction
        if is_positive_beat(beat):
            cur_obj = {}
            cur_obj["type"] = "beat"
            cur_obj["time"] = cur_time
            obj["items"].append(cur_obj)
        if is_positive_melody(melody):
            cur_obj = {}
            cur_obj["type"] = "melody"
            cur_obj["time"] = cur_time
            cur_obj["duration"] = interval
            melodies.append(cur_obj)
        cur_time += interval
    obj["items"] = obj["items"] + stitch_melodies(melodies, io)
    return obj

def gen_outputs(file: MarkedAudioFile, io: IO) -> List[ExpectedOutput]:
    """Gen a list of marked outputs for given file"""
    out_len = len(file.bins_file.bins)
    outputs = [ExpectedOutput(False, False) for x in range(out_len)]
    interval = io.get("interval")
    for timestamp in file.json_file.timestamps:
        # Round it to the range
        timestamp_time = timestamp.timestamp * 1000
        closest = get_closest(timestamp_time, io)
        timestamp_index = int(closest / interval)
        if timestamp_index >= out_len:
            continue
        if timestamp.beat_type == "beat":
            output_mark = outputs[timestamp_index]
            output_mark.is_beat = True
        elif timestamp.beat_type == "melody":
            # get_closest takes the IO config as its second argument
            closest_end = get_closest(timestamp_time + (timestamp.length * 1000), io)
            for i in range(int((closest_end - closest) / interval)):
                # Guard against melodies that run past the end of the file
                if timestamp_index + i >= out_len:
                    break
                outputs[timestamp_index + i].is_melody = True
    return outputs

def get_io() -> IO:
    return IO({
        "i": IOInput(
            glob("../../data/tracks/*.wav"),
            list,
            has_input=True,
            arg_name="input_files",
            descr="Input .wav files",
            alias="input_files",
            is_generic=True,
        ),
        "a": IOInput(
            "../../data/analysis.json",
            str,
            has_input=True,
            arg_name="analysis",
            descr="Analysis JSON file",
            alias="analysis",
        ),
        "o": IOInput(
            "../../data/preprocessed.pickle",
            str,
            has_input=True,
            arg_name="output_file",
            descr="File in which the features and outputs get placed",
            alias="output_file",
        ),
        "n": IOInput(
            50,
            int,
            has_input=True,
            arg_name="interval",
            descr="Interval at which data is sent",
            alias="interval",
        ),
    })

def collect_input_paths(io: IO) -> List[str]:
    """Turn the input glob into file paths"""
    all_files = list(set(io.get("input_files")))
    wav_files = list(filter(lambda in_file: in_file.split(".")[-1] == "wav", all_files))
    return wav_files

def run_tests(io: IO, model: Sequential, test_files: List[Preprocessed]):
    model.reset_states()
    for file in test_files:
        logline("creating test params for {}".format(file.file_name))
        test_x, test_y = get_test_params(file)
        logline("making predictions")
        predictions: List[List[float]] = model.predict(test_x, batch_size=1, verbose=1)
        model.reset_states()
        mse_total: List[float] = list()
        correct = 0
        diff_score = 0
        for i in range(len(predictions)):
            prediction = predictions[i]
            actual: List[float] = test_y[i]
            diff = abs(actual[0] - prediction[0])
            diff_score += diff
            if is_in_range(diff):
                correct += 1
            mse_total.append(mean_squared_error(actual, prediction))
        logline(
            "predicted {}/{} within range ({}%) correct, score was {}/{}, mse was {}".format(
                correct,
                len(predictions),
                round(correct / len(predictions) * 100, 2),
                diff_score,
                len(predictions),
                round(sum(mse_total) / len(predictions), 4),
            ))
        out_obj = predictions_to_out_file(predictions, io)
        pathlib.Path(io.get("output_annotated")).mkdir(parents=True, exist_ok=True)
        out_path = os.path.join(io.get("output_annotated"), "{}.json".format(file.file_name))
        with open(out_path, "w+") as out_file:
            json.dump(out_obj, out_file)
        logline("wrote object to {}".format(out_path))

def start_server(io: IO):
    global interval
    interval = io.get("interval")
    port = io.get("port")
    httpd = HTTPServer(("", port),
                       partial(WebServer, directory=os.path.join(CUR_DIR, "public")))
    logline("listening at port", port)
    enter_group()
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    httpd.server_close()
    exit_group()
    logline("stopped listening")

def get_io() -> IO:
    return IO({
        "i": IOInput(
            "./data/preprocessed.pickle",
            str,
            has_input=True,
            arg_name="input_preprocessed",
            descr="Input preprocessed file",
            alias="input_preprocessed",
            is_generic=True,
        ),
        "iw": IOInput(
            "./data/weights.h5",
            str,
            has_input=True,
            arg_name="input_weights",
            descr="Input weights file",
            alias="input_weights",
        ),
        "im": IOInput(
            "./data/model.json",
            str,
            has_input=True,
            arg_name="input_model",
            descr="Input file for the model",
            alias="input_model",
        ),
        "it": IOInput(
            "./data/train_config.json",
            str,
            has_input=True,
            arg_name="input_train",
            descr="Input file for the train config",
            alias="input_train",
        ),
        "o": IOInput(
            "./data/annotated/",
            str,
            has_input=True,
            arg_name="output_annotated",
            descr="Directory where annotated files are stored",
            alias="output_annotated",
        ),
        "n": IOInput(
            50,
            int,
            has_input=True,
            arg_name="interval",
            descr="Interval at which data is sent",
            alias="interval",
        ),
    })

def get_closest(timestamp_time: float, io: IO) -> int:
    """Get the closest multiple of INTERVAL to the timestamp"""
    interval = io.get("interval")
    lowerbound = (timestamp_time // interval) * interval
    upperbound = lowerbound + interval
    lowerbound_diff = timestamp_time - lowerbound
    upperbound_diff = upperbound - timestamp_time
    return lowerbound if lowerbound_diff <= upperbound_diff else upperbound

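# A quick sanity check of the rounding behaviour above, assuming an IO
# instance whose "interval" option is 50 (the default used throughout these
# configs). The _demo_* helper is hypothetical, added only for illustration.
def _demo_get_closest(io: IO):
    assert get_closest(120.0, io) == 100.0  # closer to 100 than to 150
    assert get_closest(130.0, io) == 150.0  # closer to 150 than to 100
    assert get_closest(125.0, io) == 100.0  # ties round down ("<=")
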
def trim_params(params: Tuple[np.ndarray, np.ndarray],
                io: IO) -> Tuple[np.ndarray, np.ndarray]:
    batch_size = io.get("batch_size")
    x_param, y_param = params
    length = x_param.shape[0]
    remainder = length % batch_size
    if remainder == 0:
        return params
    return x_param[:-remainder], y_param[:-remainder]

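# A minimal sketch of what trim_params does: drop the trailing samples that
# don't fill a complete batch, so a stateful model only ever sees full
# batches. Assumes io.get("batch_size") == 32 (the get_io() default below)
# and that np is the module's numpy import; the helper name is hypothetical.
def _demo_trim_params(io: IO):
    x = np.zeros((100, 4))
    y = np.zeros((100, 2))
    x_t, y_t = trim_params((x, y), io)
    assert x_t.shape[0] == 96 and y_t.shape[0] == 96  # 100 - (100 % 32)
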
def collect_input_paths(io: IO) -> Union[None, List[str]]:
    """Turn the input glob into file paths"""
    all_files = io.get("input_files")
    wav_files = list(filter(lambda in_file: in_file.split(".")[-1] == "wav", all_files))
    if len(wav_files) == 0:
        return None
    annotated_files = list(filter(has_json_file, wav_files))
    return annotated_files

def stitch_melodies(obj: List[Dict[str, Union[str, float]]],
                    io: IO) -> List[Dict[str, Union[str, float]]]:
    new_melodies = list()
    interval = io.get("interval")
    i = 0
    while i < len(obj):
        if len(new_melodies) > 0:
            last = new_melodies[-1]
            # If this segment starts exactly where the previous one ends,
            # merge it by extending the previous segment's duration
            if last["time"] + last["duration"] == obj[i]["time"]:
                last["duration"] += interval
                i += 1
                continue
        new_melodies.append(obj[i])
        i += 1
    return new_melodies

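# A small sketch of the stitching, assuming io.get("interval") == 50: three
# back-to-back melody segments collapse into one segment with a summed
# duration, while a gap starts a new segment. The helper is hypothetical.
def _demo_stitch_melodies(io: IO):
    segments = [
        {"type": "melody", "time": 0, "duration": 50},
        {"type": "melody", "time": 50, "duration": 50},
        {"type": "melody", "time": 100, "duration": 50},
        {"type": "melody", "time": 300, "duration": 50},  # gap before this one
    ]
    stitched = stitch_melodies(segments, io)
    assert stitched[0] == {"type": "melody", "time": 0, "duration": 150}
    assert stitched[1]["time"] == 300
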
def get_io() -> IO:
    return IO({
        "p": IOInput(
            1234,
            int,
            has_input=True,
            arg_name="port",
            descr="The port on which to host it",
            alias="port",
        ),
        "n": IOInput(
            50,
            int,
            has_input=True,
            arg_name="interval",
            descr="Interval at which data is sent",
            alias="interval",
        ),
    })

def predictions_to_out_file(predictions: np.array, io: IO):
    obj: Dict[str, Any] = {"items": [], "genre": {"hard": 0.5, "uptempo": 0.5}}
    interval = io.get("interval")
    cur_time = 0
    for i in range(len(predictions)):
        prediction: Tuple[float] = predictions[i]
        confidence: float = prediction[0]
        if is_in_range(confidence):
            cur_obj = {}
            cur_obj["type"] = "beat"
            cur_obj["time"] = cur_time
            obj["items"].append(cur_obj)
        cur_time += interval
    return obj

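# For reference, a sketch of the object the function above produces, assuming
# interval=50 and positive predictions at indices 0 and 2 (values illustrative,
# not taken from a real run):
#
# {
#     "items": [
#         {"type": "beat", "time": 0},
#         {"type": "beat", "time": 100}
#     ],
#     "genre": {"hard": 0.5, "uptempo": 0.5}
# }
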
def match_files(io: IO, input_paths: List[str]):
    """Match found files to analysis file contents"""
    analysis_file = io.get("analysis")
    logline(analysis_file)
    analysis = AnalysisFile(analysis_file)
    mapped: Dict[str, str] = {}
    reverse_map: Dict[str, str] = {}
    for in_path in input_paths:
        file_name = in_path.split("/")[-1].split(".")[0]
        for track_analysis in analysis.tracks:
            if track_analysis.name.lower() in file_name.lower():
                mapped[in_path] = track_analysis.name
                reverse_map[track_analysis.name] = file_name
                break
    logline("came up with the following mapping:")
    logline("")
    for file_name in mapped:
        logline('"{}" -> "{}"'.format(file_name, mapped[file_name]))
    unmapped_amount: int = 0
    for in_path in input_paths:
        if in_path not in mapped:
            warn('input file "{}" not mapped'.format(in_path))
            unmapped_amount += 1
    for track_analysis in analysis.tracks:
        if track_analysis.name not in reverse_map:
            warn('analysed file "{}" not mapped'.format(track_analysis.name))
            unmapped_amount += 1
    logline("")
    if unmapped_amount > 0:
        try:
            correct = input("is this correct? Y/n ")
            if correct.lower() == "n":
                return None
        except KeyboardInterrupt:
            return None
    return analysis, mapped

def gen_outputs(file: MarkedAudioFile, io: IO) -> List[ExpectedOutput]:
    """Gen a list of marked outputs for given file"""
    out_len = len(file.bins_file.bins)
    # TODO: change
    outputs = [ExpectedOutput(0) for _ in range(out_len)]
    interval = io.get("interval")
    for timestamp in file.timestamps:
        # Round it to the range
        timestamp_time = timestamp.timestamp * 1000
        closest = get_closest(timestamp_time, io)
        timestamp_index = int(closest / interval)
        if timestamp_index >= out_len:
            continue
        output_mark = outputs[timestamp_index]
        output_mark.beat_confidence = timestamp.confidence
    return outputs

def get_io() -> IO:
    return IO({
        "i": IOInput(
            "./data/preprocessed.pickle",
            str,
            has_input=True,
            arg_name="input_file",
            descr="Input preprocessed file",
            alias="input_file",
            is_generic=True,
        ),
        "ow": IOInput(
            "./data/weights.h5",
            str,
            has_input=True,
            arg_name="output_weights",
            descr="File in which the weights get stored",
            alias="output_weights",
        ),
        "ot": IOInput(
            "./data/train_config.json",
            str,
            has_input=True,
            arg_name="output_train",
            descr="File in which the training config gets stored",
            alias="output_train",
        ),
        "s": IOInput(
            80,
            int,
            has_input=True,
            arg_name="split",
            descr="The split between training and test sets",
            alias="split",
        ),
        "b": IOInput(
            32,
            int,
            has_input=True,
            arg_name="batch_size",
            descr="The batch size",
            alias="batch_size",
        ),
        "e": IOInput(
            10,
            int,
            has_input=True,
            arg_name="epochs",
            descr="The number of epochs",
            alias="epochs",
        ),
    })

def run_tests(io: IO, model: Sequential, test_files: List[Preprocessed]):
    model.reset_states()
    for file in test_files:
        logline("creating test params for {}".format(file.file_name))
        test_x, test_y = get_test_params(file)
        logline("making predictions")
        predictions = model.predict(test_x, batch_size=1, verbose=1)
        model.reset_states()
        mse_total = list()
        correct = 0
        for i in range(len(predictions)):
            prediction = predictions[i]
            actual = test_y[i]
            if (actual[0] == is_positive_beat(prediction[0])
                    and actual[1] == is_positive_melody(prediction[1])):
                correct += 1
            mse_total.append(mean_squared_error(actual, prediction))
        logline("predicted {}/{} ({}%) correct, mse was {}".format(
            correct,
            len(predictions),
            round(correct / len(predictions) * 100, 2),
            round(sum(mse_total) / len(predictions), 4),
        ))
        out_obj = predictions_to_out_file(predictions, io)
        out_path = os.path.join(io.get("output_annotated"), "{}.json".format(file.file_name))
        with open(out_path, "w+") as out_file:
            json.dump(out_obj, out_file)
        logline("wrote object to {}".format(out_path))

def main(): """ Main program """ # Print GPU availability local_device_protos = device_lib.list_local_devices() logging.info( [x.name for x in local_device_protos if x.device_type == 'GPU']) bq = BQHandler() io = IO(gs_bucket=options.gs_bucket) viz = Viz(io) starttime, endtime = io.get_dates(options) logging.info('Using dataset {}.{} and time range {} - {}'.format( options.feature_dataset, options.feature_table, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) all_param_names = list( set(options.label_params + options.feature_params + options.meta_params)) aggs = io.get_aggs_from_param_names(options.feature_params) logging.info('Building model...') dim = len(options.feature_params) if options.month: dim += 1 model = convlstm.Regression(options, dim).get_model() logging.info('Reading data...') bq.set_params(batch_size=2500000, loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.feature_table, parameters=all_param_names, locations=options.train_stations, only_winters=options.only_winters, reason_code_table=options.reason_code_table) data = bq.get_rows(starttime, endtime) data = io.filter_train_type(labels_df=data, train_types=options.train_types, sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['train_count', 'delay'], aggs=aggs) if options.y_avg_hours is not None: data = io.calc_running_delay_avg(data, options.y_avg_hours) if options.y_avg: data = io.calc_delay_avg(data) data.sort_values(by=['time', 'trainstation'], inplace=True) if options.month: logging.info('Adding month to the dataset...') data['month'] = data['time'].map(lambda x: x.month) options.feature_params.append('month') if options.normalize: logging.info('Normalizing data...') xscaler = StandardScaler() yscaler = StandardScaler() labels = data.loc[:, options.label_params].astype( np.float32).values.reshape((-1, 1)) yscaler.fit(labels) scaled_labels = pd.DataFrame(yscaler.transform(labels), columns=['delay']) non_scaled_data = data.loc[:, options.meta_params + ['class']] scaled_features = pd.DataFrame(xscaler.fit_transform( data.loc[:, options.feature_params]), columns=options.feature_params) data = pd.concat([non_scaled_data, scaled_features, scaled_labels], axis=1) fname = options.save_path + '/xscaler.pkl' io.save_scikit_model(xscaler, fname, fname) fname = options.save_path + '/yscaler.pkl' io.save_scikit_model(yscaler, fname, fname) if options.pca: logging.info('Doing PCA analyzis for the data...') ipca = IncrementalPCA(n_components=options.pca_components, whiten=options.whiten, copy=False) non_processed_data = data.loc[:, options.meta_params + options.label_params] processed_data = data.loc[:, options.feature_params] ipca.fit(processed_data) processed_features = pd.DataFrame(ipca.transform(processed_data)) data = pd.concat([non_processed_data, processed_data], axis=1) fname = options.output_path + '/ipca_explained_variance.png' viz.explained_variance(ipca, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) data_train, data_test = train_test_split(data, test_size=0.33) # Define model batch_size = 512 logging.info('Batch size: {}'.format(batch_size)) # Initialization losses, val_losses, accs, val_accs, steps = [], [], [], [], [] boardcb = TensorBoard(log_dir=options.log_dir + '/lstm', histogram_freq=0, write_graph=True, write_images=True) logging.info('Data shape: {}'.format( data_train.loc[:, options.feature_params].values.shape)) data_gen = TimeseriesGenerator( data_train.loc[:, 
options.feature_params].values, data_train.loc[:, options.label_params].values, length=24, sampling_rate=1, batch_size=batch_size) data_test_gen = TimeseriesGenerator( data_test.loc[:, options.feature_params].values, data_test.loc[:, options.label_params].values, length=24, sampling_rate=1, batch_size=batch_size) logging.info('X batch size: {}'.format(data_gen[0][0].shape)) logging.info('Y batch size: {}'.format(data_gen[1][0].shape)) history = model.fit_generator(data_gen, validation_data=data_test_gen, epochs=options.epochs, callbacks=[boardcb]) #, batch_size=64) history_fname = options.save_path + '/history.pkl' io.save_keras_model(options.save_file, history_fname, model, history.history) scores = model.evaluate_generator(data_test_gen) i = 0 error_data = {} for name in model.metrics_names: logging.info('{}: {:.4f}'.format(name, scores[i])) error_data[name] = [scores[i]] i += 1 fname = '{}/training_time_validation_errors.csv'.format( options.output_path) io.write_csv(error_data, filename=fname, ext_filename=fname) pred = model.predict_generator(data_test_gen) #io.log_class_dist(pred, 4) #print(history.history) fname = options.output_path + '/learning_over_time.png' viz.plot_nn_perf(history.history, metrics={ 'Error': { 'mean_squared_error': 'MSE', 'mean_absolute_error': 'MAE' } }, filename=fname)
def main(): """ Get data from db and save it as csv """ # Helpers bq = bqhandler.BQHandler() io = IO(gs_bucket=options.gs_bucket) viz = Viz(io) predictor = Predictor(io, ModelLoader(io), options) ### OPTIONS ################################################################ # Configuration starttime, endtime = io.get_dates(options) logging.info('Using dataset {} and time range {} - {}'.format(options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) all_param_names = options.label_params + options.feature_params + options.meta_params aggs = io.get_aggs_from_param_names(options.feature_params) ### MODELS ################################################################# # Initialise classifier if hasattr(options, 'classifier_file'): classifier = io.load_scikit_model(options.classifier_file) else: if options.classifier == 'svc': params = {'kernel': options.kernel, 'gamma': options.gamma, 'C': options.penalty, 'probability': options.probability} #classifier = SVC(**params) classifier = SVCClassifier(params, limit=options.class_limit) elif options.classifier == 'graphsvc': classifier = GraphSVCClassifier() graph_data = pd.read_csv(options.graph_data, names=['date', 'start_hour', 'src', 'dst', 'type', 'sum_delay','sum_ahead','add_delay','add_ahead','train_count']) classifier.fetch_connections(graph_data) elif options.classifier == 'gaussiannb': classifier = GaussianNBClassifier() elif options.classifier == 'lstm': num_of_features = len(options.feature_params) if options.month: num_of_features += 1 class_weight=None if hasattr(options, 'class_weight'): class_weight=eval(options.class_weight) params = {'length': options.time_steps, 'batch_size': options.batch_size, 'epochs': options.epochs, 'num_of_features': num_of_features, 'log_dir': options.log_dir, 'class_weight':class_weight} classifier = LSTMClassifier(**params) else: raise('Model not specificied or wrong. Add "classifier: bgm" to config file.') # Initialise regression model if options.regression == 'rfr': regressor = RandomForestRegressor(n_estimators=options.n_estimators, n_jobs=-1, min_samples_leaf=options.min_samples_leaf, min_samples_split=options.min_samples_split, max_features=options.max_features, max_depth=options.max_depth, bootstrap=options.bootstrap ) #regressor = _trans.Regressor(model=model) else: raise('Model not specificied or wrong. 
Add "classifier: bgm" to config file.') # Initialise transformer #transformer = _trans.Selector(classifier=classifier) # Initialise pipeline #pipe = Pipeline( # [('selector', transformer), # ('regression', regressor)] #) ### DATA ################################################################### sum_columns = ['delay'] if 'train_count' in options.meta_params: sum_columns.append('train_count') # Pick only selected month where = {} if options.pick_month is not None: where = {'EXTRACT(MONTH from time)': options.pick_month} logging.info('Reading data...') bq.set_params(loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.feature_table, parameters=all_param_names, locations=options.locations, only_winters=options.only_winters, reason_code_table=options.reason_code_table, where=where) data = bq.get_rows(starttime, endtime) data = io.filter_train_type(labels_df=data, train_types=options.train_types, sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=sum_columns, aggs=aggs) data['delay'] = data.loc[:, 'delay'].replace(-99, np.nan) data.sort_values(by=['trainstation', 'time'], inplace=True) logging.info('Processing {} rows...'.format(len(data))) # Filter only timesteps with large distribution in the whole network if options.filter_delay_limit is not None: data = io.filter_delay_with_limit(data, options.filter_delay_limit) # Binary class logging.info('Adding binary class to the dataset with limit {}...'.format(options.delay_limit)) def set_class(x): if x > options.delay_limit: return binary_labels[1] elif x < options.delay_limit: return binary_labels[0] return np.nan data['class'] = data['delay'].map(lambda x: set_class(x)) # Separate train and validation sets data_train, data_test = train_test_split(data, test_size=0.3, shuffle=False) # Balance if options.balance: logging.info('Balancing training data...') count = data_train.groupby('class').size().min() # SVC can't handle more than 50 000 samples if options.classifier == 'svc': count = min(count, 50000) data_train = pd.concat([data_train.loc[data_train['class'] == 0].sample(n=count), data_train.loc[data_train['class'] == 1].sample(n=count)]) logging.info('Train data:') io.log_class_dist(data_train.loc[:, 'class'].values, labels=binary_labels) logging.info('Test data:') io.log_class_dist(data_test.loc[:, 'class'].values, labels=binary_labels) # Adding month if options.month: logging.info('Adding month to the datasets...') data_train['month'] = data_train.loc[:,'time'].map(lambda x: x.month) data_test['month'] = data_test.loc[:,'time'].map(lambda x: x.month) options.feature_params.append('month') #data_train.set_index('time', inplace=True) #y_train_class = data_train.loc[:,['class']].astype(np.int32).values.ravel() #y_train_delay = data_train.loc[:,['delay']].astype(np.int32).values.ravel() y_train_class = data_train.loc[:,['class']].values.ravel() y_train_delay = data_train.loc[:,['delay']].values.ravel() #y_test_class = data_test.loc[:,['class']].astype(np.int32).values.ravel() #y_test_delay = data_test.loc[:,['delay']].astype(np.int32).values.ravel() y_test_class = data_test.loc[:,['class']].values.ravel() y_test_delay = data_test.loc[:,['delay']].values.ravel() X_train = data_train.loc[:,options.feature_params].astype(np.float32).values X_test = data_test.loc[:,options.feature_params].astype(np.float32).values if options.smote: logging.info('Smoting...') sm = SMOTE() X_train_class, y_class = sm.fit_resample(X_train, y_train_class) 
io.log_class_dist(y_class, labels=binary_labels) # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # io.log_class_dist(y_train[:,1], [-1,1]) # If asked, save used train and test splits into big query if options.save_data: tname = options.model+'_'+options.feature_dataset+'_'+options.config_name+'_train' columns = [options.feature_params, ['delay'], ['class']] bq.nparray_to_table([X_train, y_train_class, y_train_delay], columns, options.project, options.feature_dataset, tname ) tname = options.model+'_'+options.feature_dataset+'_'+options.config_name+'_test' bq.nparray_to_table([X_test, y_test_class, y_test_delay], columns, options.project, options.feature_dataset, tname ) if options.normalize: logging.info('Normalizing data...') #scale=(0,1) if hasattr(options, 'xscaler_file'): xscaler = io.load_scikit_model(options.xscaler_file) X_train = xscaler.transform(X_train) X_test = xscaler.transform(X_test) else: xscaler = MinMaxScaler(feature_range=(-1,1)) #xscaler = StandardScaler() X_train = xscaler.fit_transform(X_train) X_test = xscaler.transform(X_test) fname = options.save_path+'/xscaler.pkl' io.save_scikit_model(xscaler, fname, fname) if hasattr(options, 'yscaler_file'): yscaler = io.load_scikit_model(options.yscaler_file) y_train_delay = yscaler.transform(y_train_delay) y_test_delay = yscaler.transform(y_test_delay) else: #yscaler = MinMaxScaler(feature_range=(0,1)) yscaler=StandardScaler() y_train_delay = yscaler.fit_transform(y_train_delay.reshape(-1,1)).ravel() y_test_delay = yscaler.transform(y_test_delay.reshape(-1,1)).ravel() fname = options.save_path+'/yscaler.pkl' io.save_scikit_model(yscaler, fname, fname) data_train.loc[:,options.feature_params].to_csv('data/x_train.csv', index=False) data_test.loc[:,options.feature_params].to_csv('data/x_test.csv', index=False) data_train.loc[:,['class']].fillna(-99).astype(np.int).to_csv('data/y_train.csv', index=False) data_test.loc[:,['class']].fillna(-99).astype(np.int).to_csv('data/y_test.csv', index=False) sys.exit() ### TRAIN ################################################################## if options.cv: logging.info('Doing random search for hyper parameters...') raise("No param_grid set for given model ({})".format(options.regression)) random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=int(options.n_iter_search), n_jobs=-1) random_search.fit(X_train, y_train) logging.info("RandomizedSearchCV done.") fname = options.output_path+'/random_search_cv_results.txt' report_cv_results(random_search.cv_results_, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) sys.exit() else: logging.info('Training classifier...') if options.classifier == 'graphsvc': classifier.fit(X_train, y_train_class, stations=data_train.loc[:, 'trainstation'].values) else: history = classifier.fit(X_train, y_train_class, X_test, y_test_class) # Save classifier if options.classifier == 'lstm': history_fname = options.save_path+'/history.pkl' fname = options.save_path+'/classifier.h5' io.save_keras_model(fname, history_fname, classifier, history.history) else: fname = options.save_path+'/classifier.pkl' io.save_scikit_model(classifier, filename=fname, ext_filename=fname) # Drop data with no delay information X_train = X_train[~np.isnan(y_train_delay)] y_train_delay = y_train_delay[~np.isnan(y_train_delay)] y_train_class = y_train_class[~np.isnan(y_train_class)] y_pred_train_bin = classifier.predict(X_train, type='bool') # debug #y_pred_train_bin #indices = 
np.random.choice(np.arange(y_pred_train_bin.size), # replace=False, # size=int(y_pred_train_bin.size * 0.2)) #y_pred_train_bin[indices] = True #print('y_pred_train_bin: {}'.format(y_pred_train_bin.shape)) #print('y_train_delay: {}'.format(y_train_delay.shape)) #print('y_train_class: {}'.format(y_train_class.shape)) # Pick only severe values #y_train_delay_ = y_train_delay[(len(y_train_class)-len(y_pred_train_bin)):] #X_train_ = X_train[(len(y_train_class)-len(y_pred_train_bin)):] y_train_delay_ = y_train_delay[(len(y_train_delay)-len(y_pred_train_bin)):] X_train_ = X_train[(len(y_train_delay)-len(y_pred_train_bin)):] #print('y_train_delay_: {}'.format(y_train_delay_.shape)) y_train_severe = y_train_delay_[y_pred_train_bin] X_train_severe = X_train_[y_pred_train_bin] logging.info('Training regressor...') regressor.fit(X_train_severe, y_train_severe) # Save regressor io.save_scikit_model(regressor, filename=options.save_file, ext_filename=options.save_file) # Learning history # fname = options.output_path+'/learning_over_time.png' # viz.plot_nn_perf(history.history, metrics={'Error': {'mean_squared_error': 'MSE', # 'mean_absolute_error': 'MAE'}}, # filename=fname) ### RESULTS FOR VALIDATION SET ############################################# # Drop data with missing delay X_test = X_test[~np.isnan(y_test_class)] y_test_class = y_test_class[~np.isnan(y_test_class)] data_test = data_test[~np.isnan(data_test.delay)] # Metrics #y_pred_proba = classifier.predict_proba(X_test) y_pred = classifier.predict(X_test) y_pred_proba = classifier.y_pred_proba #y_test_delay = y_test_delay[~np.isnan(y_test_delay)] # Classification performance # LSTM don't have first timesteps y_test_class = y_test_class[(len(X_test)-len(y_pred)):] acc = accuracy_score(y_test_class, y_pred) precision = precision_score(y_test_class, y_pred, average='micro') recall = recall_score(y_test_class, y_pred, average='micro') f1 = f1_score(y_test_class, y_pred, average='micro') logging.info('Classification accuracy: {}'.format(acc)) logging.info('Classification precision: {}'.format(precision)) logging.info('Classification recall: {}'.format(recall)) logging.info('Classification F1 score: {}'.format(f1)) io.log_class_dist(y_pred, binary_labels) # Confusion matrices fname = '{}/confusion_matrix_validation.png'.format(options.output_path) viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), filename=fname) fname = '{}/confusion_matrix_validation_normalised.png'.format(options.output_path) viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), True, filename=fname) # Precision-recall curve fname = '{}/precision-recall-curve_validation.png'.format(options.output_path) viz.prec_rec_curve(y_test_class, y_pred_proba, filename=fname) # ROC fname = '{}/roc_validation.png'.format(options.output_path) viz.plot_binary_roc(y_test_class, y_pred_proba, filename=fname) if options.regression == 'rfr': fname = options.output_path+'/rfc_feature_importance.png' viz.rfc_feature_importance(regressor.feature_importances_, fname, feature_names=options.feature_params) # Regression performance y_pred_reg, y_test_reg = predictor.pred(data=data_test) #y_test_reg = y_test[(len(y_test)-len(y_pred)):,0] rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred)) mae = mean_absolute_error(y_test_reg, y_pred) r2 = r2_score(y_test_reg, y_pred) logging.info('Regression RMSE: {}'.format(rmse)) logging.info('Regression MAE: {}'.format(mae)) logging.info('Regression R2 score: {}'.format(r2)) error_data = {'acc': [acc], 'precision': [precision], 'recall': [recall], 
'f1': [f1], 'rmse': [rmse], 'mae': [mae], 'r2': [r2]} fname = '{}/training_time_classification_validation_errors.csv'.format(options.output_path) io.write_csv(error_data, filename=fname, ext_filename=fname) ############################################################################ # EVALUATE ############################################################################ if options.evaluate: logging.info('Loading test data...') test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"), dt.datetime.strptime('2019-01-01', "%Y-%m-%d"), loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.test_table, reason_code_table=options.reason_code_table, locations=options.locations, parameters=all_param_names) test_data = io.filter_train_type(labels_df=test_data, train_types=['K','L'], sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['delay'], aggs=aggs) # Sorting is actually not necessary. It's been useful for debugging. #test_data.sort_values(by=['time', 'trainstation'], inplace=True) # Filter only timesteps with large distribution in the whole network if options.filter_delay_limit is not None: test_data = io.filter_delay_with_limit(test_data, options.filter_delay_limit) test_data.set_index('time', inplace=True) logging.info('Test data contain {} rows...'.format(len(test_data))) logging.info('Adding binary class to the test dataset with limit {}...'.format(options.delay_limit)) test_data['class'] = test_data['delay'].map(lambda x: binary_labels[1] if x > options.delay_limit else binary_labels[0]) io.log_class_dist(test_data.loc[:, 'class'].values, labels=binary_labels) if options.month: logging.info('Adding month to the test dataset...') test_data['month'] = test_data.index.map(lambda x: x.month) times = [('2014-01-01', '2014-02-01'), ('2016-06-01', '2016-07-01'), ('2017-02-01', '2017-03-01'), ('2011-02-01', '2011-03-01')] for start, end in times: try: y_pred_proba, y_pred, y = predict_timerange(test_data, options.feature_params, classifier, xscaler, start, end) perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io) except EmptyDataError: logging.info('No data for {} - {}'.format(start, end))
logging_level = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL,
}
logging.basicConfig(
    format=("[%(levelname)s] %(asctime)s %(filename)s:%(funcName)s:%(lineno)s %(message)s"),
    level=logging_level[options.logging_level])

logging.info('Using configuration: {} | {}'.format(options.config_filename,
                                                   options.config_name))

# Helpers
bq = bqhandler.BQHandler()
io = IO(gs_bucket=options.gs_bucket)
viz = Viz(io)
state = State()

starttime, endtime = io.get_dates(options)
logging.info('Using dataset {} and time range {} - {}'.format(
    options.feature_dataset,
    starttime.strftime('%Y-%m-%d'),
    endtime.strftime('%Y-%m-%d')))

if options.save_data:
    tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_train'
    tname = tname.replace('-', '_')
    bq.delete_table(options.project, options.feature_dataset, tname)
    tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_test'
    tname = tname.replace('-', '_')
    bq.delete_table(options.project, options.feature_dataset, tname)

def main(): """ Get data from db and save it as csv """ bq = BQHandler() io = IO(gs_bucket=options.gs_bucket) viz = Viz(io=io) starttime, endtime = io.get_dates(options) logging.info('Using dataset {} and time range {} - {}'.format( options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) all_param_names = options.label_params + options.feature_params + options.meta_params aggs = io.get_aggs_from_param_names(options.feature_params) if options.model == 'rf': model = RandomForestRegressor( n_estimators=options.n_estimators, n_jobs=-1, min_samples_leaf=options.min_samples_leaf, min_samples_split=options.min_samples_split, max_features=options.max_features, max_depth=options.max_depth, bootstrap=options.bootstrap) elif options.model == 'lr': model = SGDRegressor(warm_start=True, max_iter=options.n_loops, shuffle=options.shuffle, power_t=options.power_t, penalty=options.regularizer, learning_rate=options.learning_rate, eta0=options.eta0, alpha=options.alpha, tol=0.0001) elif options.model == 'svr': model = SVR() elif options.model == 'ard': model = ARDRegression(n_iter=options.n_loops, alpha_1=options.alpha_1, alpha_2=options.alpha_2, lambda_1=options.lambda_1, lambda_2=options.lambda_2, threshold_lambda=options.threshold_lambda, fit_intercept=options.fit_intercept, copy_X=options.copy_X) elif options.model == 'gp': k_long_term = 66.0**2 * RBF(length_scale=67.0) k_seasonal = 2.4**2 * RBF(length_scale=90.0) * ExpSineSquared( length_scale=150, periodicity=1.0, periodicity_bounds=(0, 10000)) k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78) k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel( noise_level=0.19**2) #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise model = GaussianProcessRegressor( kernel=kernel_gpml, #alpha=0, optimizer=None, normalize_y=True) elif options.model == 'llasso': model = LocalizedLasso(num_iter=options.n_loops, batch_size=options.batch_size) elif options.model == 'nlasso': model = NetworkLasso(num_iter=options.n_loops, batch_size=options.batch_size) graph_data = pd.read_csv(options.graph_data, names=[ 'date', 'start_hour', 'src', 'dst', 'type', 'sum_delay', 'sum_ahead', 'add_delay', 'add_ahead', 'train_count' ]) #stations_to_pick = options.stations_to_pick.split(',') #graph = model.fetch_connections(graph_data, stations_to_pick) model.fetch_connections(graph_data) if options.pca: ipca = IncrementalPCA(n_components=options.pca_components, whiten=options.whiten, copy=False) rmses, maes, r2s, skills, start_times, end_times, end_times_obj = [], [], [], [], [], [], [] X_complete = [] # Used for feature selection start = starttime end = start + timedelta(days=int(options.day_step), hours=int(options.hour_step)) if end > endtime: end = endtime while end <= endtime and start < end: logging.info('Processing time range {} - {}'.format( start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M'))) # Load data ############################################################ try: logging.info('Reading data...') data = bq.get_rows(start, end, loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.feature_table, parameters=all_param_names, only_winters=options.only_winters) data = io.filter_train_type(labels_df=data, train_types=options.train_types, sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['train_count', 'delay'], aggs=aggs) # Filter 
only timesteps with large distribution in the whole network if options.filter_delay_limit is not None: data = io.filter_delay_with_limit(data, options.filter_delay_limit) if options.y_avg_hours is not None: data = io.calc_running_delay_avg(data, options.y_avg_hours) if options.y_avg: data = io.calc_delay_avg(data) data.sort_values(by=['time', 'trainstation'], inplace=True) if options.impute: logging.info('Imputing missing values...') data.drop(columns=['train_type'], inplace=True) data = imputer.fit_transform(data) data.loc[:, 'train_type'] = None if options.month: logging.info('Adding month to the dataset...') data['month'] = data['time'].map(lambda x: x.month) if 'month' not in options.feature_params: options.feature_params.append('month') if options.model == 'ard' and len(data) > options.n_samples: logging.info('Sampling {} values from data...'.format( options.n_samples)) data = data.sample(options.n_samples) l_data = data.loc[:, options.label_params] f_data = data.loc[:, options.feature_params] except ValueError as e: f_data, l_data = [], [] if len(f_data) < 2 or len(l_data) < 2: start = end end = start + timedelta(days=int(options.day_step), hours=int(options.hour_step)) continue logging.info('Processing {} rows...'.format(len(f_data))) train, test = train_test_split(data, test_size=0.1) X_train = train.loc[:, options.feature_params].astype(np.float32).values y_train = train.loc[:, options.label_params].astype( np.float32).values.ravel() X_test = test.loc[:, options.feature_params].astype(np.float32).values y_test = test.loc[:, options.label_params].astype( np.float32).values.ravel() logging.debug('Features shape: {}'.format(X_train.shape)) if options.normalize: logging.info('Normalizing data...') xscaler, yscaler = StandardScaler(), StandardScaler() X_train = xscaler.fit_transform(X_train) X_test = xscaler.transform(X_test) if len(options.label_params) == 1: y_train = yscaler.fit_transform(y_train.reshape(-1, 1)).ravel() #y_test = yscaler.transform(y_test.reshape(-1, 1)).ravel() else: y_train = yscaler.fit_transform(y_train) #y_test = yscaler.transform(y_test) if options.pca: logging.info('Doing PCA analyzis for the data...') X_train = ipca.fit_transform(X_train) fname = options.output_path + '/ipca_explained_variance.png' viz.explained_variance(ipca, fname) #io._upload_to_bucket(filename=fname, ext_filename=fname) X_test = ipca.fit_transform(X_test) if options.model == 'llasso': graph_data = pd.read_csv(options.graph_data, names=[ 'date', 'start_hour', 'src', 'dst', 'type', 'sum_delay', 'sum_ahead', 'add_delay', 'add_ahead', 'train_count' ]) graph = model.fetch_connections(graph_data) logging.debug('Features shape after pre-processing: {}'.format( X_train.shape)) # FIT ################################################################## if options.cv: logging.info('Doing random search for hyper parameters...') if options.model == 'rf': param_grid = { "n_estimators": [10, 100, 200, 800], "max_depth": [3, 20, None], "max_features": ["auto", "sqrt", "log2", None], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4, 10], "bootstrap": [True, False] } elif options.model == 'lr': param_grid = { "penalty": [None, 'l2', 'l1'], "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1], "l1_ratio": [0.1, 0.15, 0.2, 0.5], "shuffle": [True, False], "learning_rate": ['constant', 'optimal', 'invscaling'], "eta0": [0.001, 0.01, 0.1], "power_t": [0.1, 0.25, 0.5] } elif options.model == 'svr': param_grid = { "C": [0.001, 0.01, 0.1, 1, 10], "epsilon": [0.01, 0.1, 0.5], "kernel": ['rbf', 'linear', 'poly', 
'sigmoid', 'precomputed'], "degree": [2, 3, 4], "shrinking": [True, False], "gamma": [0.001, 0.01, 0.1], "coef0": [0, 0.1, 1] } else: raise ("No param_grid set for given model ({})".format( options.model)) random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=int( options.n_iter_search), n_jobs=-1) random_search.fit(X_train, y_train) logging.info("RandomizedSearchCV done.") fname = options.output_path + '/random_search_cv_results.txt' io.report_cv_results(random_search.cv_results_, fname) #io._upload_to_bucket(filename=fname, ext_filename=fname) sys.exit() else: logging.info('Training...') if options.model in ['rf', 'svr', 'ard', 'gp']: model.fit(X_train, y_train) if options.feature_selection: X_complete = X_train y_complete = y_train meta_complete = data.loc[:, options.meta_params] elif options.model in ['llasso']: model.fit(X_train, y_train, stations=train.loc[:, 'trainstation'].values) elif options.model in ['nlasso']: model.partial_fit(X_train, y_train, stations=train.loc[:, 'trainstation'].values) else: model.partial_fit(X_train, y_train) if options.feature_selection: try: X_complete = np.append(X_complete, X_train) y_complete = np.append(Y_complete, y_train) meta_complete = meta_complete.append( data.loc[:, options.meta_params]) except (ValueError, NameError): X_complete = X_train y_complete = y_train meta_complete = data.loc[:, options.meta_params] # EVALUATE ############################################################# # Check training score to estimate amount of overfitting # Here we assume that we have a datetime index (from time columns) y_pred_train = model.predict(X_train) rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train)) mae_train = np.sqrt(mean_squared_error(y_train, y_pred_train)) logging.info('Training data RMSE: {} and MAE: {}'.format( rmse_train, mae_train)) #try: if True: print(train) #range = ('2013-02-01','2013-02-28') range = ('2010-01-01', '2010-01-02') X_train_sample = train.loc[range[0]:range[1], options.feature_params].astype( np.float32).values target = train.loc[range[0]:range[1], options.label_params].astype( np.float32).values.ravel() y_pred_sample = model.predict(X_train_sample) times = train.loc[range[0]:range[1], 'time'].values df = pd.DataFrame(times + y_pred_sample) print(df) sys.exit() # Draw visualisation fname = '{}/timeseries_training_data.png'.format( options.output_path) viz.plot_delay(times, target, y_pred, 'Delay for station {}'.format(stationName), fname) fname = '{}/scatter_all_stations.png'.format(options.vis_path) viz.scatter_predictions(times, target, y_pred, savepath=options.vis_path, filename='scatter_{}'.format(station)) #except KeyError: # pass # Mean delay over the whole dataset (both train and validation), # used to calculate Brier Skill if options.y_avg: mean_delay = 3.375953418071136 else: mean_delay = 6.011229358531166 if options.model == 'llasso': print('X_test shape: {}'.format(X_test.shape)) y_pred, weights = model.predict(X_test, test.loc[:, 'trainstation'].values) else: y_pred = model.predict(X_test) if options.normalize: y_pred = yscaler.inverse_transform(y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) rmse_stat = math.sqrt( mean_squared_error(y_test, np.full_like(y_test, mean_delay))) skill = 1 - rmse / rmse_stat rmses.append(rmse) maes.append(mae) r2s.append(r2) skills.append(skill) start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S')) end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S')) 
end_times_obj.append(end) if options.model in ['rf', 'lr', 'ard', 'gp']: logging.info('R2 score for training: {}'.format( model.score(X_train, y_train))) logging.info('RMSE: {}'.format(rmse)) logging.info('MAE: {}'.format(mae)) logging.info('R2 score: {}'.format(r2)) logging.info('Brier Skill Score score: {}'.format(skill)) start = end end = start + timedelta(days=int(options.day_step), hours=int(options.hour_step)) if end > endtime: end = endtime # SAVE ##################################################################### io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file) if options.normalize: fname = options.save_path + '/xscaler.pkl' io.save_scikit_model(xscaler, filename=fname, ext_filename=fname) fname = options.save_path + '/yscaler.pkl' io.save_scikit_model(yscaler, filename=fname, ext_filename=fname) if options.model == 'rf': fname = options.output_path + '/rfc_feature_importance.png' viz.rfc_feature_importance(model.feature_importances_, fname, feature_names=options.feature_params) #io._upload_to_bucket(filename=fname, ext_filename=fname) try: fname = options.output_path + '/learning_over_time.png' viz.plot_learning_over_time(end_times_obj, rmses, maes, r2s, filename=fname) #io._upload_to_bucket(filename=fname, ext_filename=fname) except Exception as e: logging.error(e) error_data = { 'start_times': start_times, 'end_times': end_times, 'rmse': rmses, 'mae': maes, 'r2': r2s, 'skill': skills } fname = '{}/training_time_validation_errors.csv'.format( options.output_path) io.write_csv(error_data, filename=fname, ext_filename=fname) # FEATURE SELECTION ######################################################## if options.feature_selection: logging.info('Doing feature selection...') selector = SelectFromModel(model, prefit=True) print(pd.DataFrame(data=X_complete)) X_selected = selector.transform(X_complete) selected_columns = f_data.columns.values[selector.get_support()] logging.info( 'Selected following parameters: {}'.format(selected_columns)) data_sel = meta_complete.join( pd.DataFrame(data=y_complete, columns=options.label_params)).join( pd.DataFrame(data=X_selected, columns=selected_columns)) print(pd.DataFrame(data=X_selected, columns=selected_columns)) print(data_sel)
def load_preprocessed(io: IO) -> List[Preprocessed]:
    with open(io.get("input_file"), "rb") as in_file:
        return list(map(lambda x: Preprocessed(x), pickle.load(in_file)))

def main(): """ Get data from db and save it as csv """ bq = BQHandler() io = IO(gs_bucket=options.gs_bucket) viz = Viz(io=io) predictor = Predictor(io, ModelLoader(io), options, options.station_specific_classifier, options.station_specific_regressor) predictor.regressor_save_file = options.save_path + '/classifier.pkl' predictor.classifier_save_file = options.save_path + '/regressor.pkl' # Mean delay over the whole dataset (both train and validation), # used to calculate Brier Skill mean_delay = options.mean_delay starttime, endtime = io.get_dates(options) logging.info('Using dataset {} and time range {} - {}'.format( options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) # Get params all_param_names = list( set(options.label_params + options.feature_params + options.meta_params + options.classifier_feature_params + options.regressor_feature_params)) # Param list is modified after retrieving data classifier_feature_params = copy.deepcopy( options.classifier_feature_params) regressor_feature_params = copy.deepcopy(options.regressor_feature_params) all_feature_params = list( set(options.feature_params + options.meta_params + options.classifier_feature_params + options.regressor_feature_params)) aggs = io.get_aggs_from_param_names(all_feature_params) # Init error dicts avg_delay = {} avg_pred_delay = {} avg_proba = {} station_count = 0 all_times = set() station_rmse = {} station_mae = {} station_r2 = {} station_skill = {} # For aggregated binary classification metrics time_list, target_list, y_pred_bin_list, y_pred_bin_proba_list = [], [], [], [] # If stations are given as argument use them, else use all stations stationList = io.get_train_stations(options.stations_file) all_data = None if options.locations is not None: stations = options.locations else: stations = stationList.keys() # Go through stations for station in stations: stationName = '{} ({})'.format(stationList[station]['name'], station) logging.info('Processing station {}'.format(stationName)) if hasattr(options, 'classifier_model_file'): predictor.classifier_save_file = options.classifier_model_file.replace( '{location}', station) elif options.station_specific_classifier: predictor.classifier_save_file = options.save_path + '/{}'.format( station) + '/classifier.pkl' if hasattr(options, 'regressor_model_file'): predictor.regressor_save_file = options.regressor_model_file.replace( '{location}', station) elif options.station_specific_regressor: predictor.regressor_save_file = options.save_path + '/{}'.format( station) + '/regressor.pkl' station_rmse[station] = {} station_mae[station] = {} station_r2[station] = {} station_skill[station] = {} # Read data and filter desired train types (ic and commuter) table = 'features_testset' if hasattr(options, 'test_table'): table = options.test_table data = bq.get_rows(starttime, endtime, loc_col='trainstation', project=options.project, dataset='trains_data', table=table, parameters=all_param_names, only_winters=options.only_winters, reason_code_table=options.reason_code_table, reason_codes_exclude=options.reason_codes_exclude, reason_codes_include=options.reason_codes_include, locations=[station]) data = io.filter_train_type(labels_df=data, train_types=['K', 'L'], sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['train_count', 'delay'], aggs=aggs) if len(data) == 0: continue if options.y_avg_hours is not None: data = io.calc_running_delay_avg(data, options.y_avg_hours) if options.y_avg: data = 
io.calc_delay_avg(data) if options.month: logging.info('Adding month to the dataset...') data = data.assign( month=lambda df: df.loc[:, 'time'].map(lambda x: x.month)) if 'month' not in options.feature_params: options.feature_params.append('month') if 'month' not in options.regressor_feature_params: options.regressor_feature_params.append('month') if 'month' not in options.classifier_feature_params: options.classifier_feature_params.append('month') data.sort_values(by=['time'], inplace=True) logging.info('Processing {} rows...'.format(len(data))) if all_data is None: all_data = data else: all_data.append(data, ignore_index=True) # Pick times for creating error time series times = data.loc[:, 'time'] station_count += 1 # Run prediction try: #target, y_pred = predictor.pred(times, data) y_pred, y_pred_bin, y_pred_bin_proba = predictor.pred(times, data) # Drop first times which LSTM are not able to predict #times = times[(len(data)-len(y_pred)):] except (PredictionError, ModelError) as e: logging.error(e) continue target = data.loc[:, options.label_params].reset_index( drop=True).values.ravel() if len(y_pred) < 1 or len(target) < 1: continue # Create timeseries of predicted and happended delay i = 0 for t in times: try: if t not in avg_delay.keys(): avg_delay[t] = [target[i]] avg_pred_delay[t] = [y_pred[i]] if predictor.y_pred_bin_proba is not None: avg_proba[t] = [predictor.y_pred_bin_proba[i, 1]] else: avg_delay[t].append(target[i]) avg_pred_delay[t].append(y_pred[i]) if predictor.y_pred_bin_proba is not None: avg_proba[t].append(predictor.y_pred_bin_proba[i, 1]) except IndexError as e: # LSTM don't have first time steps because it don't # have necessary history pass i += 1 # For creating visualisation all_times = all_times.union(set(times)) # If only average plots are asked, continue to next station if options.only_avg == 1: continue # Calculate errors for given station, first for all periods and then for whole time range if predictor.y_pred_bin is not None: time_list += list(times) #feature_list += list() target_list += list(target) y_pred_bin_list += list(predictor.y_pred_bin) y_pred_bin_proba_list += list(predictor.y_pred_bin_proba) splits = viz._split_to_parts(list(times), [ target, y_pred, predictor.y_pred_bin, predictor.y_pred_bin_proba ], 2592000) else: splits = viz._split_to_parts(list(times), [target, y_pred], 2592000) for i in range(0, len(splits)): logging.info('Month {}:'.format(i + 1)) if predictor.y_pred_bin is not None: times_, target_, y_pred_, y_pred_bin_, y_pred_bin_proba_ = splits[ i] viz.classification_perf_metrics(y_pred_bin_proba_, y_pred_bin_, target_, options, times_, station) else: times_, target_, y_pred_ = splits[i] rmse = math.sqrt(metrics.mean_squared_error(target_, y_pred_)) mae = metrics.mean_absolute_error(target_, y_pred_) r2 = metrics.r2_score(target_, y_pred_) rmse_stat = math.sqrt( metrics.mean_squared_error(target_, np.full_like(target_, mean_delay))) skill = 1 - rmse / rmse_stat # Put errors to timeseries station_rmse[station][i] = rmse station_mae[station][i] = mae station_r2[station][i] = r2 station_skill[station][i] = skill logging.info('RMSE of station {} month {}: {:.4f}'.format( stationName, i + 1, rmse)) logging.info('MAE of station {} month {}: {:.4f}'.format( stationName, i + 1, mae)) logging.info('R2 score of station {} month {}: {:.4f}'.format( stationName, i + 1, r2)) logging.info('Skill (RMSE) of station {} month {}: {:.4f}'.format( stationName, i + 1, skill)) mse = math.sqrt(metrics.mean_squared_error(target, y_pred)) mae = 
metrics.mean_absolute_error(target, y_pred) r2 = metrics.r2_score(target, y_pred) rmse_stat = math.sqrt( metrics.mean_squared_error(target, np.full_like(target, mean_delay))) skill = 1 - rmse / rmse_stat station_rmse[station]['all'] = rmse station_mae[station]['all'] = mae station_r2[station]['all'] = r2 station_skill[station]['all'] = skill logging.info('All periods:') logging.info('RMSE of station {} month {}: {:.4f}'.format( stationName, i + 1, rmse)) logging.info('MAE of station {} month {}: {:.4f}'.format( stationName, i + 1, mae)) logging.info('R2 score of station {} month {}: {:.4f}'.format( stationName, i + 1, r2)) logging.info('Skill (RMSE) of station {} month {}: {:.4f}'.format( stationName, i + 1, skill)) # Create csv and upload it to pucket times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in times] delay_data = { 'times': times_formatted, 'delay': target, 'predicted delay': y_pred } fname = '{}/delays_{}.csv'.format(options.vis_path, station) io.write_csv(delay_data, fname, fname) # Draw visualisation if predictor.y_pred_bin_proba is not None: fname = '{}/timeseries_proba_{}'.format(options.vis_path, station) proba = predictor.y_pred_bin_proba[:, 1] viz.plot_delay(times, target, None, 'Delay for station {}'.format(stationName), fname, all_proba=proba, proba_mode='same', color_threshold=options.class_limit) #else: fname = '{}/timeseries_regression_{}'.format(options.vis_path, station) viz.plot_delay(times, target, y_pred, 'Delay for station {}'.format(stationName), fname, all_proba=None) fname = '{}/scatter_all_stations.png'.format(options.vis_path) viz.scatter_predictions(times, target, y_pred, savepath=options.vis_path, filename='scatter_{}'.format(station)) # Save all station related results to csv and upload them to bucket fname = '{}/station_rmse.csv'.format(options.vis_path) io.dict_to_csv(station_rmse, fname, fname) fname = '{}/station_mae.csv'.format(options.vis_path) io.dict_to_csv(station_mae, fname, fname) fname = '{}/station_r2.csv'.format(options.vis_path) io.dict_to_csv(station_r2, fname, fname) fname = '{}/station_skill_rmse.csv'.format(options.vis_path) io.dict_to_csv(station_skill, fname, fname) # Create timeseries of avg actual delay and predicted delay all_times = sorted(list(all_times)) for t, l in avg_delay.items(): avg_delay[t] = sum(l) / len(l) for t, l in avg_pred_delay.items(): avg_pred_delay[t] = sum(l) / len(l) for t, l in avg_proba.items(): avg_proba[t] = sum(l) / len(l) avg_delay = list( OrderedDict(sorted(avg_delay.items(), key=lambda t: t[0])).values()) avg_pred_delay = list( OrderedDict(sorted(avg_pred_delay.items(), key=lambda t: t[0])).values()) avg_proba = list( OrderedDict(sorted(avg_proba.items(), key=lambda t: t[0])).values()) # Calculate average over all times and stations, first for all months separately, then for whole time range splits = viz._split_to_parts(list(times), [avg_delay, avg_pred_delay], 2592000) for i in range(0, len(splits)): times_, avg_delay_, avg_pred_delay_ = splits[i] try: rmse = math.sqrt( metrics.mean_squared_error(avg_delay_, avg_pred_delay_)) mae = metrics.mean_absolute_error(avg_delay_, avg_pred_delay_) r2 = metrics.r2_score(avg_delay_, avg_pred_delay_) rmse_stat = math.sqrt( metrics.mean_squared_error( avg_delay_, np.full_like(avg_delay_, mean_delay))) skill = 1 - rmse / rmse_stat except ValueError: logging.warning('Zero samples in some class') continue logging.info('Month: {}'.format(i + 1)) logging.info( 'RMSE of average delay over all stations: {:.4f}'.format(rmse)) logging.info( 'MAE of average delay 
over all stations: {:.4f}'.format(mae)) logging.info( 'R2 score of average delay over all stations: {:.4f}'.format(r2)) logging.info( 'Skill score (RMSE) of average delay over all stations: {:.4f}'. format(skill)) # Write average data into file avg_errors = { 'rmse': rmse, 'mae': mae, 'r2': r2, 'skill': skill, 'nro_of_samples': len(avg_delay) } fname = '{}/avg_erros_{}.csv'.format(options.vis_path, i) io.dict_to_csv(avg_errors, fname, fname) rmse = math.sqrt(metrics.mean_squared_error(avg_delay, avg_pred_delay)) #rmse_mean = np.mean(list(station_rmse.values())) mae = metrics.mean_absolute_error(avg_delay, avg_pred_delay) #mae_mean = np.mean(list(station_mae.values())) r2 = metrics.r2_score(avg_delay, avg_pred_delay) rmse_stat = math.sqrt( metrics.mean_squared_error(avg_delay, np.full_like(avg_delay, mean_delay))) skill = 1 - rmse / rmse_stat #skill_mean = 1 - rmse_mean/rmse_stat logging.info('All periods:') logging.info( 'RMSE of average delay over all stations: {:.4f}'.format(rmse)) #logging.info('Average RMSE of all station RMSEs: {:.4f}'.format(rmse_mean)) logging.info('MAE of average delay over all stations: {:.4f}'.format(mae)) #logging.info('Average MAE of all station MAEs: {:.4f}'.format(mae_mean)) logging.info( 'R2 score of average delay over all stations: {:.4f}'.format(r2)) logging.info( 'Skill score (RMSE) of average delay over all stations: {:.4f}'.format( skill)) #logging.info('Skill score (avg RMSE) of all stations: {:.4f}'.format(skill_mean)) # Write average data into file avg_errors = { 'rmse': rmse, 'mae': mae, 'r2': r2, #'rmse_mean': rmse_mean, #'mae_mean': mae_mean, 'skill': skill, #'skill_mean': skill_mean, 'nro_of_samples': len(avg_delay) } fname = '{}/avg_erros.csv'.format(options.vis_path) io.dict_to_csv(avg_errors, fname, fname) # Create timeseries of average delay and predicted delays over all stations all_times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in all_times] delay_data = { 'times': all_times_formatted, 'delay': avg_delay, 'predicted delay': avg_pred_delay } # write csv fname = '{}/avg_delays_all_stations.csv'.format(options.vis_path) io.write_csv(delay_data, fname, fname) # visualise if not avg_proba: proba = None else: proba = avg_proba fname = '{}/timeseries_avg_all_stations.png'.format(options.vis_path) if predictor.y_pred_bin is not None: viz.plot_delay(all_times, avg_delay, None, 'Average delay for all station', fname, all_proba=proba, proba_mode='same', color_threshold=options.class_limit) else: viz.plot_delay(all_times, avg_delay, avg_pred_delay, 'Average delay for all station', fname) fname = '{}/scatter_all_stations.png'.format(options.vis_path) viz.scatter_predictions(all_times, avg_delay, avg_pred_delay, savepath=options.vis_path, filename='scatter_all_stations') # Binary classification metrics if predictor.y_pred_bin is not None: all_data.sort_values(by=['time'], inplace=True) times = all_data.loc[:, 'time'].values try: target = all_data.loc[:, options.label_params].reset_index( drop=True).values.ravel() y_pred, y_pred_bin, y_pred_bin_proba = predictor.pred( times, all_data) # Drop first times which LSTM are not able to predict times = times[(len(all_data) - len(y_pred)):] splits = viz._split_to_parts(list(times), [ target, y_pred, predictor.y_pred_bin, predictor.y_pred_bin_proba ], 2592000) for i in range(0, len(splits)): #times_, target_, y_pred_bin_, y_pred_bin_proba_ = splits[i] times_, target_, y_pred_, y_pred_bin_, y_pred_bin_proba_ = splits[ i] viz.classification_perf_metrics(y_pred_bin_proba_, y_pred_bin_, target_, options, 
times_, 'all') except (PredictionError, ModelError) as e: logging.error(e) pass