def main(targets):
    '''
    Reads targets and executes appropriate files for given data.
    '''
    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        load(**cfg)
        cfg = load_params(CLEAN_PARAMS)
        clean_data(**cfg)

    # make the test target
    if 'test-project' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        load(**cfg)
        cfg = load_params(TEST_CLEAN_PARAMS)
        clean_data(**cfg)
        cfg = load_params(TEST_FEATURE_PARAMS)
        make_features(**cfg)
        cfg = load_params(TEST_MODEL_PARAMS)
        driver(**cfg)

    # make the full data target
    if 'full-project' in targets:
        cfg = load_params(DATA_PARAMS)
        load(**cfg)
        cfg = load_params(CLEAN_PARAMS)
        clean_data(**cfg)
        cfg = load_params(FEATURE_PARAMS)
        make_features(**cfg)
        cfg = load_params(MODEL_PARAMS)
        driver(**cfg)

    # if data is cleaned and just the model pipeline is to be run
    if 'model' in targets:
        cfg = load_params(TEST_MODEL_PARAMS)
        driver(**cfg)

    return
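# Hedged invocation sketch (not part of the original snippet): targets such as
# 'data', 'test-project' or 'full-project' are typically passed on the command
# line; the entry point below is an assumed convention, not taken from the source.
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])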
def prepare_data(normalize):
    data = features.make_features(normalize).dropna()
    y = data["label"]
    X = data.drop(["label"], axis=1)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    print(X)
    return X, y
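# Hedged usage sketch (assumed, not from the source): prepare_data() yields a
# standardized feature matrix X and label vector y; the hold-out split and the
# scikit-learn Ridge estimator below are illustrative choices only.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X, y = prepare_data(normalize=True)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
model = Ridge().fit(X_tr, y_tr)
print("validation R^2:", model.score(X_val, y_val))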
def train(dataset, labels):
    """Train one XGBoost regressor per pollutant and return them in a dict."""
    pollutants = ["NO2", "PM10", "PM25"]

    # split dataset
    NO2_df, PM10_df, PM25_df = split_pollutant_dataset(dataset)

    # build data dict
    ds = dict(zip(pollutants, (NO2_df, PM10_df, PM25_df)))

    # build features dict
    f = {}
    for poll in pollutants:
        f[poll] = {}
        f[poll]["X"] = make_features(ds[poll], **features_config[poll])
        f[poll]["Y"] = get_Y(labels, ds[poll])

    # train model for each pollutant
    model_dict = {}
    for poll in pollutants:
        xgb_model = xgb.XGBRegressor(max_depth=6, n_estimators=200, reg_lambda=1)
        # train model
        xgb_model.fit(f[poll]["X"], f[poll]["Y"])
        # mse on training set
        y_pred = xgb_model.predict(f[poll]["X"])
        mse = mean_squared_error(f[poll]["Y"], y_pred)
        print("%s: MSE on training set: %.3f" % (poll, mse))
        # store model
        model_dict[poll] = xgb_model

    # return model dict
    return model_dict
def main(train_file, val_file, test_file, feature_file):
    # train
    data = import_formatted_data(train_file)
    X, y = split_x_y(data)
    transformed_X = make_features(X)
    with open(feature_file, "r") as f:
        features = [int(x) for x in f.readline().strip().split(" ")]
    selected_X = transformed_X[:, features]
    mod = SGDRegressor(max_iter=5000, penalty="l1")
    mod.fit(selected_X, y)
    train_preds = mod.predict(selected_X)

    val_data = import_formatted_data(val_file)
    val_X, val_y = split_x_y(val_data)
    transformed_val_X = make_features(val_X)
    selected_val_X = transformed_val_X[:, features]
    val_predictions = mod.predict(selected_val_X)

    test_data = import_formatted_data(test_file)
    test_X, test_y = split_x_y(test_data)
    transformed_test_X = make_features(test_X)
    selected_test_X = transformed_test_X[:, features]
    test_predictions = mod.predict(selected_test_X)

    print "Training"
    print "MSE:", mean_squared_error(y, train_preds)
    print "Mean, variance of real y:", mean(y), var(y)
    print "Mean, variance of pred y:", mean(train_preds), var(train_preds)

    print "Validation"
    print "MSE:", mean_squared_error(val_y, val_predictions)
    print "Mean, variance of real y:", mean(val_y), var(val_y)
    print "Mean, variance of Pred y:", mean(val_predictions), var(val_predictions)

    print "Test"
    print "MSE:", mean_squared_error(test_y, test_predictions)
    print "Mean, variance of real y:", mean(test_y), var(test_y)
    print "Mean, variance of Pred y:", mean(test_predictions), var(test_predictions)
def predict(model_dict, dataset):
    """Apply the trained model for each pollutant and return the concatenated predictions."""
    # split dataset
    NO2_df, PM10_df, PM25_df = split_pollutant_dataset(dataset)

    # build features
    NO2_f = make_features(NO2_df, **features_config["NO2"])
    PM10_f = make_features(PM10_df, **features_config["PM10"])
    PM25_f = make_features(PM25_df, **features_config["PM25"])

    # apply each model
    Y_pred_NO2 = pd.DataFrame(model_dict["NO2"].predict(NO2_f),
                              columns=["TARGET"], index=NO2_f.index)
    Y_pred_PM10 = pd.DataFrame(model_dict["PM10"].predict(PM10_f),
                               columns=["TARGET"], index=PM10_f.index)
    Y_pred_PM25 = pd.DataFrame(model_dict["PM25"].predict(PM25_f),
                               columns=["TARGET"], index=PM25_f.index)

    # concatenate result
    Y_pred = pd.concat([Y_pred_NO2, Y_pred_PM10, Y_pred_PM25], axis=0)

    # return
    return Y_pred
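# Hedged usage sketch (assumed, not from the source): train() and predict() are
# meant to be chained -- fit one XGBoost regressor per pollutant, then apply the
# resulting model_dict to a held-out dataset. `train_set`, `train_labels` and
# `test_set` are hypothetical DataFrames in the layout expected by
# split_pollutant_dataset().
model_dict = train(train_set, train_labels)
Y_pred = predict(model_dict, test_set)
Y_pred.to_csv("predictions.csv")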
def train_pipeline(training_pipeline_params: TrainingPipelineParams):
    logger.info(f"Start training with params: {training_pipeline_params}")
    load_data(training_pipeline_params.input_data_path,
              training_pipeline_params.input_data_url)
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"Raw data shape: {data.shape}")

    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params)
    logger.info(f"Train df shape: {train_df.shape}")
    logger.info(f"Val df shape: {val_df.shape}")

    pipeline = build_transformer(training_pipeline_params.feature_params)
    pipeline.fit(train_df)
    logger.info("Transformer fitted.")

    train_features = make_features(pipeline, train_df)
    train_target = extract_target(train_df, training_pipeline_params.feature_params)
    logger.info(f"Train features shape: {train_features.shape}")

    val_features = make_features(pipeline, val_df)
    val_target = extract_target(val_df, training_pipeline_params.feature_params)
    logger.info(f"Val features shape: {val_features.shape}")

    model = get_model(training_pipeline_params.train_params)
    model = train_model(train_features, train_target, model)
    logger.info("Model trained.")

    predictions = predict_model(val_features, model)
    metrics = evaluate_model(predictions, val_target)

    path_to_model = save_artifacts(metrics, model, pipeline, training_pipeline_params)
    return path_to_model, metrics
def dataflow(X, y=None, cmd_plot=False):
    '''
    Primary function responsible for predictions and GUI output from a
    pre-processed file. Returns signals used for plotting of features as well
    as generated summary statistics.
    '''
    epochs = epochs_from_prep(X.copy(), None, settings.EPOCH_LENGTH,
                              settings.OVERLAP_FACTOR, settings.SAMPLE_RATE,
                              filter=False, removal=True)
    epochs = dataset(epochs, shuffle=False, exclude_ptt=False, only_rwa=True).epochs
    epochs = gru(load_graph=True, path=settings.BEST_MODEL).predict(epochs)
    epochs.sort(key=lambda x: x.index_start, reverse=False)
    yhat, timecol = reconstruct(X, epochs)

    full = epochs_from_prep(X, None, settings.EPOCH_LENGTH, settings.OVERLAP_FACTOR,
                            settings.SAMPLE_RATE, filter=False, removal=False)
    full.sort(key=lambda x: x.index_start, reverse=False)
    wake, nrem, rem, illegal = timeseries(full)
    summary = summary_statistics(timecol, yhat, wake, nrem, rem, illegal)

    X, y, mask = make_features(X, y, settings.SAMPLE_RATE, removal=False)
    X = transpose(X)
    ss = X[6].copy().astype(float)
    for i, _ in enumerate(ss):
        if X[7, i]:
            ss[i] = 2.0
        elif X[5, i]:
            ss[i] = 0.0

    data = (X[0] / settings.SAMPLE_RATE,
            [X[1], X[2], X[3], X[4], ss, yhat],
            ['RR', 'RWA', 'PTT', 'PWA', 'Sleep stage', 'Arousals'],
            region(X[5]), region(X[7]), None, None,
            int(X[0, -1] / settings.SAMPLE_RATE))

    if cmd_plot:
        d = list(data)
        if y is not None:
            d[1] += [y]
            d[2] += ['y']
            d[2][5] = 'yhat'
        plot_results(*d)

    return data, summary
def predict_pipeline(params: PredictionPipelineParams):
    logger.info("Start prediction.")
    data = read_data(params.data_path)
    logger.info(f"Data loaded. Raw data shape: {data.shape}")

    pipeline = load_transformer(params.transformer_path)
    logger.info(f"Transformer loaded: {pipeline}")
    model = load_model(params.model_path)
    logger.info(f"Model loaded: {model}")

    features = make_features(pipeline, data)
    logger.info(f"Test features shape: {features.shape}")

    predictions = predict_model(features, model)
    predictions_path = save_prediction(predictions, params.output_path)
    logger.info(f"Predictions saved in {predictions_path}")
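# Hedged usage sketch (assumed): the two pipelines are chained by training first,
# then predicting with the saved artifacts. `training_params` and `prediction_params`
# are hypothetical parameter objects, assumed to be built elsewhere (e.g. from a
# config file) with the fields referenced above.
path_to_model, metrics = train_pipeline(training_params)
logger.info(f"Model saved to {path_to_model}, metrics: {metrics}")
predict_pipeline(prediction_params)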
labels_ph = tf.placeholder(tf.int32, (None))
wav_ph = tf.placeholder(tf.float32, (None, sample_rate))
bg_wavs_ph = tf.placeholder(tf.float32, [None, sample_rate])
keep_prob = tf.placeholder(tf.float32)  # will be 0.5 for training, 1 for test
learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")
is_training_ph = tf.placeholder(tf.bool)
use_full_layer = tf.placeholder(tf.bool)
slow_down = tf.placeholder(tf.bool)
# scale_means_ph = tf.placeholder(tf.float32)
# scale_stds_ph = tf.placeholder(tf.float32)

processed_wavs = pp.tf_preprocess(wav_ph, bg_wavs_ph, is_training_ph, slow_down,
                                  extreme=FLAGS.extreme_time)
features = make_features(processed_wavs, is_training_ph, FLAGS.features)

output_neurons = len(all_words) if style == "full" else len(wanted_words)
full_output_neurons = len(all_words)

final_layer, full_final_layer, open_max_layer = make_model(
    FLAGS.model, features, keep_prob, output_neurons, full_output_neurons,
    is_training_ph)
final_layer = tf.cond(use_full_layer, lambda: full_final_layer, lambda: final_layer)
probabilities = tf.nn.softmax(final_layer)

loss_mean = tf.losses.sparse_softmax_cross_entropy(labels=labels_ph, logits=final_layer)
# full_loss_mean = tf.losses.sparse_softmax_cross_entropy(labels=labels_ph, logits=full_final_layer)
total_loss = tf.losses.get_total_loss()
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
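# Hedged continuation sketch (assumed, not from the source): in TF1-style graphs,
# the batch-norm update ops collected above are usually attached to the training op
# via a control dependency; the Adam optimizer here is an illustrative choice.
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(learning_rate_ph).minimize(total_loss)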
NO2_df, PM10_df, PM25_df = split_pollutant_dataset(df)

# split in train / dev for each pollutant
NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train, zone_station_dev)
PM10_train, PM10_dev = split_train_dev(PM10_df, zone_station_train, zone_station_dev)
PM25_train, PM25_dev = split_train_dev(PM25_df, zone_station_train, zone_station_dev)

# make features and get labels
# NO2
NO2_train_f, NO2_dev_f = make_features(NO2_train, NO2_dev, normalize=False,
                                       rolling_mean=True, deltas=[12])
Y_NO2_train = get_Y(Y, NO2_train)
Y_NO2_dev = get_Y(Y, NO2_dev)
X_NO2 = pd.concat([NO2_train_f, NO2_dev_f], axis=0, copy=False)
Y_NO2 = pd.concat([Y_NO2_train, Y_NO2_dev], axis=0, copy=False)
NO2_test_fold = build_test_fold(Y_NO2_train, Y_NO2_dev)

# PM10
PM10_train_f, PM10_dev_f = make_features(PM10_train, PM10_dev)
Y_PM10_train = get_Y(Y, PM10_train)
Y_PM10_dev = get_Y(Y, PM10_dev)
X_PM10 = pd.concat([PM10_train_f, PM10_dev_f], axis=0, copy=False)
Y_PM10 = pd.concat([Y_PM10_train, Y_PM10_dev], axis=0, copy=False)
PM10_test_fold = build_test_fold(Y_PM10_train, Y_PM10_dev)
def train_predict(train, test, Y_train, model_dict=None, output_path=None,
                  pm=False, model="rf"):
    """Train one regressor per pollutant (unless model_dict is given) and predict on test."""
    pollutants = ["NO2", "PM"] if pm else ["NO2", "PM10", "PM25"]
    print("%i regressors will be trained, one for each pollutant of %s" %
          (len(pollutants), pollutants))

    # split dataset, build data dict
    train_ds = dict(zip(pollutants, split_pollutant_dataset(train, pm)))
    test_ds = dict(zip(pollutants, split_pollutant_dataset(test, pm)))

    # build features dict
    f = {}
    for poll in pollutants:
        f[poll] = {}
        f[poll]["X_train"], f[poll]["X_test"] = make_features(
            train_ds[poll], dev=test_ds[poll], **features_config[poll])
        if Y_train is not None:
            f[poll]["Y"] = get_Y(Y_train, train_ds[poll])

    # train model for each pollutant
    if model_dict is None:
        model_dict = {}
        for poll in pollutants:
            # shuffle X, Y
            X, Y = shuffle_XY(f[poll]["X_train"], f[poll]["Y"])
            # init model
            if model == "rf":
                reg = RandomForestRegressor(**rf_config)
            else:
                reg = xgb.XGBRegressor(max_depth=6, **xgb_config[poll])
            # train model
            print("Training a %s model on pollutant %s ..." % (model, poll))
            reg.fit(X, Y)
            print("Training done on %s" % poll)
            # store model
            model_dict[poll] = reg
        if output_path is not None:
            print("Saving the dictionary of models in %s" % output_path)
            with open(output_path, "wb") as fout:
                pickle.dump(model_dict, fout)

    # predict on train set
    preds = []
    for poll in pollutants:
        # mse on training set
        Y_pred_poll = pd.DataFrame(model_dict[poll].predict(f[poll]["X_train"]),
                                   columns=["TARGET"],
                                   index=f[poll]["X_train"].index)
        preds.append(Y_pred_poll)
        mse = mean_squared_error(f[poll]["Y"], Y_pred_poll)
        print("%s: MSE on training set: %.3f" % (poll, mse))

    # concat and compute global MSE
    Y_pred = pd.concat(preds, axis=0).sort_index()
    mse = mean_squared_error(Y_train, Y_pred)
    print("GLOBAL MSE on training set: %.3f" % mse)

    # predict on test set
    print("Computing prediction on test data...")
    preds = []
    for poll in pollutants:
        Y_pred_poll = pd.DataFrame(model_dict[poll].predict(f[poll]["X_test"]),
                                   columns=["TARGET"],
                                   index=f[poll]["X_test"].index)
        preds.append(Y_pred_poll)

    # concatenate pred for each pollutant and sort index
    Y_pred = pd.concat(preds, axis=0).sort_index()
    print("Prediction done.")

    # return
    return Y_pred
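# Hedged usage sketch (assumed): train one regressor per pollutant, persist the
# models, and write the test predictions to disk. `train_df`, `test_df` and
# `Y_train_df` are hypothetical inputs in the format expected above.
Y_pred = train_predict(train_df, test_df, Y_train_df,
                       output_path="models.pkl", model="xgb")
Y_pred.to_csv("submission.csv")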
# training
model.fit(PM25_seq_stat_train, Y_seq_PM25_train,
          nb_epoch=20, batch_size=32, verbose=2,
          validation_data=(PM25_seq_stat_dev, Y_seq_PM25_dev),
          callbacks=[tensordboard_cb])

# compare xgboost
NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train, zone_station_dev)
NO2_train_f, NO2_dev_f = make_features(NO2_train, NO2_dev, rolling_mean=True,
                                       deltas=[24, 36, 48, 96])
Y_NO2_train = get_Y(Y, NO2_train)
Y_NO2_dev = get_Y(Y, NO2_dev)

PM25_train, PM25_dev = split_train_dev(PM25_df, zone_station_train, zone_station_dev)
PM25_train_f, PM25_dev_f = make_features(PM25_train, PM25_dev, rolling_mean=True,
                                         deltas=[24, 36, 48, 96])
Y_PM25_train = get_Y(Y, PM25_train)
Y_PM25_dev = get_Y(Y, PM25_dev)

import xgboost as xgb
shift_config = {
    "temperature": [8, 14, 20, 96],
    "cloudcover": [2, 5, 48],
    "pressure": [2, 24, 72],
    "windbearingsin": [2, 6],
    "windbearingcos": [6, 6],
    "windspeed": [2, 4]
}

NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train, zone_station_dev)
NO2_train_f, NO2_dev_f = make_features(
    NO2_train,
    NO2_dev,
    rolling_mean=True,
    roll_mean_conf=roll_mean_conf,
    # shift_config=shift_config,
    temp_dec_freq=12,
    log=False,
    remove_temporal=True,
    rolling_std=True,
    deltas_std=[24, 48, 96, 120])
Y_NO2_train = get_Y(Y, NO2_train)
Y_NO2_dev = get_Y(Y, NO2_dev)

# xgboost
xgb_model = xgb.XGBRegressor(max_depth=7, n_estimators=200, reg_lambda=1)
xgb_model.fit(NO2_train_f, Y_NO2_train,
              eval_set=[(NO2_dev_f, Y_NO2_dev)],
              eval_metric="rmse")
evaluate_mse(xgb_model, NO2_train_f, NO2_dev_f,
if __name__ == "__main__":
    if len(argv) != 5:
        print """USAGE: format_data.py <data file> <target sequence index> <guide sequence index> <label sequence index>
Writes four files named raw.tab, train.tab, val.tab, test.tab. Train, val and
test have expanded data with features, while raw will just contain the target,
guide, and label."""
        exit()
    filename, target_i, guide_i, label_i = argv[1:5]
    target_i, guide_i, label_i = int(target_i), int(guide_i), int(label_i)
    data = import_azimuth_data(filename, target_i, guide_i, label_i)  # Target, guide, label
    mix_data = shuffle_data(data)
    feature_data = make_features(mix_data)
    train, val, test = split_train_val_test(feature_data)

    with open("raw.tab", "w") as f:
        for row in data:
            entry = [str(x) for x in row]
            f.write("\t".join(entry) + "\n")
    with open("train.tab", "w") as f:
        for row in train:
            entry = [str(x) for x in row]
            f.write("\t".join(entry) + "\n")
    with open("val.tab", "w") as f:
        for row in val:
import sys

import numpy as np
import scipy.io.wavfile as wav
from keras.models import model_from_json

from features import make_features

folder_name = "5second"

# read the audio file passed on the command line and calculate mfcc
audio_file_name = sys.argv[1]
(rate, sig) = wav.read(audio_file_name)
mfcc_feat = make_features(sig, rate, winlen=0.1, winstep=0.05,
                          lowfreq=50, highfreq=5000)
# replace NaN values so the fixed-size crop below keeps its 2-D shape
mfcc_feat = np.nan_to_num(mfcc_feat)

# create feature
img_rows, img_cols = 401, 13
X_test = np.array([], dtype='float32')
mfcc_feat = mfcc_feat[0:401, :]
image = np.array([mfcc_feat])
if len(image) == 1:
    if len(X_test) == 0:
        X_test = np.array([image])
    else:
        X_test = np.vstack([X_test, np.array([image])])
X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
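# Hedged continuation sketch (assumed): the imported model_from_json is presumably
# used to load a trained network and score X_test; the file names below are
# hypothetical, not taken from the source.
with open(folder_name + "/model.json") as jf:
    model = model_from_json(jf.read())
model.load_weights(folder_name + "/weights.h5")
print(model.predict(X_test))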
def main():
    all_data = compile_and_normalize()
    mix_data = shuffle_data(all_data)
    feature_data = make_features(mix_data)
    train_data, val_data, test_data = split_train_val_test(feature_data)

    modelfiles = [
        "data/CRISPOR_readFraction_off_target/CRISPOR_readFraction_off_target.joblib",
        "data/Azimuth/Azimuth.joblib",
        "data/Res6tg/Res6tg.joblib",
        "data/Rule_set_1_log2change_on_target/Rule_set_1_log2change_on_target.joblib"
    ]
    featurefiles = [
        "data/CRISPOR_readFraction_off_target/CRISPOR_readFraction_off_target_features.txt",
        "data/Azimuth/Azimuth_features.txt",
        "data/Res6tg/Res6tg_features.txt",
        "data/Rule_set_1_log2change_on_target/Rule_set_1_log2change_on_target_features.txt"
    ]

    ensemble = []
    for modelfile, featurefile in zip(modelfiles, featurefiles):
        print modelfile
        print featurefile
        model = load(modelfile)
        features = get_features(featurefile)
        ensemble.append((model, features))

    train_x, train_y = split_x_y(train_data)
    val_x, val_y = split_x_y(val_data)
    test_x, test_y = split_x_y(test_data)

    featureselector = SelectFromModel(RandomForestRegressor(), max_features=100)
    regressor = RandomForestRegressor()
    featureselector.fit(train_x, train_y)
    features = featureselector.get_support(indices=True)
    selected_train_x = train_x[:, features]
    selected_val_x = val_x[:, features]
    selected_test_x = test_x[:, features]

    regressor.fit(selected_train_x, train_y)
    train_predictions = regressor.predict(selected_train_x)
    val_predictions = regressor.predict(selected_val_x)
    test_predictions = regressor.predict(selected_test_x)
    train_error = mean_squared_error(train_y, train_predictions)
    validation_error = mean_squared_error(val_y, val_predictions)
    test_error = mean_squared_error(test_y, test_predictions)

    for model, features in ensemble:
        add_train_x = add_model_feature(model, features, train_x)
        add_val_x = add_model_feature(model, features, val_x)
        add_test_x = add_model_feature(model, features, test_x)

    # model, features = select_features_and_model(add_train_x, train_y, add_val_x, val_y, "all_data_toplayer")
    modelfile = "all_data_toplayer.joblib"
    featurefile = "all_data_toplayer_features.txt"
    model = load(modelfile)
    features = get_features(featurefile)
    train_pred = model.predict(add_train_x[:, features])
    val_pred = model.predict(add_val_x[:, features])
    test_pred = model.predict(add_test_x[:, features])
    train_MSE = mean_squared_error(train_y, train_pred)
    val_MSE = mean_squared_error(val_y, val_pred)
    test_MSE = mean_squared_error(test_y, test_pred)

    with open("all_data_top_layer_MSE.txt", "w") as f:
        f.write("ensemble train MSE: " + str(train_MSE) + "\n")
        f.write("ensemble val MSE: " + str(val_MSE) + "\n")
        f.write("ensemble test MSE: " + str(test_MSE) + "\n")
        f.write("train MSE: " + str(train_error) + "\n")
        f.write("val MSE: " + str(validation_error) + "\n")
        f.write("test MSE: " + str(test_error) + "\n")
import features
import parse_scores as parse

score_path = 'score_data/'
profile_path = 'profile_data/'

records = parse.parse_score_file(score_path + 'franco_scores.csv')
features = features.make_features(profile_path + 'data_analysis_random_profiles.json')

record_dict = {}
for record in records:
    record_dict[(record[0], record[1])] = [record[2], None]
for feature in features:
    if feature.id in record_dict.keys():
        record_dict[feature.id][1] = feature

id_dict = {}
for key in record_dict.keys():
    profile_id = key[0]
    if profile_id in id_dict.keys():
        id_dict[profile_id] += [record_dict[key]]
    else:
        id_dict[profile_id] = [record_dict[key]]

id_groups = [[pair for pair in group if pair[1]] for group in id_dict.values()]
data = [pair for pair in record_dict.values() if pair[1]]