def process_tp_example(devices, dbc_paths, tp_type):
    fs = setup_fs(s3=False)
    db_list = load_dbc_files(dbc_paths)
    log_files = canedge_browser.get_log_files(fs, devices)

    proc = ProcessData(fs, db_list)

    for log_file in log_files:
        output_folder = "output" + log_file.replace(".MF4", "")
        if not os.path.exists(output_folder):
            os.makedirs(f"{output_folder}")

        df_raw, device_id = proc.get_raw_data(log_file)
        df_raw.to_csv(f"{output_folder}/tp_raw_data.csv")

        # replace transport protocol sequences with single frames
        tp = MultiFrameDecoder(tp_type)
        df_raw = tp.combine_tp_frames(df_raw)
        df_raw.to_csv(f"{output_folder}/tp_raw_data_combined.csv")

        # extract physical values as normal, now that the TP frames are combined
        df_phys = proc.extract_phys(df_raw)
        df_phys.to_csv(f"{output_folder}/tp_physical_values.csv")

    print("Finished saving CSV output for devices:", devices)
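# Hypothetical usage sketch (not part of the original script): the device folder,
# DBC path and tp_type below are placeholder values, assuming the decoder accepts
# a transport protocol identifier string such as "j1939".
if __name__ == "__main__":
    example_devices = ["LOG/958D2219"]              # placeholder device folder
    example_dbc_paths = ["dbc_files/tp_j1939.dbc"]  # placeholder DBC file
    process_tp_example(example_devices, example_dbc_paths, tp_type="j1939")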
def lambda_handler(event, context=None):
    bucket = event["Records"][0]["s3"]["bucket"]["name"]
    key = event["Records"][0]["s3"]["object"]["key"]
    log_files = [bucket + "/" + key]

    fs = s3fs.S3FileSystem(anon=False)
    db_list = load_dbc_files(inp.dbc_paths)

    # initialize connection to InfluxDB
    influx = SetupInflux(inp.influx_url, inp.token, inp.org_id, inp.influx_bucket, inp.res)

    # process the log files and write extracted signals to InfluxDB
    proc = ProcessData(fs, db_list, inp.signals, inp.days_offset)

    for log_file in log_files:
        df_raw, device_id = proc.get_raw_data(log_file, inp.pw)

        if inp.tp_type != "":
            tp = MultiFrameDecoder(inp.tp_type)
            df_raw = tp.combine_tp_frames(df_raw)

        df_phys = proc.extract_phys(df_raw)
        proc.print_log_summary(device_id, log_file, df_phys)
        influx.write_signals(device_id, df_phys)
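# Hypothetical local test of the handler above (not part of the original script):
# builds a minimal S3 event with placeholder bucket/key values, containing only the
# fields the handler actually reads, and invokes the handler directly.
if __name__ == "__main__":
    test_event = {
        "Records": [
            {
                "s3": {
                    "bucket": {"name": "my-canedge-bucket"},            # placeholder bucket
                    "object": {"key": "958D2219/00000001/00000001.MF4"},  # placeholder log file key
                }
            }
        ]
    }
    lambda_handler(test_event)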
from utils import (
    setup_fs,
    load_dbc_files,
    list_log_files,
    ProcessData,
    MultiFrameDecoder,
)
from utils_db import SetupInflux
import inputs as inp

# initialize connection to InfluxDB + get latest data entries per device
influx = SetupInflux(inp.influx_url, inp.token, inp.org_id, inp.influx_bucket, inp.res)
start_times = influx.get_start_times(inp.devices, inp.default_start, inp.dynamic)

# setup filesystem (local/S3), load DBC files and list log files for processing
fs = setup_fs(inp.s3, inp.key, inp.secret, inp.endpoint, inp.pw)
db_list = load_dbc_files(inp.dbc_paths)
log_files = list_log_files(fs, inp.devices, start_times, inp.pw)

# process log files and write extracted signals to InfluxDB
proc = ProcessData(fs, db_list, inp.signals, inp.days_offset)

for log_file in log_files:
    df_raw, device_id = proc.get_raw_data(log_file, inp.pw)

    if inp.tp_type != "":
        tp = MultiFrameDecoder(inp.tp_type)
        df_raw = tp.combine_tp_frames(df_raw)

    df_phys = proc.extract_phys(df_raw)
    proc.print_log_summary(device_id, log_file, df_phys)
    influx.write_signals(device_id, df_phys)
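# Illustrative sketch (an assumption, not the project's actual inputs.py): these are the
# fields the script above reads from the imported inputs module; every value below is a
# placeholder only.
#
# --- inputs.py (example) ---
# devices = ["my-bucket/958D2219"]                              # device folders to process
# dbc_paths = ["dbc_files/CSS-Electronics-SAE-J1939-DEMO.dbc"]  # DBC decoding rules
# signals = []                                                  # empty list = extract all signals
# days_offset = None
# tp_type = ""                                                  # e.g. "j1939" to enable multi-frame decoding
# s3, key, secret, endpoint, pw = False, "", "", "", ""
# influx_url, token, org_id, influx_bucket, res = "<url>", "<token>", "<org>", "<bucket>", ""
# default_start, dynamic = "2020-01-13 00:00:00", True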
def fit_NSQIP(random_state=1, perform_cv=False):
    '''
    Load NSQIP 2016-2018 > split 80/20 into training and testing > train 4 models on the training set.
    Prints evaluation on the NSQIP 2016-2018 test set (not reported in manuscript).
    Saves trained models as joblib pickle files.

    Params:
    - random_state : set random state for model training
    - perform_cv   : perform 5-fold cross-validation on the training split for hyperparameter tuning;
                     requires keeping only 1 model in the model list and uncommenting the relevant hyperparameters
    '''
    # Read data
    data = pd.read_csv('puf16-18_lite_v4.csv')
    feat_to_exclude = ['PUFYEAR', 'CaseID', 'OTHBLEED', 'NOTHBLEED', 'DOTHBLEED', 'CPT', 'PRNCPTX',
                       'count', 'HtoODay', 'INOUT', 'SDSA', 'EMERGNCY', 'PRWBC',
                       'ASA',  # removed ASA score as a predictor
                       'BLEEDDIS', 'RENAFAIL', 'STEROID', 'NOTHBLEED_d0']
    feat_used = list(set(data.columns.to_list()) - set(feat_to_exclude))
    print(feat_used)
    # feat_used = ['percent_transfused']

    # Fill NA, normalize, split to train, test, split
    X_train, X_val, y_train, y_val, imp, scaler = preprocess(data[feat_used], data.NOTHBLEED_d3)

    # Run in batch over a list of models
    model_list = [
        LogisticRegression(n_jobs=-1, solver='saga', penalty='elasticnet', C=0.01, l1_ratio=1.0),
        DecisionTreeClassifier(class_weight='balanced', max_depth=9, random_state=random_state),
        RandomForestClassifier(n_jobs=-1, class_weight='balanced', n_estimators=200,
                               random_state=random_state, max_features=5),
        XGBClassifier(objective='binary:logistic', booster='gbtree', n_jobs=-1,
                      random_state=random_state, learning_rate=0.05, n_estimators=609,
                      colsample_bytree=0.7, min_child_weight=4, max_depth=6)
    ]

    for model in model_list:
        # Hyperparameter tuning
        if perform_cv:
            param_test = {  # Logistic Regression
                'C': [0.03, 0.01, 0.003, 0.001],
                'l1_ratio': [0]  # [0, 0.5, 1]
            }
            # param_test = {  # Decision Tree
            #     'max_depth': [8, 9, 10]
            # }
            # param_test = {  # Random Forest
            #     # 'max_depth': [None, 5, 10],
            #     # 'min_samples_split': [2, 20, 200],
            #     'max_features': [4, 5, 6]
            # }
            # param_test = {  # XGBoost
            #     'min_child_weight': [1, 3, 4, 5],
            #     'max_depth': [4, 5, 6, 7],
            #     'gamma': [0, 0.05, 0.1],
            #     'subsample': [1, 0.9, 0.8],
            #     'colsample_bytree': [0.75, 0.7, 0.6],
            # }

            # perform grid search over available options
            grid = GridSearchCV(estimator=model, param_grid=param_test,
                                scoring='average_precision', n_jobs=-1)
            grid_result = grid.fit(X_train, y_train)
            print(grid_result.best_params_)
            print(grid_result.best_score_)
            print(pd.DataFrame(grid_result.cv_results_)[['params', 'mean_test_score', 'rank_test_score']])

            # set parameters to optimal parameters
            params = model.get_params()
            for k in grid_result.best_params_:
                params[k] = grid_result.best_params_[k]
            model.set_params(**params)

        # Fit on NSQIP 2016-2018 training set
        print('Test performance')
        model = fit_model(model, X_train, y_train)

        # Evaluate on NSQIP 2016-2018 test set, finding the threshold for 95% sensitivity
        thresh = eval_model(model, X_val, y_val)
        print('threshold: {:.4f}'.format(thresh))

        # Save model
        data_pipeline = ProcessData(scaler, imp, None, None)
        data_pipeline.save_params(False, True, True, feat_used, thresh)
        data_pipeline.save_model(model)
        model_name = model.__class__.__name__
        joblib.dump(data_pipeline, './result/' + model_name + '_pipeline.joblib')

        # Print parameters
        print('--------------------------')
        print(model.get_params())
        print('--------------------------\n\n')
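# Hypothetical example call (not part of the original script): trains the four models with
# the default random state; the commented-out second call shows how grid search would be
# enabled, which per the docstring assumes the model list has been trimmed to one model and
# the matching parameter grid has been uncommented.
if __name__ == "__main__":
    fit_NSQIP(random_state=1, perform_cv=False)
    # fit_NSQIP(random_state=1, perform_cv=True)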
import canedge_browser
import pandas as pd
import numpy as np
from datetime import datetime, timezone

from utils import setup_fs, load_dbc_files, restructure_data, add_custom_sig, ProcessData

# specify devices to process (from local/S3), DBC files and start time
devices = ["LOG/958D2219"]
dbc_paths = ["dbc_files/CSS-Electronics-SAE-J1939-DEMO.dbc"]
start = datetime(year=2020, month=1, day=13, hour=0, tzinfo=timezone.utc)

# setup filesystem (local/S3), load DBC files and list log files for processing
fs = setup_fs(s3=False, key="", secret="", endpoint="")
db_list = load_dbc_files(dbc_paths)
log_files = canedge_browser.get_log_files(fs, devices, start_date=start)
print(f"Found a total of {len(log_files)} log files")

# --------------------------------------------
# perform data processing of each log file
proc = ProcessData(fs, db_list, signals=[])
df_phys_all = pd.DataFrame()

for log_file in log_files:
    df_raw, device_id = proc.get_raw_data(log_file)
    df_phys = proc.extract_phys(df_raw)
    proc.print_log_summary(device_id, log_file, df_phys)
    # collect decoded data across log files (DataFrame.append was removed in pandas 2.x)
    df_phys_all = pd.concat([df_phys_all, df_phys])

# --------------------------------------------
# example: Add a custom signal
def ratio(s1, s2):
    return s2 / s1 if s1 else np.nan
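# Hypothetical continuation (not part of the snippet above): the imported add_custom_sig
# helper could apply the ratio function to two existing signals to derive a new one; the
# signal names and the assumed argument order (df, signal_1, signal_2, function, new_name)
# are illustrative only.
# df_phys_all = add_custom_sig(df_phys_all, "WheelBasedVehicleSpeed", "EngineSpeed", ratio, "RatioRpmSpeed")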