import os
import canedge_browser
from utils import setup_fs, load_dbc_files, ProcessData, MultiFrameDecoder


def process_tp_example(devices, dbc_path, tp_type):
    fs = setup_fs(s3=False)
    db_list = load_dbc_files(dbc_path)
    log_files = canedge_browser.get_log_files(fs, devices)

    proc = ProcessData(fs, db_list)

    for log_file in log_files:
        output_folder = "output" + log_file.replace(".MF4", "")
        os.makedirs(output_folder, exist_ok=True)

        df_raw, device_id = proc.get_raw_data(log_file)
        df_raw.to_csv(f"{output_folder}/tp_raw_data.csv")

        # replace transport protocol sequences with single frames
        tp = MultiFrameDecoder(tp_type)
        df_raw = tp.combine_tp_frames(df_raw)
        df_raw.to_csv(f"{output_folder}/tp_raw_data_combined.csv")

        # extract physical values from the combined frames as usual
        df_phys = proc.extract_phys(df_raw)
        df_phys.to_csv(f"{output_folder}/tp_physical_values.csv")

    print("Finished saving CSV output for devices:", devices)

import s3fs
from utils import load_dbc_files, ProcessData, MultiFrameDecoder
from utils_db import SetupInflux
import inputs as inp


def lambda_handler(event, context=None):
    bucket = event["Records"][0]["s3"]["bucket"]["name"]
    key = event["Records"][0]["s3"]["object"]["key"]
    log_files = [bucket + "/" + key]

    fs = s3fs.S3FileSystem(anon=False)
    db_list = load_dbc_files(inp.dbc_paths)

    # initialize connection to InfluxDB
    influx = SetupInflux(inp.influx_url, inp.token, inp.org_id,
                         inp.influx_bucket, inp.res)

    # process the log files and write extracted signals to InfluxDB
    proc = ProcessData(fs, db_list, inp.signals, inp.days_offset)

    for log_file in log_files:
        df_raw, device_id = proc.get_raw_data(log_file, inp.pw)

        if inp.tp_type != "":
            tp = MultiFrameDecoder(inp.tp_type)
            df_raw = tp.combine_tp_frames(df_raw)

        df_phys = proc.extract_phys(df_raw)
        proc.print_log_summary(device_id, log_file, df_phys)

        influx.write_signals(device_id, df_phys)
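For a quick local test of the handler, a hand-built S3 event can be passed in; the bucket name and object key below are purely illustrative placeholders, and the handler only reads the fields shown.

# local test sketch with a hypothetical S3 event (bucket and key are placeholders)
if __name__ == "__main__":
    test_event = {"Records": [{"s3": {
        "bucket": {"name": "my-canedge-bucket"},
        "object": {"key": "958D2219/00000001/00000001.MF4"},
    }}]}
    lambda_handler(test_event)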
Example #3
from utils import (
    setup_fs,
    load_dbc_files,
    list_log_files,
    ProcessData,
    MultiFrameDecoder,
)
from utils_db import SetupInflux
import inputs as inp

# initialize connection to InfluxDB + get latest data entries per device
influx = SetupInflux(inp.influx_url, inp.token, inp.org_id, inp.influx_bucket,
                     inp.res)
start_times = influx.get_start_times(inp.devices, inp.default_start,
                                     inp.dynamic)

# setup filesystem (local/S3), load DBC files and list log files for processing
fs = setup_fs(inp.s3, inp.key, inp.secret, inp.endpoint, inp.pw)
db_list = load_dbc_files(inp.dbc_paths)
log_files = list_log_files(fs, inp.devices, start_times, inp.pw)

# process log files and write extracted signals to InfluxDB
proc = ProcessData(fs, db_list, inp.signals, inp.days_offset)

for log_file in log_files:
    df_raw, device_id = proc.get_raw_data(log_file, inp.pw)

    if inp.tp_type != "":
        tp = MultiFrameDecoder(inp.tp_type)
        df_raw = tp.combine_tp_frames(df_raw)

    df_phys = proc.extract_phys(df_raw)
    proc.print_log_summary(device_id, log_file, df_phys)

    influx.write_signals(device_id, df_phys)
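The scripts above read their configuration from an inputs module. The sketch below lists the attributes actually referenced in this listing; every value is a placeholder or an assumption and would need to be replaced by real paths and credentials.

# inputs.py sketch (placeholder values only; attributes mirror those referenced above)
s3 = False                 # True to read log files from S3 instead of local disk
key, secret, endpoint = "", "", ""
pw = ""                    # password for encrypted log files, if any
devices = ["LOG/958D2219"]
dbc_paths = ["dbc_files/CSS-Electronics-SAE-J1939-DEMO.dbc"]
signals = []               # empty list: extract all signals
days_offset = None
tp_type = ""               # set to a TP type string to enable multiframe decoding (assumed values)
default_start = "2020-01-13 00:00:00"
dynamic = True             # resume from the latest InfluxDB entry per device

influx_url = "http://localhost:8086"
token = "<influxdb-token>"
org_id = "<influxdb-org-id>"
influx_bucket = "<influxdb-bucket>"
res = ""                   # passed to SetupInflux (name suggests a resampling resolution)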
Example #4
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
# preprocess, fit_model, eval_model and the ProcessData pipeline wrapper are assumed
# to come from this project's local modules


def fit_NSQIP(random_state=1, perform_cv=False):
    ''' Load NSQIP 2016-2018 > split 80/20 into training and test sets > train 4 models on the training set.
        Prints evaluation on the NSQIP 2016-2018 test set (not reported in manuscript).
        Saves the trained models as joblib pickle files.
        Params:
            - random_state : random state used for model training
            - perform_cv : perform 5-fold cross-validation on the training split for hyperparameter tuning;
                           requires keeping only 1 model in model_list and uncommenting the relevant hyperparameters
    '''
    # Read data
    data = pd.read_csv('puf16-18_lite_v4.csv')

    feat_to_exclude = ['PUFYEAR', 'CaseID', 'OTHBLEED', 'NOTHBLEED', 'DOTHBLEED', 'CPT', 'PRNCPTX',
                       'count', 'HtoODay', 'INOUT', 'SDSA', 'EMERGNCY', 'PRWBC',
                       'ASA', # removed ASA score as a predictor
                       'BLEEDDIS', 'RENAFAIL', 'STEROID',
                       'NOTHBLEED_d0'
                       ]
    feat_used = list(set(data.columns.to_list()) - set(feat_to_exclude))
    print(feat_used)
    # feat_used = ['percent_transfused']

    # Fill NA, normalize, and split into training and test sets
    X_train, X_val, y_train, y_val, imp, scaler = preprocess(
        data[feat_used], data.NOTHBLEED_d3)

    # Run in batch over a list of models
    model_list = [
        LogisticRegression(n_jobs=-1,
                           solver='saga',
                           penalty='elasticnet',
                           C=0.01,
                           l1_ratio=1.0),
        DecisionTreeClassifier(class_weight='balanced',
                               max_depth=9,
                               random_state=random_state),
        RandomForestClassifier(n_jobs=-1,
                               class_weight='balanced',
                               n_estimators=200,
                               random_state=random_state,
                               max_features=5),
        XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            n_jobs=-1,
            random_state=random_state,
            learning_rate=0.05,
            n_estimators=609,
            colsample_bytree=0.7,
            min_child_weight=4,
            max_depth=6,
        )
    ]

    for model in model_list:
        # Hyperparameter tuning
        if perform_cv:
            param_test = {  # Logistic Regression
                'C': [0.03, 0.01, 0.003, 0.001],
                'l1_ratio': [0]  #[0, 0.5, 1]
            }
            # param_test = { # Decision Tree
            #     'max_depth' : [8, 9, 10]
            #     }
            # param_test = { # Random Forest
            #     # 'max_depth':[None, 5, 10],
            #     # 'min_samples_split':[2, 20, 200],
            #     'max_features':[4, 5, 6]
            #     }
            # param_test = { # XGBoost
            #     'min_child_weight':[1, 3, 4, 5],
            #     'max_depth': [4, 5, 6, 7],
            #     'gamma':[0, 0.05, 0.1],
            #     'subsample':[1, 0.9, 0.8],
            #     'colsample_bytree':[0.75, 0.7, 0.6],
            #     }

            # perform grid search over available options
            grid = GridSearchCV(estimator=model,
                                param_grid=param_test,
                                scoring='average_precision',
                                n_jobs=-1)
            grid_result = grid.fit(X_train, y_train)
            print(grid_result.best_params_)
            print(grid_result.best_score_)
            print(
                pd.DataFrame(grid_result.cv_results_)[[
                    'params', 'mean_test_score', 'rank_test_score'
                ]])

            # set model parameters to the best values found by the grid search
            params = model.get_params()
            for k in grid_result.best_params_:
                params[k] = grid_result.best_params_[k]
            model.set_params(**params)

        # Fit on NSQIP 2016-2018 training set
        print('Test performance')
        model = fit_model(model, X_train, y_train)

        # Evaluate on NSQIP 2016-2018 test set, finding threshold for 95% sensitivity
        thresh = eval_model(model, X_val, y_val)
        print('threshold: {:.4f}'.format(thresh))

        # Save model
        data_pipeline = ProcessData(scaler, imp, None, None)
        data_pipeline.save_params(False, True, True, feat_used, thresh)
        data_pipeline.save_model(model)
        model_name = model.__class__.__name__
        joblib.dump(data_pipeline,
                    './result/' + model_name + '_pipeline.joblib')

        # Print parameters
        print('--------------------------')
        print(model.get_params())
        print('--------------------------\n\n')
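A typical invocation of the routine above; the arguments simply mirror the function's defaults, and perform_cv is only switched on when tuning a single model, as noted in the docstring.

# example invocation (enable perform_cv only with a single model in model_list)
if __name__ == '__main__':
    fit_NSQIP(random_state=1, perform_cv=False)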
from datetime import datetime, timezone

import canedge_browser
import numpy as np
import pandas as pd

from utils import setup_fs, load_dbc_files, restructure_data, add_custom_sig, ProcessData

# specify devices to process (from local/S3), DBC files and start time
devices = ["LOG/958D2219"]
dbc_paths = ["dbc_files/CSS-Electronics-SAE-J1939-DEMO.dbc"]
start = datetime(year=2020, month=1, day=13, hour=0, tzinfo=timezone.utc)

# setup filesystem (local/S3), load DBC files and list log files for processing
fs = setup_fs(s3=False, key="", secret="", endpoint="")
db_list = load_dbc_files(dbc_paths)
log_files = canedge_browser.get_log_files(fs, devices, start_date=start)
print(f"Found a total of {len(log_files)} log files")

# --------------------------------------------
# perform data processing of each log file
proc = ProcessData(fs, db_list, signals=[])
df_phys_all = pd.DataFrame()

for log_file in log_files:
    df_raw, device_id = proc.get_raw_data(log_file)
    df_phys = proc.extract_phys(df_raw)
    proc.print_log_summary(device_id, log_file, df_phys)

    df_phys_all = pd.concat([df_phys_all, df_phys])

# --------------------------------------------
# example: Add a custom signal
def ratio(s1, s2):
    return s2 / s1 if s1 else np.nan
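The ratio helper above is presumably intended for the add_custom_sig function imported at the top of this snippet. The call below is a sketch only: the signal names and the assumed argument order (dataframe, two source signal names, the combining function, the new signal's name) are assumptions rather than a confirmed signature.

# sketch: append a derived signal to the decoded data
# (signal names and the add_custom_sig argument order are assumptions)
df_phys_all = add_custom_sig(df_phys_all, "WheelBasedVehicleSpeed", "EngineSpeed", ratio, "RatioVehicleSpeedEngineSpeed")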