def create_formula(data, is_dataframe=False, *, label="label", features_to_remove=None):
    """Build a model formula ``"<label> ~ f1 + f2 + ..."`` from the columns of *data*.

    Args:
        data: Column source. With ``is_dataframe=True`` it must expose a
            ``columns`` attribute (e.g. a pandas DataFrame); otherwise it is
            handed to ``revo.rx_get_var_names``.
        is_dataframe: Selects how the column names are read from *data*.
        label: Name of the response column written left of ``~``. It is
            always kept out of the feature list.
        features_to_remove: Iterable of column names to leave out of the
            formula. ``None`` keeps the historical default of
            ``("label", "image", "class")``.

    Returns:
        The formula string. Feature order follows the column order of *data*;
        when no column survives the filter the right-hand side is empty.
    """
    if features_to_remove is None:
        # Historical hard-coded exclusions, kept as the default.
        features_to_remove = ("label", "image", "class")
    # A list (not a set) so membership works for any comparable column name.
    excluded = list(features_to_remove)
    excluded.append(label)

    if is_dataframe:
        features_all = list(data.columns)
    else:
        # NOTE(review): `revo` is not defined in this block; presumably a
        # module-level alias for revoscalepy -- confirm.
        features_all = revo.rx_get_var_names(data)

    training_features = [x for x in features_all if x not in excluded]
    return label + " ~ " + " + ".join(training_features)
def transform(dataset, context):
    """Standardize selected numeric columns of *dataset* and add ``number_of_issues``.

    On every call the function queries SQL Server: it checks whether table
    ``LoS0`` exists (falling back to ``LengthOfStay``), summarizes that table
    with ``rx_summary`` and uses the resulting Mean/StdDev to rescale the
    listed measurement columns. It then adds ``number_of_issues`` as the sum
    of the listed condition columns after ``to_numeric`` conversion.

    Args:
        dataset: Data accepted by ``pandas.DataFrame(...)``.
        context: Forwarded to the inner helpers, which do not read it.

    Returns:
        The DataFrame built from *dataset* with the columns rescaled and
        ``number_of_issues`` added.
    """
    from pandas import DataFrame

    # NOTE(review): server address and credentials are embedded in source;
    # consider moving them to configuration.
    connection_string = "Driver=ODBC Driver 13 for SQL Server;Server=13.91.49.253;Database=Hospital;Uid=revotester;Pwd=T3sterPwd"

    def detect_table(table_name, connection_string):
        """Return True when information_schema.tables lists *table_name*."""
        from revoscalepy import RxSqlServerData, rx_import

        probe_query = "IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name)
        probe = RxSqlServerData(sql_query=probe_query, connection_string=connection_string)
        answer = rx_import(probe)
        # The query returns a single cell: 1 if the table exists, else 0.
        return bool(answer.iloc[0, 0] == 1)

    # Use LoS0 when it exists, otherwise fall back to LengthOfStay.
    los0_exists = detect_table("LoS0", connection_string)
    source_table = "LoS0" if los0_exists else "LengthOfStay"

    # NOTE(review): RxSqlServerData, rx_get_var_names, rx_summary and
    # to_numeric are not imported inside this function; presumably they come
    # from module-level imports -- confirm.
    cleaned_source = RxSqlServerData(table=source_table, connection_string=connection_string)

    # Per-column summary of the whole table; only Mean and StdDev are used.
    all_columns = rx_get_var_names(cleaned_source)
    summary_formula = "+".join(all_columns)
    summary = rx_summary(formula=summary_formula, data=cleaned_source, by_term=True).summary_data_frame

    standardized_columns = [
        "hematocrit",
        "neutrophils",
        "sodium",
        "glucose",
        "bloodureanitro",
        "creatinine",
        "bmi",
        "pulse",
        "respiration",
    ]
    statistics = summary[summary["Name"].isin(standardized_columns)][["Name", "Mean", "StdDev"]]

    def standardize(data, context):
        """Rescale each summarized column to (value - Mean) / StdDev, in place."""
        for _, stat in statistics.iterrows():
            target = [stat["Name"]]
            data[target] = (data[target] - stat["Mean"]) / stat["StdDev"]
        return data

    def calculate_number_of_issues(data, context):
        """Add ``number_of_issues``: the sum of the condition columns."""
        condition_columns = [
            "hemo",
            "dialysisrenalendstage",
            "asthma",
            "irondef",
            "pneum",
            "substancedependence",
            "psychologicaldisordermajor",
            "depress",
            "psychother",
            "fibrosisandother",
            "malnutrition",
        ]
        # Accumulate left to right, matching a chained `a + b + c + ...`.
        total = to_numeric(data[condition_columns[0]])
        for column in condition_columns[1:]:
            total = total + to_numeric(data[column])
        data["number_of_issues"] = total
        return data

    frame = DataFrame(dataset)
    frame = standardize(frame, context)
    frame = calculate_number_of_issues(frame, context)
    return frame
def create_formula(sql_data):
    """Return ``"label ~ f1 + f2 + ..."`` over the variables of *sql_data*.

    Variable names come from ``rx_get_var_names(sql_data)``; ``label`` and
    ``patient_id`` are left out of the right-hand side. Order follows the
    order in which the names are returned.
    """
    excluded = ("label", "patient_id")
    predictors = []
    for column in rx_get_var_names(sql_data):
        if column in excluded:
            continue
        predictors.append(column)
    return "label ~ {}".format(" + ".join(predictors))
## Split the data set into a training and a testing set ########################################################################################################################################## # Randomly split the data into a training set and a testing set, with a splitting % p. # p % goes to the training set, and the rest goes to the testing set. Default is 70%. p = 70 ## Create the Train_Id table containing the eid of each row in the training set. train_test_split("eid", "LoS", "Train_Id", p, connection_string) #rx_set_compute_context(sql) ## Point to the training set. It will be created on the fly when training models. variables_all = rx_get_var_names(LoS) variables_to_remove = ["eid", "vdate", "discharged", "facid"] training_variables = [x for x in variables_all if x not in variables_to_remove] LoS_Train = RxSqlServerData( sql_query="SELECT eid, {} FROM LoS WHERE eid IN (SELECT eid from Train_Id)" .format(', '.join(training_variables)), connection_string=connection_string, string_as_factors=True) col_type_info = rx_create_col_info(LoS_Train) ## Point to the testing set. It will be created on the fly when testing models. LoS_Test = RxSqlServerData( sql_query= "SELECT eid, {} FROM LoS WHERE eid NOT IN (SELECT eid from Train_Id)". format(', '.join(training_variables)),
rx_data_step(input_data=LoS_text, output_file=LengthOfStay_sql, overwrite=True) ########################################################################################################################################## ## Determine if LengthOfStay has missing values ########################################################################################################################################## # First, get the names and types of the variables to be treated. # For rxSummary to give correct info on characters, stringsAsFactors = True should be used. LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay", connection_string=connection_string, stringsAsFactors=True) #col = rxCreateColInfo(LengthOfStay_sql2) # Not yet available colnames = rx_get_var_names(LengthOfStay_sql2) # Then, get the names of the variables that actually have missing values. Assumption: no NA in eid, lengthofstay, or dates. var = [ x for x in colnames if x not in ["eid", "lengthofstay", "vdate", "discharged"] ] f = "+".join(var) summary = rx_summary(formula=f, data=LengthOfStay_sql2, by_term=True).summary_data_frame var_with_NA = summary[summary["MissingObs"] > 0] method = None if var_with_NA.empty: print("No missing values.") print("You can move to step 2.")
else: table_name = "LoS0" LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name, connection_string=connection_string) ########################################################################################################################################## ## Feature Engineering: ## 1- Standardization: hematocrit, neutrophils, sodium, glucose, bloodureanitro, creatinine, bmi, pulse, respiration. ## 2- Number of preidentified medical conditions: number_of_issues. ########################################################################################################################################## # Get the mean and standard deviation of those variables. col_list = rx_get_var_names(LengthOfStay_cleaned_sql) f = "+".join(col_list) summary = rx_summary(formula=f, data=LengthOfStay_cleaned_sql, by_term=True).summary_data_frame names = [ "hematocrit", "neutrophils", "sodium", "glucose", "bloodureanitro", "creatinine", "bmi", "pulse", "respiration" ] statistics = summary[summary["Name"].isin(names)] statistics = statistics[["Name", "Mean", "StdDev"]] # standardization transform function def standardize(data, context): for n, row in statistics.iterrows():