Beispiel #1
0
def create_formula(data, is_dataframe=False):
    """Build the model formula ``label ~ f1 + f2 + ...`` from the columns of *data*.

    Args:
        data: When ``is_dataframe`` is true, an object whose ``columns``
            attribute lists the column names (e.g. a pandas DataFrame);
            otherwise a data source whose variable names are read with
            ``revo.rx_get_var_names``.
        is_dataframe: Selects which of the two ways above is used to obtain
            the column names.

    Returns:
        The formula string. Every column except ``label``, ``image`` and
        ``class`` appears on the right-hand side, in its original order.
    """
    all_columns = (
        list(data.columns) if is_dataframe else revo.rx_get_var_names(data)
    )
    excluded = ["label", "image", "class"]

    predictors = []
    for column in all_columns:
        if column in excluded:
            continue
        predictors.append(column)

    return "label ~ " + " + ".join(predictors)
Beispiel #2
0
def transform(dataset, context):
    """Standardize selected numeric columns and add a ``number_of_issues`` column.

    The per-column mean and standard deviation are read with ``rx_summary``
    from the ``LoS0`` table when that table exists in the database, and from
    ``LengthOfStay`` otherwise.

    Args:
        dataset: Input rows; converted with ``pandas.DataFrame(dataset)``.
        context: Handed on to the two helper steps, which do not read it.

    Returns:
        The DataFrame with the columns listed in ``names`` replaced by
        ``(value - Mean) / StdDev`` and with ``number_of_issues`` added as the
        sum of the numeric values of the condition-flag columns.
    """
    # All library names are imported at function scope so that every name used
    # below resolves from inside this function, not only inside a nested helper.
    from pandas import DataFrame, to_numeric
    from revoscalepy import (
        RxSqlServerData,
        rx_get_var_names,
        rx_import,
        rx_summary,
    )

    # NOTE(review): server address and credentials are hard-coded here;
    # consider supplying them from configuration instead of source code.
    connection_string = "Driver=ODBC Driver 13 for SQL Server;Server=13.91.49.253;Database=Hospital;Uid=revotester;Pwd=T3sterPwd"

    def detect_table(table_name, connection_string):
        """Return True when ``table_name`` is listed in information_schema.tables."""
        detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name),
                                     connection_string=connection_string)
        does_exist = rx_import(detect_sql)
        # bool() keeps the result a plain Python bool rather than a NumPy one.
        return bool(does_exist.iloc[0, 0] == 1)

    # Use LoS0 when it exists; otherwise fall back to LengthOfStay.
    los0_exists = detect_table("LoS0", connection_string)
    table_name = "LoS0" if los0_exists else "LengthOfStay"

    LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name, connection_string=connection_string)

    # Get the mean and standard deviation of those variables.
    col_list = rx_get_var_names(LengthOfStay_cleaned_sql)
    f = "+".join(col_list)
    summary = rx_summary(formula=f, data=LengthOfStay_cleaned_sql, by_term=True).summary_data_frame

    names = ["hematocrit", "neutrophils", "sodium", "glucose", "bloodureanitro", "creatinine", "bmi", "pulse", "respiration"]
    statistics = summary[summary["Name"].isin(names)]
    statistics = statistics[["Name", "Mean", "StdDev"]]

    # standardization transform function
    def standardize(data, context):
        """Replace each column named in ``statistics`` by (value - Mean) / StdDev."""
        for _, row in statistics.iterrows():
            data[[row["Name"]]] = (data[[row["Name"]]] - row["Mean"])/row["StdDev"]
        return data

    # number_of_issues transform function
    def calculate_number_of_issues(data, context):
        """Add ``number_of_issues``: the sum of the numeric condition-flag columns."""
        issue_columns = [
            "hemo", "dialysisrenalendstage", "asthma", "irondef", "pneum",
            "substancedependence", "psychologicaldisordermajor", "depress",
            "psychother", "fibrosisandother", "malnutrition",
        ]
        # Accumulate left to right, converting each column with to_numeric.
        total = to_numeric(data[issue_columns[0]])
        for column in issue_columns[1:]:
            total = total + to_numeric(data[column])
        data["number_of_issues"] = total
        return data

    data = DataFrame(dataset)
    data = standardize(data, context)
    data = calculate_number_of_issues(data, context)
    return data
Beispiel #3
0
def create_formula(sql_data):
    """Build the model formula ``label ~ f1 + f2 + ...`` for *sql_data*.

    Args:
        sql_data: Data source whose variable names are read with
            ``rx_get_var_names``.

    Returns:
        The formula string. Every variable except ``label`` and
        ``patient_id`` appears on the right-hand side, in its original order.
    """
    excluded = ["label", "patient_id"]

    predictors = []
    for name in rx_get_var_names(sql_data):
        if name not in excluded:
            predictors.append(name)

    return "label ~ " + " + ".join(predictors)
##	Split the data set into a training and a testing set

##########################################################################################################################################

# Randomly split the data into a training set and a testing set, with a splitting % p.
# p % goes to the training set, and the rest goes to the testing set. Default is 70%.

# Percentage passed to train_test_split as the training share.
p = 70

## Create the Train_Id table containing the eid values of the training set.
# NOTE(review): train_test_split is defined outside this section; presumably
# its arguments are (id column, source table, output table, percentage,
# connection string) -- confirm against its definition.
train_test_split("eid", "LoS", "Train_Id", p, connection_string)

#rx_set_compute_context(sql)

## Point to the training set. It will be created on the fly when training models.
# Every variable of LoS except the identifier/date/facility columns below is
# selected; eid itself is added back explicitly in the SELECT list.
variables_all = rx_get_var_names(LoS)
variables_to_remove = ["eid", "vdate", "discharged", "facid"]
training_variables = [x for x in variables_all if x not in variables_to_remove]
# Training rows are those whose eid appears in Train_Id.
LoS_Train = RxSqlServerData(
    sql_query="SELECT eid, {} FROM LoS WHERE eid IN (SELECT eid from Train_Id)"
    .format(', '.join(training_variables)),
    connection_string=connection_string,
    string_as_factors=True)

# Column information derived from the training data source.
col_type_info = rx_create_col_info(LoS_Train)

## Point to the testing set. It will be created on the fly when testing models.
LoS_Test = RxSqlServerData(
    sql_query=
    "SELECT eid, {} FROM LoS WHERE eid NOT IN (SELECT eid from Train_Id)".
    format(', '.join(training_variables)),
# Run a data step from LoS_text into LengthOfStay_sql, overwriting any existing
# output. Both objects are defined outside this section.
rx_data_step(input_data=LoS_text, output_file=LengthOfStay_sql, overwrite=True)

##########################################################################################################################################

## Determine if LengthOfStay has missing values

##########################################################################################################################################

# First, get the names and types of the variables to be treated.
# For rxSummary to give correct info on characters, stringsAsFactors = True should be used.
# NOTE(review): the keyword is spelled `stringsAsFactors` here but
# `string_as_factors` where LoS_Train is built elsewhere in this file --
# confirm which spelling RxSqlServerData actually accepts.
LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay",
                                    connection_string=connection_string,
                                    stringsAsFactors=True)

#col = rxCreateColInfo(LengthOfStay_sql2)    # Not yet available
colnames = rx_get_var_names(LengthOfStay_sql2)

# Then, get the names of the variables that actually have missing values. Assumption: no NA in eid, lengthofstay, or dates.
var = [
    x for x in colnames
    if x not in ["eid", "lengthofstay", "vdate", "discharged"]
]
# Summarize all remaining variables in one formula ("a+b+c...").
f = "+".join(var)
summary = rx_summary(formula=f, data=LengthOfStay_sql2,
                     by_term=True).summary_data_frame
# Keep only the summary rows that report at least one missing observation.
var_with_NA = summary[summary["MissingObs"] > 0]

# Not read within this section; kept because later steps may use it.
method = None
if var_with_NA.empty:
    print("No missing values.")
    print("You can move to step 2.")
    # No missing values: keep reading from the original table. Without this
    # assignment table_name would be unbound on this branch.
    table_name = "LengthOfStay"
else:
    # Missing values were found: read from the LoS0 table instead.
    table_name = "LoS0"

# Data source for the table selected above; used by the feature-engineering step.
LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name,
                                           connection_string=connection_string)

##########################################################################################################################################

## Feature Engineering:
## 1- Standardization: hematocrit, neutrophils, sodium, glucose, bloodureanitro, creatinine, bmi, pulse, respiration.
## 2- Number of preidentified medical conditions: number_of_issues.

##########################################################################################################################################

# Get the mean and standard deviation of those variables.
# The summary is computed over every column of the cleaned table (all names
# joined with "+"), then narrowed to the variables that will be standardized.
col_list = rx_get_var_names(LengthOfStay_cleaned_sql)
f = "+".join(col_list)
summary = rx_summary(formula=f, data=LengthOfStay_cleaned_sql,
                     by_term=True).summary_data_frame

# Variables to standardize.
names = [
    "hematocrit", "neutrophils", "sodium", "glucose", "bloodureanitro",
    "creatinine", "bmi", "pulse", "respiration"
]
# Keep only the rows for those variables and the three columns read by the
# standardize() transform defined below.
statistics = summary[summary["Name"].isin(names)]
statistics = statistics[["Name", "Mean", "StdDev"]]


# standardization transform function
def standardize(data, context):
    for n, row in statistics.iterrows():