Exemple #1
0
def transform(dataset, context):
    from pandas import DataFrame
    table_name = None
    connection_string = "Driver=ODBC Driver 13 for SQL Server;Server=13.91.49.253;Database=Hospital;Uid=revotester;Pwd=T3sterPwd"

    def detect_table(table_name, connection_string):
        from revoscalepy import RxSqlServerData, rx_import
        detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name),
                                 connection_string=connection_string)
        does_exist = rx_import(detect_sql)
        if does_exist.iloc[0,0] == 1: return True
        else: return False

    missing = detect_table("LoS0", connection_string)
    if missing is False:
        table_name = "LengthOfStay"
    else:
        table_name = "LoS0"

    LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name, connection_string=connection_string)

    # Get the mean and standard deviation of those variables.

    col_list = rx_get_var_names(LengthOfStay_cleaned_sql)
    f = "+".join(col_list)
    summary = rx_summary(formula=f, data=LengthOfStay_cleaned_sql, by_term=True).summary_data_frame

    names = ["hematocrit", "neutrophils", "sodium", "glucose", "bloodureanitro", "creatinine", "bmi", "pulse", "respiration"]
    statistics = summary[summary["Name"].isin(names)]
    statistics = statistics[["Name", "Mean", "StdDev"]]
    
    # standardization transform function
    def standardize(data, context):
        for n, row in statistics.iterrows():
            data[[row["Name"]]] = (data[[row["Name"]]] - row["Mean"])/row["StdDev"]
        return data

    # number_of_issues transform function
    def calculate_number_of_issues(data, context):
        data["number_of_issues"] = to_numeric(data["hemo"]) + to_numeric(data["dialysisrenalendstage"]) + to_numeric(data["asthma"])\
                                  + to_numeric(data["irondef"]) + to_numeric(data["pneum"]) + to_numeric(data["substancedependence"])\
                                  + to_numeric(data["psychologicaldisordermajor"]) + to_numeric(data["depress"])\
                                  + to_numeric(data["psychother"]) + to_numeric(data["fibrosisandother"]) + to_numeric(data["malnutrition"])
        return data

    data = DataFrame(dataset)
    data = standardize(data, context)
    data = calculate_number_of_issues(data, context)
    return data
Exemple #2
0
model_type = "linear"
conn_str = 'Driver=SQL Server;Server=<Server Name>;Database=MLDB;Uid=<User Name>;Pwd=<Password>;'
cnxn = pyodbc.connect(conn_str)
inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015'
rental_train_data = pd.read_sql(inputsql, cnxn)

# Used to sample data
# train = rental_train_data.sample(frac=0.8, random_state=1)
# test = rental_train_data.loc[~ rental_train_data.index.isin(train.index)]
# print("Train {0} / test {1}".format(len(train), len(test)))

rental_train_data["Holiday"] = rental_train_data["Holiday"].astype("category")
rental_train_data["Snow"] = rental_train_data["Snow"].astype("category")
rental_train_data["WeekDay"] = rental_train_data["WeekDay"].astype("category")

if model_type == "linear":
    linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = rental_train_data)
    trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True)
if model_type == "dtree":
	dtree_model = rx_dtree("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = rental_train_data)
	trained_model = rx_serialize_model(dtree_model, realtime_scoring_only = True)

print(rx_summary("~ Month + Day + WeekDay + Snow + Holiday", rental_train_data))

# Dump learned model to file
with open(r'c:\temp\trained_model.pickle', mode='wb') as f:
    f.write(trained_model)

cursor=cnxn.cursor()
cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", (model_type + "_model", "Python", trained_model))
cnxn.commit()
# SQL Server data
# Importing data from SQL Server using pyodbc
# Using the revoscalepy library
import numpy as np
import pandas as pd
import pyodbc
from revoscalepy import rx_lin_mod, rx_predict, rx_summary

# Connecting and reading the data
con = pyodbc.connect('DSN=AWDW;UID=RUser;PWD=Pa$$w0rd')
query = """SELECT CustomerKey, Age,
             YearlyIncome, TotalChildren,
             NumberCarsOwned
           FROM dbo.vTargetMail;"""
TM = pd.read_sql(query, con)
TM.head(5)
TM.shape

# Check the summary of the NumberCarsOwned
summary = rx_summary("NumberCarsOwned", TM)
print(summary)

# Create a linear model
linmod = rx_lin_mod("NumberCarsOwned ~ YearlyIncome + Age + TotalChildren",
                    data=TM)
predmod = rx_predict(linmod, data=TM, output_data=TM)
predmod.head(10)

# End of script
Exemple #4
0
    "sa", os.getenv("SQLPass"))
cnxn = pyodbc.connect(conn_str)
inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015'
rental_train_data = pd.read_sql(inputsql, cnxn)

rental_train_data["Holiday"] = rental_train_data["Holiday"].astype("category")
rental_train_data["Snow"] = rental_train_data["Snow"].astype("category")
rental_train_data["WeekDay"] = rental_train_data["WeekDay"].astype("category")

linmod_model = rx_lin_mod(
    "RentalCount ~ Month + Day + WeekDay + Snow + Holiday",
    data=rental_train_data)
trained_model = rx_serialize_model(linmod_model, realtime_scoring_only=True)

print(
    rx_summary("RentalCount ~ Month + Day + WeekDay + Snow + Holiday",
               rental_train_data))

# Dump learned model to file
with open(r'c:\model\trained_model.pickle', mode='wb') as f:
    f.write(trained_model)

# Dump learned model to Table
cursor = cnxn.cursor()
cursor.execute(\
'''
MERGE rental_models AS target
USING (SELECT ? as model_name) AS source
ON(target.model_name = source.model_name)
WHEN MATCHED THEN UPDATE SET native_model = ?
WHEN NOT MATCHED BY TARGET THEN INSERT (model_name, lang, native_model) VALUES(?,?,?);
''', \
Exemple #5
0
conn_str = 'Driver=SQL Server;Server=SoL01.techsummit.local;Database=MLDB;Uid={0};Pwd={1};'.format("sa",os.getenv("SQLPass"))
cnxn = pyodbc.connect(conn_str)
inputsql ='''
SELECT C1, C2 FROM (
VALUES
('A',1),('B',2),('C',3),('D',4),('E',5),('F',6),('G',7),('H',8),('I',9),('J',10),
('K',11),('L',12),('M',13),('N',14),('O',15),('P',16),('Q',17),('R',18),('S',19),('T',20),
('U',21),('V',22),('W',23),('X',24),('Y',25),('Z',26)
) AS T1(C1, C2)
'''
train_data = pd.read_sql(inputsql, cnxn)
train_data["C1"] = train_data["C1"].astype("category")
print(train_data["C1"] )

# model = rx_lin_mod("C2 ~ C1", data = train_data)
model = rx_fast_linear("C2 ~ C1", data = train_data, method = "regression")
print(rx_summary("C2 ~ C1", train_data))

trained_model = rx_serialize_model(model, realtime_scoring_only = True)
# Dump learned model to file
with open(r'c:\temp\trained_model_{0}.pickle'.format(model_file), mode='wb') as f:
    f.write(trained_model)


p_data = pd.DataFrame([0,5,10,99,4], columns=["C1"])
p_data["C1"] = train_data["C1"].astype("category")
print(p_data["C1"])

pred = rx_predict(model, data = p_data)
print(pred)
Exemple #6
0
from sklearn import datasets
import pandas as pd
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(df)
import revoscalepy
from revoscalepy import rx_summary
summary = rx_summary("petal length (cm)", df)
print(summary)
# Import the DeployClient and MLServer classes from 
# the azureml-model-management-sdk package so you can 
# connect to Machine Learning Server (use=MLServer).

from azureml.deploy import DeployClient
from azureml.deploy.server import MLServer

# Define the location of the Machine Learning Server
HOST = 'http://*****:*****@')
client = DeployClient(HOST, use=MLServer, auth=context)
         "Month" : { "type" : "integer" },
         "Day" : { "type" : "integer" },
         "RentalCount" : { "type" : "integer" },
         "WeekDay" : {
             "type" : "factor",
             "levels" : ["1", "2", "3", "4", "5", "6", "7"]
         },
         "Holiday" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         },
         "Snow" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         }
     }
data_source = RxSqlServerData(sql_query=inputsql, connection_string=conn_str, column_info=column_info)

linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = data_source, computeContext = cc)
trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True)

with open(r'c:\temp\trained_model.pickle', mode='wb') as f:
    f.write(trained_model)

print(rx_summary("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data_source))


cnxn = pyodbc.connect(conn_str)
cursor=cnxn.cursor()
cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", ("linear_model", "Python", trained_model))
cnxn.commit()
# First, get the names and types of the variables to be treated.
# For rxSummary to give correct info on characters, stringsAsFactors = True should be used.
LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay",
                                    connection_string=connection_string,
                                    stringsAsFactors=True)

#col = rxCreateColInfo(LengthOfStay_sql2)    # Not yet available
colnames = rx_get_var_names(LengthOfStay_sql2)

# Then, get the names of the variables that actually have missing values. Assumption: no NA in eid, lengthofstay, or dates.
var = [
    x for x in colnames
    if x not in ["eid", "lengthofstay", "vdate", "discharged"]
]
f = "+".join(var)
summary = rx_summary(formula=f, data=LengthOfStay_sql2,
                     by_term=True).summary_data_frame
var_with_NA = summary[summary["MissingObs"] > 0]

method = None
if var_with_NA.empty:
    print("No missing values.")
    print("You can move to step 2.")
    missing = False
else:
    print("Variables containing missing values are:")
    print(var_with_NA)
    print("Apply one of the methods below to fill missing values.")
    missing = True
    method = "missing"
    #method = "mean_mode"
def main(tablename, inputdf, overwrite=False):
    """Imports a data source into SQL Server table and, operating
       in-database, uses logistic regression to create a
       predictive model.

       A comparison to an out-of-database, in memory method
       is performed (TODO)
       
       Parameters
       ----------

       tablename : str
           The new or previosly create table name in database.

       inputdf : RxTextData object
           The data source

       overwrite : bool, optional (default=False)
           Whether or not to overwrite the table.  
           Set to True if this is a new table, otherwise False.
       """

    ####################################################################
    # Set the compute context to SQL SERVER
    ####################################################################

    # NB: don't need, but would be good to know what this actually does here
    # RxComputeContext(LOCAL, '9.1')

    compute_context = RxInSqlServer(connection_string=CONNECTION_STRING)
    # ,
    # num_tasks = 1,
    # auto_cleanup = False,
    # console_output=True
    # )

    rx_set_compute_context(compute_context)

    # if overwrite:

    ####################################################################
    # Create table in SQL server
    ####################################################################

    print("Creating tables...")
    data_source = RxSqlServerData(table=tablename,
                                  connection_string=CONNECTION_STRING)

    ####################################################################
    # Read data into the SQL server table that was just created
    ####################################################################

    print("Reading data into tables...")

    rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True)

    #####################################################################
    # Set up a query on table for train and test data (and ensure factor levels)
    #####################################################################
    print("Setting up query and datasources for train and test sets...")

    # Train data
    data_source_train = RxSqlServerData(
        sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id",
        connection_string=CONNECTION_STRING,
        verbose=True)

    # Import training data RxImport style from new query source
    X_y_train = rx_import(data_source_train)
    # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True)

    print("Test data...")
    # Test data (let's pick ~30% size of training dataset)
    data_source_test = RxSqlServerData(
        sql_query="SELECT * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id \
                    OFFSET 10000 ROWS \
                    FETCH FIRST 3000 ROW ONLY",
        connection_string=CONNECTION_STRING)

    # Import data RxImport style from new query source
    X_y_test = rx_import(data_source_test)
    # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True)

    #####################################################################
    # Run revoscalepy logistic regression on training data (in-database)
    #####################################################################

    print('Fitting a logistic regression model...')

    mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                   data=X_y_train,
                   compute_context=compute_context,
                   verbose=2)
    assert mod is not None
    assert mod._results is not None
    pprint(mod._results)

    #####################################################################
    # Summary on training data (in-database)
    #####################################################################

    # Note:  for "data" use data source and not the rximport'ed data

    print('\nSummary: \n')

    summary = rx_summary("Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                         data=data_source_train,
                         compute_context=compute_context,
                         cube=True,
                         verbose=2)

    #####################################################################
    # Predict on test data (in-database)
    #####################################################################

    print("\nPredict on test: \n")

    pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True)

    #####################################################################
    # Metrics for predition based on groundtruth (with scikit-learn tools)
    #####################################################################

    pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred']
    # For some reason the prediction results are not in a binary [0,1] format
    y_pred = binarize(pred_results, threshold=(min(pred_results) + \
                        max(pred_results))/2).reshape(-1, 1)
    y_true = pred._data['Annual_Income_Bucket_gt120k']

    print("Model prediction results:", y_pred)
    print("Actual values: ", y_true)

    print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))