def transform(dataset, context): from pandas import DataFrame table_name = None connection_string = "Driver=ODBC Driver 13 for SQL Server;Server=13.91.49.253;Database=Hospital;Uid=revotester;Pwd=T3sterPwd" def detect_table(table_name, connection_string): from revoscalepy import RxSqlServerData, rx_import detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name), connection_string=connection_string) does_exist = rx_import(detect_sql) if does_exist.iloc[0,0] == 1: return True else: return False missing = detect_table("LoS0", connection_string) if missing is False: table_name = "LengthOfStay" else: table_name = "LoS0" LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name, connection_string=connection_string) # Get the mean and standard deviation of those variables. col_list = rx_get_var_names(LengthOfStay_cleaned_sql) f = "+".join(col_list) summary = rx_summary(formula=f, data=LengthOfStay_cleaned_sql, by_term=True).summary_data_frame names = ["hematocrit", "neutrophils", "sodium", "glucose", "bloodureanitro", "creatinine", "bmi", "pulse", "respiration"] statistics = summary[summary["Name"].isin(names)] statistics = statistics[["Name", "Mean", "StdDev"]] # standardization transform function def standardize(data, context): for n, row in statistics.iterrows(): data[[row["Name"]]] = (data[[row["Name"]]] - row["Mean"])/row["StdDev"] return data # number_of_issues transform function def calculate_number_of_issues(data, context): data["number_of_issues"] = to_numeric(data["hemo"]) + to_numeric(data["dialysisrenalendstage"]) + to_numeric(data["asthma"])\ + to_numeric(data["irondef"]) + to_numeric(data["pneum"]) + to_numeric(data["substancedependence"])\ + to_numeric(data["psychologicaldisordermajor"]) + to_numeric(data["depress"])\ + to_numeric(data["psychother"]) + to_numeric(data["fibrosisandother"]) + to_numeric(data["malnutrition"]) return data data = DataFrame(dataset) data = standardize(data, context) data = calculate_number_of_issues(data, context) return data
model_type = "linear" conn_str = 'Driver=SQL Server;Server=<Server Name>;Database=MLDB;Uid=<User Name>;Pwd=<Password>;' cnxn = pyodbc.connect(conn_str) inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015' rental_train_data = pd.read_sql(inputsql, cnxn) # Used to sample data # train = rental_train_data.sample(frac=0.8, random_state=1) # test = rental_train_data.loc[~ rental_train_data.index.isin(train.index)] # print("Train {0} / test {1}".format(len(train), len(test))) rental_train_data["Holiday"] = rental_train_data["Holiday"].astype("category") rental_train_data["Snow"] = rental_train_data["Snow"].astype("category") rental_train_data["WeekDay"] = rental_train_data["WeekDay"].astype("category") if model_type == "linear": linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = rental_train_data) trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True) if model_type == "dtree": dtree_model = rx_dtree("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = rental_train_data) trained_model = rx_serialize_model(dtree_model, realtime_scoring_only = True) print(rx_summary("~ Month + Day + WeekDay + Snow + Holiday", rental_train_data)) # Dump learned model to file with open(r'c:\temp\trained_model.pickle', mode='wb') as f: f.write(trained_model) cursor=cnxn.cursor() cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", (model_type + "_model", "Python", trained_model)) cnxn.commit()
# SQL Server data # Importing data from SQL Server using pyodbc # Using the revoscalepy library import numpy as np import pandas as pd import pyodbc from revoscalepy import rx_lin_mod, rx_predict, rx_summary # Connecting and reading the data con = pyodbc.connect('DSN=AWDW;UID=RUser;PWD=Pa$$w0rd') query = """SELECT CustomerKey, Age, YearlyIncome, TotalChildren, NumberCarsOwned FROM dbo.vTargetMail;""" TM = pd.read_sql(query, con) TM.head(5) TM.shape # Check the summary of the NumberCarsOwned summary = rx_summary("NumberCarsOwned", TM) print(summary) # Create a linear model linmod = rx_lin_mod("NumberCarsOwned ~ YearlyIncome + Age + TotalChildren", data=TM) predmod = rx_predict(linmod, data=TM, output_data=TM) predmod.head(10) # End of script
"sa", os.getenv("SQLPass")) cnxn = pyodbc.connect(conn_str) inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015' rental_train_data = pd.read_sql(inputsql, cnxn) rental_train_data["Holiday"] = rental_train_data["Holiday"].astype("category") rental_train_data["Snow"] = rental_train_data["Snow"].astype("category") rental_train_data["WeekDay"] = rental_train_data["WeekDay"].astype("category") linmod_model = rx_lin_mod( "RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data=rental_train_data) trained_model = rx_serialize_model(linmod_model, realtime_scoring_only=True) print( rx_summary("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", rental_train_data)) # Dump learned model to file with open(r'c:\model\trained_model.pickle', mode='wb') as f: f.write(trained_model) # Dump learned model to Table cursor = cnxn.cursor() cursor.execute(\ ''' MERGE rental_models AS target USING (SELECT ? as model_name) AS source ON(target.model_name = source.model_name) WHEN MATCHED THEN UPDATE SET native_model = ? WHEN NOT MATCHED BY TARGET THEN INSERT (model_name, lang, native_model) VALUES(?,?,?); ''', \
conn_str = 'Driver=SQL Server;Server=SoL01.techsummit.local;Database=MLDB;Uid={0};Pwd={1};'.format("sa",os.getenv("SQLPass")) cnxn = pyodbc.connect(conn_str) inputsql =''' SELECT C1, C2 FROM ( VALUES ('A',1),('B',2),('C',3),('D',4),('E',5),('F',6),('G',7),('H',8),('I',9),('J',10), ('K',11),('L',12),('M',13),('N',14),('O',15),('P',16),('Q',17),('R',18),('S',19),('T',20), ('U',21),('V',22),('W',23),('X',24),('Y',25),('Z',26) ) AS T1(C1, C2) ''' train_data = pd.read_sql(inputsql, cnxn) train_data["C1"] = train_data["C1"].astype("category") print(train_data["C1"] ) # model = rx_lin_mod("C2 ~ C1", data = train_data) model = rx_fast_linear("C2 ~ C1", data = train_data, method = "regression") print(rx_summary("C2 ~ C1", train_data)) trained_model = rx_serialize_model(model, realtime_scoring_only = True) # Dump learned model to file with open(r'c:\temp\trained_model_{0}.pickle'.format(model_file), mode='wb') as f: f.write(trained_model) p_data = pd.DataFrame([0,5,10,99,4], columns=["C1"]) p_data["C1"] = train_data["C1"].astype("category") print(p_data["C1"]) pred = rx_predict(model, data = p_data) print(pred)
from sklearn import datasets import pandas as pd iris = datasets.load_iris() df = pd.DataFrame(iris.data, columns=iris.feature_names) print(df) import revoscalepy from revoscalepy import rx_summary summary = rx_summary("petal length (cm)", df) print(summary) # Import the DeployClient and MLServer classes from # the azureml-model-management-sdk package so you can # connect to Machine Learning Server (use=MLServer). from azureml.deploy import DeployClient from azureml.deploy.server import MLServer # Define the location of the Machine Learning Server HOST = 'http://*****:*****@') client = DeployClient(HOST, use=MLServer, auth=context)
"Month" : { "type" : "integer" }, "Day" : { "type" : "integer" }, "RentalCount" : { "type" : "integer" }, "WeekDay" : { "type" : "factor", "levels" : ["1", "2", "3", "4", "5", "6", "7"] }, "Holiday" : { "type" : "factor", "levels" : ["1", "0"] }, "Snow" : { "type" : "factor", "levels" : ["1", "0"] } } data_source = RxSqlServerData(sql_query=inputsql, connection_string=conn_str, column_info=column_info) linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = data_source, computeContext = cc) trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True) with open(r'c:\temp\trained_model.pickle', mode='wb') as f: f.write(trained_model) print(rx_summary("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data_source)) cnxn = pyodbc.connect(conn_str) cursor=cnxn.cursor() cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", ("linear_model", "Python", trained_model)) cnxn.commit()
# First, get the names and types of the variables to be treated. # For rxSummary to give correct info on characters, stringsAsFactors = True should be used. LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay", connection_string=connection_string, stringsAsFactors=True) #col = rxCreateColInfo(LengthOfStay_sql2) # Not yet available colnames = rx_get_var_names(LengthOfStay_sql2) # Then, get the names of the variables that actually have missing values. Assumption: no NA in eid, lengthofstay, or dates. var = [ x for x in colnames if x not in ["eid", "lengthofstay", "vdate", "discharged"] ] f = "+".join(var) summary = rx_summary(formula=f, data=LengthOfStay_sql2, by_term=True).summary_data_frame var_with_NA = summary[summary["MissingObs"] > 0] method = None if var_with_NA.empty: print("No missing values.") print("You can move to step 2.") missing = False else: print("Variables containing missing values are:") print(var_with_NA) print("Apply one of the methods below to fill missing values.") missing = True method = "missing" #method = "mean_mode"
def main(tablename, inputdf, overwrite=False): """Imports a data source into SQL Server table and, operating in-database, uses logistic regression to create a predictive model. A comparison to an out-of-database, in memory method is performed (TODO) Parameters ---------- tablename : str The new or previosly create table name in database. inputdf : RxTextData object The data source overwrite : bool, optional (default=False) Whether or not to overwrite the table. Set to True if this is a new table, otherwise False. """ #################################################################### # Set the compute context to SQL SERVER #################################################################### # NB: don't need, but would be good to know what this actually does here # RxComputeContext(LOCAL, '9.1') compute_context = RxInSqlServer(connection_string=CONNECTION_STRING) # , # num_tasks = 1, # auto_cleanup = False, # console_output=True # ) rx_set_compute_context(compute_context) # if overwrite: #################################################################### # Create table in SQL server #################################################################### print("Creating tables...") data_source = RxSqlServerData(table=tablename, connection_string=CONNECTION_STRING) #################################################################### # Read data into the SQL server table that was just created #################################################################### print("Reading data into tables...") rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True) ##################################################################### # Set up a query on table for train and test data (and ensure factor levels) ##################################################################### print("Setting up query and datasources for train and test sets...") # Train data data_source_train = RxSqlServerData( sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id", connection_string=CONNECTION_STRING, verbose=True) # Import training data RxImport style from new query source X_y_train = rx_import(data_source_train) # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True) print("Test data...") # Test data (let's pick ~30% size of training dataset) data_source_test = RxSqlServerData( sql_query="SELECT * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id \ OFFSET 10000 ROWS \ FETCH FIRST 3000 ROW ONLY", connection_string=CONNECTION_STRING) # Import data RxImport style from new query source X_y_test = rx_import(data_source_test) # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True) ##################################################################### # Run revoscalepy logistic regression on training data (in-database) ##################################################################### print('Fitting a logistic regression model...') mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=X_y_train, compute_context=compute_context, verbose=2) assert mod is not None assert mod._results is not None pprint(mod._results) ##################################################################### # Summary on training data (in-database) ##################################################################### # Note: for "data" use data source and not the rximport'ed data print('\nSummary: \n') summary = rx_summary("Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=data_source_train, compute_context=compute_context, cube=True, verbose=2) ##################################################################### # Predict on test data (in-database) ##################################################################### print("\nPredict on test: \n") pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True) ##################################################################### # Metrics for predition based on groundtruth (with scikit-learn tools) ##################################################################### pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred'] # For some reason the prediction results are not in a binary [0,1] format y_pred = binarize(pred_results, threshold=(min(pred_results) + \ max(pred_results))/2).reshape(-1, 1) y_true = pred._data['Annual_Income_Bucket_gt120k'] print("Model prediction results:", y_pred) print("Actual values: ", y_true) print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))