from revoscalepy import RxInSqlServer, RxLocalSeq, rx_set_compute_context, RxSqlServerData from microsoftml import rx_predict as ml_predict from lung_cancer.connection_settings import get_connection_string, TABLE_CLASSIFIERS, TABLE_FEATURES, TABLE_PATIENTS, FASTTREE_MODEL_NAME from lung_cancer.lung_cancer_utils import retrieve_model # Connect to SQL Server and set compute context connection_string = get_connection_string() sql = RxInSqlServer(connection_string=connection_string) local = RxLocalSeq() rx_set_compute_context(local) # Specify patient to make prediction for PatientIndex = 20 # Select patient data query = "SELECT TOP(1) * FROM {} AS t1 INNER JOIN {} AS t2 ON t1.patient_id = t2.patient_id WHERE t2.idx = {}".format( TABLE_FEATURES, TABLE_PATIENTS, PatientIndex) patient_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) # Get classifier classifier = retrieve_model(TABLE_CLASSIFIERS, connection_string, FASTTREE_MODEL_NAME) # Make Prediction on a single patient predictions = ml_predict(classifier, data=patient_sql, extra_vars_to_write=["label", "patient_id"]) print("The probability of cancer for patient {} with patient_id {} is {}%".
from lung_cancer.lung_cancer_utils import insert_model, create_formula, roc, train_test_split from lung_cancer.connection_settings import get_connection_string, TABLE_PATIENTS, TABLE_CLASSIFIERS, TABLE_FEATURES, TABLE_TRAIN_ID, TABLE_PREDICTIONS, FASTTREE_MODEL_NAME print("Starting routine") # Set recursion limit to be slightly larger to accommodate larger formulas (which are paresed recursively) print("Old recursion limit: ", sys.getrecursionlimit()) sys.setrecursionlimit(1500) print("New recursion limit: ", sys.getrecursionlimit()) # Connect to SQL Server and set compute context connection_string = get_connection_string() sql = RxInSqlServer(connection_string=connection_string) local = RxLocalSeq() rx_set_compute_context(local) # Train Test Split resample = True if resample: print("Performing Train Test Split") p = 80 train_test_split(TABLE_TRAIN_ID, TABLE_PATIENTS, p, connection_string=connection_string) # Point to the SQL table with the training data column_info = {"label": {"type": "numeric"}} query = "SELECT * FROM {} WHERE patient_id IN (SELECT patient_id FROM {})".format( TABLE_FEATURES, TABLE_TRAIN_ID)
from revoscalepy import rx_import from pandas import Categorical import pandas as pd import pyodbc model_type = "linear" conn_str = 'Driver=SQL Server;Server=Win01.techsummit.local;Database=MLDB;Uid={0};Pwd={1};'.format("os.getenv("UID")", os.getenv("SQLPass")) inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015' cc = RxInSqlServer( connection_string = conn_str, num_tasks = 1, auto_cleanup = False ) local_cc = RxLocalSeq() previous_cc = rx_set_compute_context(cc) rx_get_compute_context() column_info = { "Year" : { "type" : "integer" }, "Month" : { "type" : "integer" }, "Day" : { "type" : "integer" }, "RentalCount" : { "type" : "integer" }, "WeekDay" : { "type" : "factor", "levels" : ["1", "2", "3", "4", "5", "6", "7"] }, "Holiday" : { "type" : "factor", "levels" : ["1", "0"] },
########################################################################################################################################## ## Compute Contexts and Packages ########################################################################################################################################## from pandas import DataFrame from pandas import to_numeric from revoscalepy import rx_get_var_names, RxSqlServerData, rx_summary, rx_set_compute_context, rx_data_step from SQLConnection import * from length_of_stay_utils import detect_table # Set the Compute Context to Sql. rx_set_compute_context(sql) ########################################################################################################################################## ## Input: Point to the SQL table with the cleaned raw data set ########################################################################################################################################## table_name = None missing = detect_table("LoS0", connection_string) if missing is False: table_name = "LengthOfStay" else: table_name = "LoS0"
import pandas as pd from revoscalepy import RxSqlServerData, rx_data_step, RxInSqlServer, RxLocalSeq, rx_set_compute_context from lung_cancer.connection_settings import get_connection_string, TABLE_LABELS, TABLE_FEATURES, IMAGES_FOLDER from lung_cancer.lung_cancer_utils import compute_features, gather_image_paths, average_pool print("Starting routine") # Connect to SQL Server connection_string = get_connection_string() sql = RxInSqlServer(connection_string=connection_string) local = RxLocalSeq() rx_set_compute_context(local) # Get paths to filetable images data = gather_image_paths(IMAGES_FOLDER, connection_string) data = data.head(1000) print("Number of images to featurize: {}".format(len(data))) # Featureize images print("Featurizing Images") rx_set_compute_context( local) # TODO: sql not working correctly. Change to sql later. featurized_data = compute_features(data) rx_set_compute_context(local) # Average Pooling print("Performing Average Pooling") pooled_data = average_pool(featurized_data) # Write features to table
def main(tablename, inputdf, overwrite=False): """Imports a data source into SQL Server table and, operating in-database, uses logistic regression to create a predictive model. A comparison to an out-of-database, in memory method is performed (TODO) Parameters ---------- tablename : str The new or previosly create table name in database. inputdf : RxTextData object The data source overwrite : bool, optional (default=False) Whether or not to overwrite the table. Set to True if this is a new table, otherwise False. """ #################################################################### # Set the compute context to SQL SERVER #################################################################### # NB: don't need, but would be good to know what this actually does here # RxComputeContext(LOCAL, '9.1') compute_context = RxInSqlServer(connection_string=CONNECTION_STRING) # , # num_tasks = 1, # auto_cleanup = False, # console_output=True # ) rx_set_compute_context(compute_context) # if overwrite: #################################################################### # Create table in SQL server #################################################################### print("Creating tables...") data_source = RxSqlServerData(table=tablename, connection_string=CONNECTION_STRING) #################################################################### # Read data into the SQL server table that was just created #################################################################### print("Reading data into tables...") rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True) ##################################################################### # Set up a query on table for train and test data (and ensure factor levels) ##################################################################### print("Setting up query and datasources for train and test sets...") # Train data data_source_train = RxSqlServerData( sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id", connection_string=CONNECTION_STRING, verbose=True) # Import training data RxImport style from new query source X_y_train = rx_import(data_source_train) # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True) print("Test data...") # Test data (let's pick ~30% size of training dataset) data_source_test = RxSqlServerData( sql_query="SELECT * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id \ OFFSET 10000 ROWS \ FETCH FIRST 3000 ROW ONLY", connection_string=CONNECTION_STRING) # Import data RxImport style from new query source X_y_test = rx_import(data_source_test) # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True) ##################################################################### # Run revoscalepy logistic regression on training data (in-database) ##################################################################### print('Fitting a logistic regression model...') mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=X_y_train, compute_context=compute_context, verbose=2) assert mod is not None assert mod._results is not None pprint(mod._results) ##################################################################### # Summary on training data (in-database) ##################################################################### # Note: for "data" use data source and not the rximport'ed data print('\nSummary: \n') summary = rx_summary("Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=data_source_train, compute_context=compute_context, cube=True, verbose=2) ##################################################################### # Predict on test data (in-database) ##################################################################### print("\nPredict on test: \n") pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True) ##################################################################### # Metrics for predition based on groundtruth (with scikit-learn tools) ##################################################################### pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred'] # For some reason the prediction results are not in a binary [0,1] format y_pred = binarize(pred_results, threshold=(min(pred_results) + \ max(pred_results))/2).reshape(-1, 1) y_true = pred._data['Annual_Income_Bucket_gt120k'] print("Model prediction results:", y_pred) print("Actual values: ", y_true) print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))