Exemple #1
0
from revoscalepy import RxInSqlServer, RxLocalSeq, rx_set_compute_context, RxSqlServerData
from microsoftml import rx_predict as ml_predict

from lung_cancer.connection_settings import get_connection_string, TABLE_CLASSIFIERS, TABLE_FEATURES, TABLE_PATIENTS, FASTTREE_MODEL_NAME
from lung_cancer.lung_cancer_utils import retrieve_model

# Connect to SQL Server and set compute context
connection_string = get_connection_string()
sql = RxInSqlServer(connection_string=connection_string)
local = RxLocalSeq()
rx_set_compute_context(local)

# Specify patient to make prediction for
PatientIndex = 20

# Select patient data
query = "SELECT TOP(1) * FROM {} AS t1 INNER JOIN {} AS t2 ON t1.patient_id = t2.patient_id WHERE t2.idx = {}".format(
    TABLE_FEATURES, TABLE_PATIENTS, PatientIndex)
patient_sql = RxSqlServerData(sql_query=query,
                              connection_string=connection_string)

# Get classifier
classifier = retrieve_model(TABLE_CLASSIFIERS, connection_string,
                            FASTTREE_MODEL_NAME)

# Make Prediction on a single patient
predictions = ml_predict(classifier,
                         data=patient_sql,
                         extra_vars_to_write=["label", "patient_id"])

print("The probability of cancer for patient {} with patient_id {} is {}%".
Exemple #2
0
from lung_cancer.lung_cancer_utils import insert_model, create_formula, roc, train_test_split
from lung_cancer.connection_settings import get_connection_string, TABLE_PATIENTS, TABLE_CLASSIFIERS, TABLE_FEATURES, TABLE_TRAIN_ID, TABLE_PREDICTIONS, FASTTREE_MODEL_NAME

print("Starting routine")

# Set recursion limit to be slightly larger to accommodate larger formulas (which are paresed recursively)
print("Old recursion limit: ", sys.getrecursionlimit())
sys.setrecursionlimit(1500)
print("New recursion limit: ", sys.getrecursionlimit())

# Connect to SQL Server and set compute context
connection_string = get_connection_string()
sql = RxInSqlServer(connection_string=connection_string)
local = RxLocalSeq()
rx_set_compute_context(local)

# Train Test Split
resample = True
if resample:
    print("Performing Train Test Split")
    p = 80
    train_test_split(TABLE_TRAIN_ID,
                     TABLE_PATIENTS,
                     p,
                     connection_string=connection_string)

# Point to the SQL table with the training data
column_info = {"label": {"type": "numeric"}}
query = "SELECT * FROM {} WHERE patient_id IN (SELECT patient_id FROM {})".format(
    TABLE_FEATURES, TABLE_TRAIN_ID)
from revoscalepy import rx_import
from pandas import Categorical
import pandas as pd
import pyodbc

model_type = "linear"
conn_str = 'Driver=SQL Server;Server=Win01.techsummit.local;Database=MLDB;Uid={0};Pwd={1};'.format("os.getenv("UID")", os.getenv("SQLPass"))
inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015'

cc = RxInSqlServer(
     connection_string = conn_str,
     num_tasks = 1,
     auto_cleanup = False
)
local_cc = RxLocalSeq()
previous_cc = rx_set_compute_context(cc)
rx_get_compute_context()

column_info = {
         "Year" : { "type" : "integer" },
         "Month" : { "type" : "integer" },
         "Day" : { "type" : "integer" },
         "RentalCount" : { "type" : "integer" },
         "WeekDay" : {
             "type" : "factor",
             "levels" : ["1", "2", "3", "4", "5", "6", "7"]
         },
         "Holiday" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         },
Exemple #4
0
##########################################################################################################################################

## Compute Contexts and Packages

##########################################################################################################################################

from pandas import DataFrame
from pandas import to_numeric

from revoscalepy import rx_get_var_names, RxSqlServerData, rx_summary, rx_set_compute_context, rx_data_step

from SQLConnection import *
from length_of_stay_utils import detect_table

# Set the Compute Context to Sql.
rx_set_compute_context(sql)


##########################################################################################################################################

## Input: Point to the SQL table with the cleaned raw data set

##########################################################################################################################################

table_name = None
missing = detect_table("LoS0", connection_string)
if missing is False:
    table_name = "LengthOfStay"
else:
    table_name = "LoS0"
Exemple #5
0
import pandas as pd
from revoscalepy import RxSqlServerData, rx_data_step, RxInSqlServer, RxLocalSeq, rx_set_compute_context

from lung_cancer.connection_settings import get_connection_string, TABLE_LABELS, TABLE_FEATURES, IMAGES_FOLDER
from lung_cancer.lung_cancer_utils import compute_features, gather_image_paths, average_pool

print("Starting routine")

# Connect to SQL Server
connection_string = get_connection_string()
sql = RxInSqlServer(connection_string=connection_string)
local = RxLocalSeq()
rx_set_compute_context(local)

# Get paths to filetable images
data = gather_image_paths(IMAGES_FOLDER, connection_string)
data = data.head(1000)
print("Number of images to featurize: {}".format(len(data)))

# Featureize images
print("Featurizing Images")
rx_set_compute_context(
    local)  # TODO: sql not working correctly. Change to sql later.
featurized_data = compute_features(data)
rx_set_compute_context(local)

# Average Pooling
print("Performing Average Pooling")
pooled_data = average_pool(featurized_data)

# Write features to table
def main(tablename, inputdf, overwrite=False):
    """Imports a data source into SQL Server table and, operating
       in-database, uses logistic regression to create a
       predictive model.

       A comparison to an out-of-database, in memory method
       is performed (TODO)
       
       Parameters
       ----------

       tablename : str
           The new or previosly create table name in database.

       inputdf : RxTextData object
           The data source

       overwrite : bool, optional (default=False)
           Whether or not to overwrite the table.  
           Set to True if this is a new table, otherwise False.
       """

    ####################################################################
    # Set the compute context to SQL SERVER
    ####################################################################

    # NB: don't need, but would be good to know what this actually does here
    # RxComputeContext(LOCAL, '9.1')

    compute_context = RxInSqlServer(connection_string=CONNECTION_STRING)
    # ,
    # num_tasks = 1,
    # auto_cleanup = False,
    # console_output=True
    # )

    rx_set_compute_context(compute_context)

    # if overwrite:

    ####################################################################
    # Create table in SQL server
    ####################################################################

    print("Creating tables...")
    data_source = RxSqlServerData(table=tablename,
                                  connection_string=CONNECTION_STRING)

    ####################################################################
    # Read data into the SQL server table that was just created
    ####################################################################

    print("Reading data into tables...")

    rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True)

    #####################################################################
    # Set up a query on table for train and test data (and ensure factor levels)
    #####################################################################
    print("Setting up query and datasources for train and test sets...")

    # Train data
    data_source_train = RxSqlServerData(
        sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id",
        connection_string=CONNECTION_STRING,
        verbose=True)

    # Import training data RxImport style from new query source
    X_y_train = rx_import(data_source_train)
    # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True)

    print("Test data...")
    # Test data (let's pick ~30% size of training dataset)
    data_source_test = RxSqlServerData(
        sql_query="SELECT * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id \
                    OFFSET 10000 ROWS \
                    FETCH FIRST 3000 ROW ONLY",
        connection_string=CONNECTION_STRING)

    # Import data RxImport style from new query source
    X_y_test = rx_import(data_source_test)
    # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True)

    #####################################################################
    # Run revoscalepy logistic regression on training data (in-database)
    #####################################################################

    print('Fitting a logistic regression model...')

    mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                   data=X_y_train,
                   compute_context=compute_context,
                   verbose=2)
    assert mod is not None
    assert mod._results is not None
    pprint(mod._results)

    #####################################################################
    # Summary on training data (in-database)
    #####################################################################

    # Note:  for "data" use data source and not the rximport'ed data

    print('\nSummary: \n')

    summary = rx_summary("Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                         data=data_source_train,
                         compute_context=compute_context,
                         cube=True,
                         verbose=2)

    #####################################################################
    # Predict on test data (in-database)
    #####################################################################

    print("\nPredict on test: \n")

    pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True)

    #####################################################################
    # Metrics for predition based on groundtruth (with scikit-learn tools)
    #####################################################################

    pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred']
    # For some reason the prediction results are not in a binary [0,1] format
    y_pred = binarize(pred_results, threshold=(min(pred_results) + \
                        max(pred_results))/2).reshape(-1, 1)
    y_true = pred._data['Annual_Income_Bucket_gt120k']

    print("Model prediction results:", y_pred)
    print("Actual values: ", y_true)

    print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))