def train_classifier(conn_str, feature_table, training_table, testing_table, score_table, model_table, DNNDimension): levels = get_label_levels(feature_table, conn_str) # get levels of Label colInfo = {"Label": {"type": "factor", "levels": levels}} train_query = "SELECT * FROM {} WHERE image IN (SELECT image FROM {})".format( feature_table, training_table) train_data = RxSqlServerData(sql_query=train_query, connection_string=conn_str, column_info=colInfo) test_query = "SELECT * FROM {} WHERE image IN (SELECT image FROM {})".format( feature_table, testing_table) test_data = RxSqlServerData(sql_query=test_query, connection_string=conn_str, column_info=colInfo) featureSet = ["feature." + str(i) for i in range(DNNDimension)] label = "Label" cols = featureSet + [label] classifier = rx_neural_network( "Label ~ feature", data=train_data, method="multiClass", optimizer=sgd_optimizer(learning_rate=0.011, l_rate_red_ratio=0.9, l_rate_red_freq=3, momentum=0.2), num_iterations=300, ml_transforms=[concat(cols={"feature": featureSet})]) test_score = rx_predict(classifier, test_data, extra_vars_to_write=["image", "Label"]) train_score = rx_predict(classifier, train_data, extra_vars_to_write=["image", "Label"]) testACC = float( len(test_score[test_score["Label"] == test_score["PredictedLabel"]])) / len(test_score) trainACC = float( len(train_score[train_score["Label"] == train_score["PredictedLabel"]])) / len(train_score) print("The train accuracy of the neural network model is {}".format( trainACC)) print( "The test accuracy of the neural network model is {}".format(testACC)) print("Saving the predictive results of all the images into SQL table...") outputScore = RxSqlServerData(connection_string=conn_str, table=score_table) rx_data_step(pd.concat([test_score, train_score]), outputScore, overwrite=True) print("Saving model into SQL table...") save_model(model_table, conn_str, classifier, MODEL_NAME)
def test_step3_check_output(): # Check that all RTS models have been created RTS_odbc = RxOdbcData(connection_string, table="RTS") forest_serialized = rx_read_object(RTS_odbc, key="RF", deserialize=False, decompress=None) boosted_serialized = rx_read_object(RTS_odbc, key="GBT", deserialize=False, decompress=None) fast_serialized = rx_read_object(RTS_odbc, key="FT", deserialize=False, decompress=None) NN_serialized = rx_read_object(RTS_odbc, key="NN", deserialize=False, decompress=None) assert forest_serialized.__str__()[:6] == "b'blob" assert boosted_serialized.__str__()[:6] == "b'blob" assert fast_serialized.__str__()[:6] == "b'blob" assert NN_serialized.__str__()[:6] == "b'blob" # Check that predictions have been made for for all models forest_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Forest_Prediction'", connection_string=connection_string) forest_prediction = rx_import(input_data=forest_prediction_sql) boosted_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Boosted_Prediction'", connection_string=connection_string) boosted_prediction = rx_import(input_data=boosted_prediction_sql) fast_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Fast_Prediction'", connection_string=connection_string) fast_prediction = rx_import(input_data=fast_prediction_sql) NN_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'NN_Prediction'", connection_string=connection_string) NN_prediction = rx_import(input_data=NN_prediction_sql) assert isinstance(forest_prediction, DataFrame) assert isinstance(boosted_prediction, DataFrame) assert isinstance(fast_prediction, DataFrame) assert isinstance(NN_prediction, DataFrame)
def detect_table(table_name, connection_string): from revoscalepy import RxSqlServerData, rx_import detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name), connection_string=connection_string) does_exist = rx_import(detect_sql) if does_exist.iloc[0,0] == 1: return True else: return False
def get_num_rows(table, connection_string): count_sql = RxSqlServerData( sql_query="SELECT COUNT(*) FROM {};".format(table), connection_string=connection_string) count = rx_import(count_sql) count = count.iloc[0, 0] return count
def rank_candidates(conn_str, query_table, model_table, results_table): print("Loading the classifier output for all the candidate images...") candidateImageVector = load_predicted_scores(conn_str, TABLE_SCORE) print("Loading query images...") queryImagesSql = "SELECT * FROM " + query_table queryImages = RxSqlServerData(connection_string=conn_str, sql_query=queryImagesSql, strings_as_factors=False) print("Embedding query images...") query_image_features = rx_data_step(input_data=queryImages, overwrite=True, transform_function=featurize_transform, report_progress=2) print("Loading classifier...") classifier = load_model(model_table, conn_str, MODEL_NAME) print("Getting classifier output for query images...") queryImageScores = rx_predict(classifier, query_image_features, extra_vars_to_write=["image", "Label"]) queryImageVector = map_images_to_predictedscores(queryImageScores) print("Calculating cosine similarity between each image pair...") rank_candiate_images(conn_str, queryImageVector, candidateImageVector, TOP_K_CANDIDATES, results_table)
def select_top_value_of_column(table_name, connection_string, column_name): query = "SELECT TOP(1) " + column_name + " FROM " + table_name print(query) query_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(query_sql) print(data) return data.iloc[0, 0]
def gather_image_paths(images_folder, connection_string): root = os.path.dirname(images_folder) query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [MriData] WHERE [is_directory] = 0' filetable_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(filetable_sql) data["image"] = data["image"].apply(lambda x: os.path.join(root, x[1:])) # TODO: assert to confirm paths exist data["patient_id"] = data["image"].map(lambda x: os.path.basename(os.path.dirname(x))) return data
def train_test_split(id, table, train_table, p, connection_string): from revoscalepy import rx_set_temp_compute_context, RxInSqlServer, rx_data_step, RxLocalSeq def transform(data, compute_context): return data data = RxSqlServerData( sql_query= "SELECT {} FROM {} WHERE ABS(CAST(CAST(HashBytes('MD5', CAST({} AS varchar(10))) AS VARBINARY(64)) AS BIGINT) % 100) < {};" .format(id, table, id, p), connection_string=connection_string) train_table_dest = RxSqlServerData(table='Train_Id', connection_string=connection_string) rx_data_step(input_data=data, output_file=train_table_dest, overwrite=True, transform_function=transform) print("Created {}".format(train_table))
def manage_prediction(patient_index): query = "SELECT TOP(1) * FROM {} AS t1 INNER JOIN {} AS t2 ON t1.patient_id = t2.patient_id WHERE t2.idx = {}".format( TABLE_PCA_FEATURES, TABLE_PATIENTS, patient_index) patient_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) predictions = ml_predict(model, data=patient_sql, extra_vars_to_write=["label", "patient_id"]) prob = float(predictions["Probability"].iloc[0]) * 100 return prob
def evaluate_model(conn_str, candidate_table, image_pairs_table): distMethods = ["L2", "cosine"] image_scores = load_predicted_scores(conn_str, TABLE_SCORE) imagePairQuery = "SELECT * FROM " + image_pairs_table imagePairData_sql = RxSqlServerData(connection_string=conn_str, sql_query=imagePairQuery, strings_as_factors=False) imagePairData = rx_import(imagePairData_sql) print("Calculating ranking metrics...") calculate_ranking_metrics(image_scores, imagePairData, distMethods)
def transform(dataset, context): from pandas import DataFrame table_name = None connection_string = "Driver=ODBC Driver 13 for SQL Server;Server=13.91.49.253;Database=Hospital;Uid=revotester;Pwd=T3sterPwd" def detect_table(table_name, connection_string): from revoscalepy import RxSqlServerData, rx_import detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name), connection_string=connection_string) does_exist = rx_import(detect_sql) if does_exist.iloc[0,0] == 1: return True else: return False missing = detect_table("LoS0", connection_string) if missing is False: table_name = "LengthOfStay" else: table_name = "LoS0" LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name, connection_string=connection_string) # Get the mean and standard deviation of those variables. col_list = rx_get_var_names(LengthOfStay_cleaned_sql) f = "+".join(col_list) summary = rx_summary(formula=f, data=LengthOfStay_cleaned_sql, by_term=True).summary_data_frame names = ["hematocrit", "neutrophils", "sodium", "glucose", "bloodureanitro", "creatinine", "bmi", "pulse", "respiration"] statistics = summary[summary["Name"].isin(names)] statistics = statistics[["Name", "Mean", "StdDev"]] # standardization transform function def standardize(data, context): for n, row in statistics.iterrows(): data[[row["Name"]]] = (data[[row["Name"]]] - row["Mean"])/row["StdDev"] return data # number_of_issues transform function def calculate_number_of_issues(data, context): data["number_of_issues"] = to_numeric(data["hemo"]) + to_numeric(data["dialysisrenalendstage"]) + to_numeric(data["asthma"])\ + to_numeric(data["irondef"]) + to_numeric(data["pneum"]) + to_numeric(data["substancedependence"])\ + to_numeric(data["psychologicaldisordermajor"]) + to_numeric(data["depress"])\ + to_numeric(data["psychother"]) + to_numeric(data["fibrosisandother"]) + to_numeric(data["malnutrition"]) return data data = DataFrame(dataset) data = standardize(data, context) data = calculate_number_of_issues(data, context) return data
def prepare_data(conn_str, feature_table, train_table, test_table, evaluation_table, numQueryImagePerCat, numNegImgsPerQueryImg, ratioTrainTest): image_data_sql = RxSqlServerData(connection_string=conn_str, sql_query="SELECT image, Label FROM " + feature_table, string_as_factors=False) image_data_df = rx_import(image_data_sql) print("Split image data into training and testing set...") get_training_testing_images(image_data_df, ratioTrainTest, train_table, test_table, conn_str) print("Getting ranking set from the testing image set...") prepare_evaluation_set(conn_str, feature_table, test_table, evaluation_table, numQueryImagePerCat, numNegImgsPerQueryImg)
def test_step2_check_output(): LoS_sql = RxSqlServerData( sql_query="SELECT TOP (5) * FROM [Hospital].[dbo].[LoS] ORDER BY eid", connection_string=connection_string) LoS = rx_import(input_data=LoS_sql) LoS[["number_of_issues"]] = LoS[["number_of_issues"]].apply(to_numeric) # bmi = [0.312740, -0.671356, -0.48006, -0.921639, 0.226158] # lengthofstay = [3.0, 7.0, 3.0, 1.0, 4.0] # number_of_issues = [0, 0, 0, 0, 2] # d = {"bmi": bmi, "lengthofstay": lengthofstay, "number_of_issues": number_of_issues} # df = DataFrame(d, index = [0,1,2,3,4]) assert (LoS.loc[4, 'number_of_issues'] == 2) assert (round(LoS.loc[1, 'bmi'], 6) == round(-0.67135640398687824, 6)) assert (LoS.loc[3, 'lengthofstay'] == 1.0)
def loaddata(self): #9.2.0 sqlQuery-> sql_query; connectionString-> connection_string #dataSource = RxSqlServerData(sqlQuery = "select * from dbo.trainingdata", verbose=True, reportProgress =True, # connectionString = self.__connectionstring) dataSource = RxSqlServerData( sql_query="select * from dbo.trainingdata", verbose=True, reportProgress=True, connection_string=self.__connectionstring) #9.2.0 connectionString->connection_string; autoCleanup-> auto_cleanup #self.__computeContext = RxInSqlServer(connectionString = self.__connectionstring, autoCleanup = True) self.__computeContext = RxInSqlServer( connection_string=self.__connectionstring, auto_cleanup=True) #9.2.0 rx_import_datasource -> rx_import #data = rx_import_datasource(dataSource) data = rx_import(dataSource) return data
# Set the Compute Context to local. rx_set_compute_context(local) ########################################################################################################################################## ## Upload the data set to to SQL ########################################################################################################################################## # Point to the input data set while specifying the classes. file_path = "..\\Data" LoS_text = RxTextData(file=os.path.join(file_path, "LengthOfStay.csv"), column_info=col_type_info) # Upload the table to SQL. LengthOfStay_sql = RxSqlServerData(table="LengthOfStay", connection_string=connection_string) rx_data_step(input_data=LoS_text, output_file=LengthOfStay_sql, overwrite=True) ########################################################################################################################################## ## Determine if LengthOfStay has missing values ########################################################################################################################################## # First, get the names and types of the variables to be treated. # For rxSummary to give correct info on characters, stringsAsFactors = True should be used. LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay", connection_string=connection_string, stringsAsFactors=True) #col = rxCreateColInfo(LengthOfStay_sql2) # Not yet available
########################################################################################################################################## ## Input: Point to the SQL table with the cleaned raw data set ########################################################################################################################################## table_name = None missing = detect_table("LoS0", connection_string) if missing is False: table_name = "LengthOfStay" else: table_name = "LoS0" LengthOfStay_cleaned_sql = RxSqlServerData(table=table_name, connection_string=connection_string) ########################################################################################################################################## ## Feature Engineering: ## 1- Standardization: hematocrit, neutrophils, sodium, glucose, bloodureanitro, creatinine, bmi, pulse, respiration. ## 2- Number of preidentified medical conditions: number_of_issues. ########################################################################################################################################## # Combine transform functions into one overarching transform def transform(dataset, context): from pandas import DataFrame table_name = None connection_string = "Driver=ODBC Driver 13 for SQL Server;Server=13.91.49.253;Database=Hospital;Uid=revotester;Pwd=T3sterPwd"
from lung_cancer.connection_settings import get_connection_string, TABLE_CLASSIFIERS, TABLE_FEATURES, TABLE_PATIENTS, FASTTREE_MODEL_NAME from lung_cancer.lung_cancer_utils import retrieve_model # Connect to SQL Server and set compute context connection_string = get_connection_string() sql = RxInSqlServer(connection_string=connection_string) local = RxLocalSeq() rx_set_compute_context(local) # Specify patient to make prediction for PatientIndex = 20 # Select patient data query = "SELECT TOP(1) * FROM {} AS t1 INNER JOIN {} AS t2 ON t1.patient_id = t2.patient_id WHERE t2.idx = {}".format( TABLE_FEATURES, TABLE_PATIENTS, PatientIndex) patient_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) # Get classifier classifier = retrieve_model(TABLE_CLASSIFIERS, connection_string, FASTTREE_MODEL_NAME) # Make Prediction on a single patient predictions = ml_predict(classifier, data=patient_sql, extra_vars_to_write=["label", "patient_id"]) print("The probability of cancer for patient {} with patient_id {} is {}%". format(PatientIndex, predictions["patient_id"].iloc[0], predictions["Probability"].iloc[0] * 100)) if predictions["label"].iloc[0] == 0: print("Ground Truth: This patient does not have cancer")
"Month" : { "type" : "integer" }, "Day" : { "type" : "integer" }, "RentalCount" : { "type" : "integer" }, "WeekDay" : { "type" : "factor", "levels" : ["1", "2", "3", "4", "5", "6", "7"] }, "Holiday" : { "type" : "factor", "levels" : ["1", "0"] }, "Snow" : { "type" : "factor", "levels" : ["1", "0"] } } data_source = RxSqlServerData(sql_query=inputsql, connection_string=conn_str, column_info=column_info) linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = data_source, computeContext = cc) trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True) with open(r'c:\temp\trained_model.pickle', mode='wb') as f: f.write(trained_model) print(rx_summary("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data_source)) cnxn = pyodbc.connect(conn_str) cursor=cnxn.cursor() cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", ("linear_model", "Python", trained_model)) cnxn.commit()
def submit_img(conn_str, table, img): inp = {'image': img, 'Label': 0} df = pd.DataFrame([inp], columns=inp.keys()) out = RxSqlServerData(table=table, connection_string=conn_str) rx_data_step(input_data=df, output_file=out, overwrite=True) print("{} uploaded to {}".format(img, table))
#from microsoftml import rx_predict as ml_predict from length_of_stay_utils import train_test_split, evaluate_model, create_formula, get_num_rows, write_rts_model from SQLConnection import * # Set the Compute Context to SQL. rx_set_compute_context(local) ########################################################################################################################################## ## Input: Point to the SQL table with the data set for modeling ########################################################################################################################################## LoS = RxSqlServerData(table="LoS", connection_string=connection_string, strings_as_factors=True) ########################################################################################################################################## ## Split the data set into a training and a testing set ########################################################################################################################################## # Randomly split the data into a training set and a testing set, with a splitting % p. # p % goes to the training set, and the rest goes to the testing set. Default is 70%. p = 70 ## Create the Train_Id table containing Lead_Id of training set. train_test_split("eid", "LoS", "Train_Id", p, connection_string)
## Export the data to SQL Server ########################################################################################################################################## # Set the compute context to local to export the data to SQL Server. rx_set_compute_context(local) # Point to the txt data sets. News_Train_text = RxTextData(file=os.path.join(file_path, "News_Train"), delimiter="\t") News_Test_text = RxTextData(file=os.path.join(file_path, "News_Test"), delimiter="\t") Label_Names_text = RxTextData(file=os.path.join(file_path, "Label_Names"), delimiter="\t") # Point to the SQL tables where they will be written. News_Train_sql = RxSqlServerData(table="News_Train", connection_string=connection_string) News_Test_sql = RxSqlServerData(table="News_Test", connection_string=connection_string) Label_Names_sql = RxSqlServerData(table="Label_Names", connection_string=connection_string) # Export them to SQL Server. rx_data_step(input_data=News_Train_text, output_file=News_Train_sql, overwrite=True) rx_data_step(input_data=News_Test_text, output_file=News_Test_sql, overwrite=True) rx_data_step(input_data=Label_Names_text, output_file=Label_Names_sql, overwrite=True)
#Connection string to connect to SQL Server named instance conn_str = 'Driver=SQL Server;Server=DESKTOP-NLR1KT5;Database=Apple_sql;Trusted_Connection=True;' #Define the columns we wish to import column_info = {"Date" : { "type" : "str" }, "Open" : { "type" : "str" }, "High" : { "type" : "str" }, "Low" : { "type" : "str" }, "Close" : { "type" : "str" }, "Adj_Close" : { "type" : "str" }, "Volume" : { "type" : "str" } } #Get the data from SQL Server Table data_source = RxSqlServerData(table="dbo.Apple_sql", connection_string=conn_str, column_info=column_info) computeContext = RxInSqlServer( connection_string = conn_str, num_tasks = 1, auto_cleanup = False ) RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) #testing dataset #df = pd.read_csv('H:\\JSDev\\Python_SQL_Server_Value_At_Risk\\AAPL.csv', parse_dates=["Date"], index_col="Date") #df.Close.resample('M').mean() df = pd.DataFrame(rx_import(input_data = data_source)) print("Data frame:", df) returns = df.pct_change()
# Train Test Split resample = True if resample: print("Performing Train Test Split") p = 80 train_test_split(TABLE_TRAIN_ID, TABLE_PATIENTS, p, connection_string=connection_string) # Point to the SQL table with the training data column_info = {"label": {"type": "numeric"}} query = "SELECT * FROM {} WHERE patient_id IN (SELECT patient_id FROM {})".format( TABLE_FEATURES, TABLE_TRAIN_ID) train_sql = RxSqlServerData(sql_query=query, connection_string=connection_string, column_info=column_info) # Create formula formula = create_formula(train_sql) print("Formula:", formula) # Fit a classification model rx_set_compute_context( local) # TODO: change this to sql when sql starts working classifier = rx_fast_trees(formula=formula, data=train_sql, num_trees=1000, method="binary", random_seed=5) rx_set_compute_context(local)
"WeekDay": { "type": "factor", "levels": ["1", "2", "3", "4", "5", "6", "7"] }, "Holiday": { "type": "factor", "levels": ["1", "0"] }, "Snow": { "type": "factor", "levels": ["1", "0"] } } #Get the data from SQL Server Table data_source = RxSqlServerData(table="dbo.rental_data", connection_string=conn_str, column_info=column_info) computeContext = RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe df = pd.DataFrame(rx_import(input_data=data_source)) print("Data frame:", df) # Get all the columns from the dataframe. columns = df.columns.tolist() # Filter the columns to remove ones we don't want to use in the training columns = [c for c in columns if c not in ["Year"]]
def select_entry_where_column_equals_value(table_name, connection_string, column_name, value): query = "SELECT TOP (1) * FROM {} WHERE {} = '{}'".format(table_name, column_name, value) print(query) query_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(query_sql) return data
def display_head(table_name, n_rows): table_sql = RxSqlServerData(sql_query="SELECT TOP({}}) * FROM {}}".format( n_rows, table_name), connection_string=connection_string) table = rx_import(table_sql) print(table)
def get_patients_id(table_name, connection_string): query = "SELECT DISTINCT patient_id FROM {}".format(table_name) sqldata = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(sqldata) return data["patient_id"].tolist()
from revoscalepy import rx_import from revoscalepy import rx_dforest from revoscalepy import rx_dtree from revoscalepy import RxSqlServerData from revoscalepy import rx_serialize_model import pyodbc # Connection string to connect to SQL Server named instance conn_str = 'Driver=SQL Server;Server=.;Database=SQLML;Trusted_Connection=True;' sql_query ='SELECT "Sepal.Length","Sepal.Width","Petal.Length","Petal.Width",Species ' \ 'FROM dbo.vw_iris_training' data_source = RxSqlServerData(sql_query=sql_query, connection_string=conn_str) iris_train_data = rx_import(input_data=data_source) iris_train_data["Species"] = iris_train_data["Species"].astype("category") model = rx_dforest( "Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width", data=iris_train_data) trained_rx_dforest_model = rx_serialize_model(model, realtime_scoring_only=True) #Save model conn = pyodbc.connect(conn_str) cursor = conn.cursor() cursor.execute( "DELETE FROM dbo.iris_models WHERE model_name ='rxDForest' AND model_language ='Python'" ) cursor.execute( "INSERT INTO dbo.iris_models(model,model_name,model_language) values (?,?, ?)", trained_rx_dforest_model, 'rxDForest', 'Python') conn.commit()
DECLARE @PythonScript NVARCHAR(MAX) =CONCAT(' import pandas as pd import os import glob from revoscalepy import RxSqlServerData, rx_data_step sqlConnString = "Driver=SQL Server;Server=Serv; ',@DBConnectionString,'" Filefolderepath = ImportFilePath+"*.xlsx" if ImportAll ==1 and CombineTarget==0: for FullFilePath in glob.glob(Filefolderepath): Filename = os.path.basename(FullFilePath).replace(".xlsx","") xl = pd.ExcelFile(FullFilePath) for sheetname in xl.sheet_names: Output = pd.read_excel(FullFilePath, sheetname=sheetname, na_filter=False).astype(str) if not Output.empty: sqlDS = RxSqlServerData(connection_string = sqlConnString,table = "".join(fl for fl in Filename if fl.isalnum())+"_"+"".join(sh for sh in sheetname if sh.isalnum())) rx_data_step(input_data = Output, output_file = sqlDS,overwrite = True) if ImportAll ==1 and CombineTarget==1: df2=pd.DataFrame() for FullFilePath in glob.glob(Filefolderepath): Filename = os.path.basename(FullFilePath).replace(".xlsx","") xl = pd.ExcelFile(FullFilePath) for sheetname in xl.sheet_names: Output = pd.read_excel(FullFilePath, sheetname=sheetname).columns.astype(str) Output = ",".join(list(Output)) df1 = pd.DataFrame([[Filename,sheetname,FullFilePath,Output]],columns=["Filename","sheetname","FullFilePath","Headers"]) df2=df2.append(df1,ignore_index=True) sqlDS = RxSqlServerData(connection_string = sqlConnString,table = "Tbl_PyImpExp1") rx_data_step(input_data = df2, output_file = sqlDS,overwrite = True) if ImportAll ==0: Filename =ImportFilePath+ExcelFileName+".xlsx"
def test_step1_check_output(): col_info = { "eid": { 'type': 'integer' }, "vdate": { 'type': 'character' }, "rcount": { 'type': 'character' }, "gender": { 'type': 'factor' }, "dialysisrenalendstage": { 'type': 'factor' }, "asthma": { 'type': 'factor' }, "irondef": { 'type': 'factor' }, "pneum": { 'type': 'factor' }, "substancedependence": { 'type': 'factor' }, "psychologicaldisordermajor": { 'type': 'factor' }, "depress": { 'type': 'factor' }, "psychother": { 'type': 'factor' }, "fibrosisandother": { 'type': 'factor' }, "malnutrition": { 'type': 'factor' }, "hemo": { 'type': 'factor' }, "hematocrit": { 'type': 'numeric' }, "neutrophils": { 'type': 'numeric' }, "sodium": { 'type': 'numeric' }, "glucose": { 'type': 'numeric' }, "bloodureanitro": { 'type': 'numeric' }, "creatinine": { 'type': 'numeric' }, "bmi": { 'type': 'numeric' }, "pulse": { 'type': 'numeric' }, "respiration": { 'type': 'numeric' }, "secondarydiagnosisnonicd9": { 'type': 'factor' }, "discharged": { 'type': 'character' }, "facid": { 'type': 'factor' }, "lengthofstay": { 'type': 'integer' } } # Point to the input data set while specifying the classes. file_path = "..\\Data" LoS_text = RxTextData(file=os.path.join(file_path, "LengthOfStay.csv"), column_info=col_info) table_text = rx_import(LoS_text) LengthOfStay_sql = RxSqlServerData( sql_query="SELECT * FROM [Hospital].[dbo].[LengthOfStay] ORDER BY eid", connection_string=connection_string, column_info=col_info) table_sql = rx_import(LengthOfStay_sql) assert table_text.equals(table_sql)