def test_step3_check_output(): # Check that all RTS models have been created RTS_odbc = RxOdbcData(connection_string, table="RTS") forest_serialized = rx_read_object(RTS_odbc, key="RF", deserialize=False, decompress=None) boosted_serialized = rx_read_object(RTS_odbc, key="GBT", deserialize=False, decompress=None) fast_serialized = rx_read_object(RTS_odbc, key="FT", deserialize=False, decompress=None) NN_serialized = rx_read_object(RTS_odbc, key="NN", deserialize=False, decompress=None) assert forest_serialized.__str__()[:6] == "b'blob" assert boosted_serialized.__str__()[:6] == "b'blob" assert fast_serialized.__str__()[:6] == "b'blob" assert NN_serialized.__str__()[:6] == "b'blob" # Check that predictions have been made for for all models forest_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Forest_Prediction'", connection_string=connection_string) forest_prediction = rx_import(input_data=forest_prediction_sql) boosted_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Boosted_Prediction'", connection_string=connection_string) boosted_prediction = rx_import(input_data=boosted_prediction_sql) fast_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Fast_Prediction'", connection_string=connection_string) fast_prediction = rx_import(input_data=fast_prediction_sql) NN_prediction_sql = RxSqlServerData( sql_query= "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'NN_Prediction'", connection_string=connection_string) NN_prediction = rx_import(input_data=NN_prediction_sql) assert isinstance(forest_prediction, DataFrame) assert isinstance(boosted_prediction, DataFrame) assert isinstance(fast_prediction, DataFrame) assert isinstance(NN_prediction, DataFrame)
def get_num_rows(table, connection_string): count_sql = RxSqlServerData( sql_query="SELECT COUNT(*) FROM {};".format(table), connection_string=connection_string) count = rx_import(count_sql) count = count.iloc[0, 0] return count
def gather_image_paths(images_folder, label_to_number_dict, connection_string, mode="train"): table = None if mode == "train": table = "TrainData" elif mode == "val": table = "ValData" elif mode == "test": table = "TestData" else: raise ValueError("'mode' not recognized") query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [{}] WHERE [is_directory] = 0 AND file_type = \'png\''.format( table) filetable_sql = revo.RxSqlServerData(sql_query=query, connection_string=connection_string) data = revo.rx_import(filetable_sql) data["image"] = data["image"].apply(lambda x: os.path.join( images_folder, x[1:])) # TODO: assert to confirm paths exist data["class"] = data["image"].map( lambda x: os.path.basename(os.path.dirname(x))) data["label"] = data["class"].map(lambda x: label_to_number_dict[x]) print(data.iloc[0, 0]) return data
def select_top_value_of_column(table_name, connection_string, column_name): query = "SELECT TOP(1) " + column_name + " FROM " + table_name print(query) query_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(query_sql) print(data) return data.iloc[0, 0]
def detect_table(table_name, connection_string): from revoscalepy import RxSqlServerData, rx_import detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name), connection_string=connection_string) does_exist = rx_import(detect_sql) if does_exist.iloc[0,0] == 1: return True else: return False
def gather_image_paths(images_folder, connection_string): root = os.path.dirname(images_folder) query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [MriData] WHERE [is_directory] = 0' filetable_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(filetable_sql) data["image"] = data["image"].apply(lambda x: os.path.join(root, x[1:])) # TODO: assert to confirm paths exist data["patient_id"] = data["image"].map(lambda x: os.path.basename(os.path.dirname(x))) return data
def DoInMemoryDivision_SQLPlayground(): start_time = time.time() #define the strings for connection conn_str = 'Driver=SQL Server;Server=SANDY;Database=S19SQLPlayground;Trusted_Connection=True;' customer_query = '''SELECT * FROM dbo.Customer;''' product_query = '''SELECT * FROM dbo.Product;''' purchase_query = '''SELECT * FROM dbo.Purchase;''' input_query = [customer_query, product_query, purchase_query] table_name = ['customer_data', 'product_data', 'purchase_data'] #try connection and write query to build the dataframe connected = True try: for i in range(len(input_query)): data_source = revoscale.RxSqlServerData(sql_query=input_query[i], connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) #globals()[name] create new variable globals()[table_name[i]] = pd.DataFrame( revoscale.rx_import(data_source)) except: print( "Cannot connect to DataBase and create the data table, existing..." ) connected = False if connected == False: exit(0) #iterate 3 tables, either add a new entry to the custExistence dictionary or update the count custExistence = {} for cu_index, cu_row in customer_data.iterrows(): for pr_index, pr_row in product_data.iterrows(): for pu_index, pu_row in purchase_data.iterrows(): if pu_row["CustomerId"] == cu_row["CustomerId"] and pu_row[ "ProductId"] == pr_row["ProductId"]: currentcustomer = pu_row["CustomerId"] if currentcustomer in custExistence: custExistence[currentcustomer] += 1 else: custExistence[currentcustomer] = 1 #print(custExistence) #Count all the product (count the row) nbProducts = product_data.shape[0] #print(nbProducts) #print out the customer who has bought all the products for customerid in custExistence: if custExistence[customerid] == nbProducts: print("CustomerId : " + str(customerid)) end_time = time.time() print("This took " + str(end_time - start_time) + " s")
def evaluate_model(conn_str, candidate_table, image_pairs_table): distMethods = ["L2", "cosine"] image_scores = load_predicted_scores(conn_str, TABLE_SCORE) imagePairQuery = "SELECT * FROM " + image_pairs_table imagePairData_sql = RxSqlServerData(connection_string=conn_str, sql_query=imagePairQuery, strings_as_factors=False) imagePairData = rx_import(imagePairData_sql) print("Calculating ranking metrics...") calculate_ranking_metrics(image_scores, imagePairData, distMethods)
def prepare_data(conn_str, feature_table, train_table, test_table, evaluation_table, numQueryImagePerCat, numNegImgsPerQueryImg, ratioTrainTest): image_data_sql = RxSqlServerData(connection_string=conn_str, sql_query="SELECT image, Label FROM " + feature_table, string_as_factors=False) image_data_df = rx_import(image_data_sql) print("Split image data into training and testing set...") get_training_testing_images(image_data_df, ratioTrainTest, train_table, test_table, conn_str) print("Getting ranking set from the testing image set...") prepare_evaluation_set(conn_str, feature_table, test_table, evaluation_table, numQueryImagePerCat, numNegImgsPerQueryImg)
def test_step2_check_output(): LoS_sql = RxSqlServerData( sql_query="SELECT TOP (5) * FROM [Hospital].[dbo].[LoS] ORDER BY eid", connection_string=connection_string) LoS = rx_import(input_data=LoS_sql) LoS[["number_of_issues"]] = LoS[["number_of_issues"]].apply(to_numeric) # bmi = [0.312740, -0.671356, -0.48006, -0.921639, 0.226158] # lengthofstay = [3.0, 7.0, 3.0, 1.0, 4.0] # number_of_issues = [0, 0, 0, 0, 2] # d = {"bmi": bmi, "lengthofstay": lengthofstay, "number_of_issues": number_of_issues} # df = DataFrame(d, index = [0,1,2,3,4]) assert (LoS.loc[4, 'number_of_issues'] == 2) assert (round(LoS.loc[1, 'bmi'], 6) == round(-0.67135640398687824, 6)) assert (LoS.loc[3, 'lengthofstay'] == 1.0)
def perform_clustering(): conn_str = 'Driver=SQL Server;Server=DESKTOP-NLR1KT5;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, COUNT(distinct(ss_ticket_number)) AS orders_count, COUNT(ss_item_sk) AS orders_items, SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, count(distinct(sr_ticket_number)) as returns_count, COUNT(sr_item_sk) as returns_items, SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' column_info = { "customer": { "type": "integer" }, "orderRatio": { "type": "integer" }, "itemsRatio": { "type": "integer" }, "frequency": { "type": "integer" } } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) customer_data = pd.DataFrame(revoscale.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) cdata = customer_data K = range(1, 20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) dist = (np.min(D, axis=1) for D in D_k) avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Average within-cluster sum of squares') plt.title('Elbow for KMeans clustering') plt.show() n_clusters = 4 means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111) columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] est = means_cluster.fit(customer_data[columns]) clusters = est.labels_ customer_data['cluster'] = clusters for c in range(n_clusters): cluster_members = customer_data[customer_data['cluster'] == c][:] print('Cluster{}(n={}):'.format(c, len(cluster_members))) print('-' * 17) print(customer_data.groupby(['cluster']).mean())
"WeekDay": { "type": "factor", "levels": ["1", "2", "3", "4", "5", "6", "7"] }, "Holiday": { "type": "factor", "levels": ["1", "0"] }, "Snow": { "type": "factor", "levels": ["1", "0"] } } #Get the data from SQL Server Table data_source = RxSqlServerData(table="dbo.rental_data", connection_string=conn_str, column_info=column_info) computeContext = RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe df = pd.DataFrame(rx_import(input_data=data_source)) print("Data frame:", df) # Get all the columns from the dataframe. columns = df.columns.tolist() # Filter the columns to remove ones we don't want to use in the training columns = [c for c in columns if c not in ["Year"]]
def get_rental_predictions(): #Connection string to connect to SQL Server named instance conn_str = 'Driver=SQL Server;Server=MYSQLSERVER;Database=TutorialDB;Trusted_Connection=True;' #Define the columns we wish to import column_info = { "Year" : { "type" : "integer" }, "Month" : { "type" : "integer" }, "Day" : { "type" : "integer" }, "RentalCount" : { "type" : "integer" }, "WeekDay" : { "type" : "factor", "levels" : ["1", "2", "3", "4", "5", "6", "7"] }, "Holiday" : { "type" : "factor", "levels" : ["1", "0"] }, "Snow" : { "type" : "factor", "levels" : ["1", "0"] } } #Get the data from SQL Server Table data_source = RxSqlServerData(table="dbo.rental_data", connection_string=conn_str, column_info=column_info) computeContext = RxInSqlServer( connection_string = conn_str, num_tasks = 1, auto_cleanup = False ) RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe df = pd.DataFrame(rx_import(input_data = data_source)) print("Data frame:", df) # Get all the columns from the dataframe. columns = df.columns.tolist() # Filter the columns to remove ones we don't want to use in the training columns = [c for c in columns if c not in ["Year"]] # Store the variable we'll be predicting on. target = "RentalCount" # Generate the training set. Set random_state to be able to replicate results. train = df.sample(frac=0.8, random_state=1) # Select anything not in the training set and put it in the testing set. test = df.loc[~df.index.isin(train.index)] # Print the shapes of both sets. print("Training set shape:", train.shape) print("Testing set shape:", test.shape) # Initialize the model class. lin_model = LinearRegression() # Fit the model to the training data. lin_model.fit(train[columns], train[target]) # Generate our predictions for the test set. lin_predictions = lin_model.predict(test[columns]) print("Predictions:", lin_predictions) # Compute error between our test predictions and the actual values. lin_mse = mean_squared_error(lin_predictions, test[target]) print("Computed error:", lin_mse)
# Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=MS;Database=ML;Trusted_Connection=True;' input_query = '''SELECT * FROM [OnlineSales].[RFM]''' # Define the columns we wish to import. column_info = { "customer": {"type": "integer"}, "orderRatio": {"type": "integer"}, "itemsRatio": {"type": "integer"}, "frequency": {"type": "integer"} } data_source = revoscalepy.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscalepy.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscalepy.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) print(customer_data.describe()) cdata = customer_data K = range(1, 20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) dist = (np.min(D, axis=1) for D in D_k) avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Average within-cluster sum of squares')
def DoInMemoryDivision(): # Connection string to connect to local DB conn_str = 'Driver=SQL Server;Server=DESKTOP-M98I69D;Database=S19SQLPlayground_Seb;Trusted_Connection=True;' # Queries to retrieve the Customers, Products, and purchases data respectively query_customers = '''SELECT * FROM dbo.Customer''' query_products = '''SELECT * FROM dbo.Product''' query_purchases = '''SELECT * FROM dbo.Purchase''' # Define the columns we wish to import for Customers, Products, and purchases data respectively col_cust_info = { "Customer": { "type": "integer" }, "FirstName": { "type": "str" }, "LastName": { "type": "str" } } col_prod_info = { "Product": { "type": "integer" }, "Description": { "type": "str" }, "UnitPrice": { "type": "integer" } } col_pur_info = { "Customer": { "type": "integer" }, "Product": { "type": "integer" }, "Quantity": { "type": "integer" } } # Start Timer to measure execution time start = timer() data_source_cust = revoscale.RxSqlServerData(sql_query=query_customers, column_Info=col_cust_info, connection_string=conn_str) data_source_prod = revoscale.RxSqlServerData(sql_query=query_products, column_Info=col_prod_info, connection_string=conn_str) data_source_pur = revoscale.RxSqlServerData(sql_query=query_purchases, column_Info=col_pur_info, connection_string=conn_str) # retrieve the data sources revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=3, auto_cleanup=False) # import data sources and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source_cust)) product_data = pd.DataFrame(revoscale.rx_import(data_source_prod)) purchase_data = pd.DataFrame(revoscale.rx_import(data_source_pur)) # Create a DataFrame to store customer purchase count cust_dict = {'CustId': [], 'PurchaseCount': []} cust_Purc_Count = pd.DataFrame(cust_dict) # Nb of products nb_products = len(product_data.index) # Division process # Iterate over all customers by df rows for cindex, crow in customer_data.iterrows(): # Iterate over all products by df rows for pindex, prow in product_data.iterrows(): # Iterate over all purchases by df rows for hindex, hrow in purchase_data.iterrows(): # Check whether the currentCustomer has bought the current product # print (crow['CustomerId'], hrow['CustomerId'], prow['ProductId'] ) if (crow['CustomerId'] == hrow['CustomerId']) and (prow['ProductId'] == hrow['ProductId']): # Either add a new DataRow in custPurchasesCount # if no existing for the current customer or update the count custExistence = 0 for index, cust_count in cust_Purc_Count.iterrows(): if cust_count['CustId'] == crow['CustomerId']: custExistence += 1 if (custExistence == 0): # First product for this customer td = {'CustId': crow['CustomerId'], 'PurchaseCount': 1} # Make a dataframe to be able to append to cust_Purc_Count tdf = pd.DataFrame(td, index=[0]) # Append to cust_Purc_Count # We must use the syntax below because for df append doesn't occur in place but in the object returned by the method cust_Purc_Count = cust_Purc_Count.append(tdf) else: # The product is already referenced cust_Purc_Count => Just increase the PurchaseCount cust_count['PurchaseCount'] += 1 for dindex, drow in cust_Purc_Count.iterrows(): # Check the customers that bought all the products if drow['PurchaseCount'] == nb_products: print('\nDivision CustomerId: ' + str(drow['CustId'])) end = timer() print("The In Memory Division in Python took (S) : ", end - start)
output_file=News_Test_sql, overwrite=True) rx_data_step(input_data=Label_Names_text, output_file=Label_Names_sql, overwrite=True) ########################################################################################################################################## ## Get the factor levels of the label ########################################################################################################################################## # Get the factor levels of the label. Factors_sql = RxSqlServerData( sql_query="SELECT DISTINCT Label FROM News_Train", connection_string=connection_string) levels_list = list(rx_import(Factors_sql)['Label']) # Write the factor name and levels into a dictionary. factor_info = { 'Label': { 'type': 'factor', 'levels': [str(s) for s in levels_list] } } ########################################################################################################################################## ## Create features on the fly for the training set and train the model ########################################################################################################################################## # Set the compute context to SQL for training. rx_set_compute_context(sql)
# 2.2.1 Specifying a missed value for strings import os import settings as st from revoscalepy import rx_import, rx_get_info, RxXdfData # Importing Delimited Text Data infile = os.path.join(st.SAMPLE_DATA_DIR, 'AirlineDemoSmall.csv') ## REM: Our RxOptions.set_option("OutDataPath", [RESULTS_LOCATION]) ## is not taken into account (bug?). Hence, we specify the full location outfile = os.path.join(st.RESULTS_LOCATION, 'airExample.xdf') # The 'missingValueString' option is not available in RevoscalePy # See: https://docs.microsoft.com/en-us/machine-learning-server/python-reference/revoscalepy/rxmissingvalues # A question has been asked on StackOverflow for missing string values: # https://stackoverflow.com/questions/51155965/default-value-for-strings-when-missing-in-revoscalepy-import airData = rx_import( input_data=infile, output_file=outfile, strings_as_factors=True, # missingValueString = "M" rows_per_read=200000, overwrite=True)
def get_patients_id(table_name, connection_string): query = "SELECT DISTINCT patient_id FROM {}".format(table_name) sqldata = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(sqldata) return data["patient_id"].tolist()
def test_step1_check_output(): col_info = { "eid": { 'type': 'integer' }, "vdate": { 'type': 'character' }, "rcount": { 'type': 'character' }, "gender": { 'type': 'factor' }, "dialysisrenalendstage": { 'type': 'factor' }, "asthma": { 'type': 'factor' }, "irondef": { 'type': 'factor' }, "pneum": { 'type': 'factor' }, "substancedependence": { 'type': 'factor' }, "psychologicaldisordermajor": { 'type': 'factor' }, "depress": { 'type': 'factor' }, "psychother": { 'type': 'factor' }, "fibrosisandother": { 'type': 'factor' }, "malnutrition": { 'type': 'factor' }, "hemo": { 'type': 'factor' }, "hematocrit": { 'type': 'numeric' }, "neutrophils": { 'type': 'numeric' }, "sodium": { 'type': 'numeric' }, "glucose": { 'type': 'numeric' }, "bloodureanitro": { 'type': 'numeric' }, "creatinine": { 'type': 'numeric' }, "bmi": { 'type': 'numeric' }, "pulse": { 'type': 'numeric' }, "respiration": { 'type': 'numeric' }, "secondarydiagnosisnonicd9": { 'type': 'factor' }, "discharged": { 'type': 'character' }, "facid": { 'type': 'factor' }, "lengthofstay": { 'type': 'integer' } } # Point to the input data set while specifying the classes. file_path = "..\\Data" LoS_text = RxTextData(file=os.path.join(file_path, "LengthOfStay.csv"), column_info=col_info) table_text = rx_import(LoS_text) LengthOfStay_sql = RxSqlServerData( sql_query="SELECT * FROM [Hospital].[dbo].[LengthOfStay] ORDER BY eid", connection_string=connection_string, column_info=col_info) table_sql = rx_import(LengthOfStay_sql) assert table_text.equals(table_sql)
# 2.2 Importing Delimited Text Data import os import settings as st from revoscalepy import rx_import, rx_get_info # Importing Delimited Text Data infile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt') claims_data_frame = rx_import(infile) # Displaying info print(rx_get_info(claims_data_frame, get_var_info=True)) # Saving text data into XDF file ## REM: Our RxOptions.set_option("OutDataPath", [RESULTS_LOCATION]) ## is not taken into account (bug?). Hence, we specify the full location outfile = os.path.join(st.RESULTS_LOCATION, 'claims.xdf') rx_import(input_data=infile, output_file=outfile, overwrite=True)
########################################################################################################################################## # Make Predictions, then import them into Python. forest_prediction_sql = RxSqlServerData(table="Forest_Prediction", string_as_factors=True, connection_string=connection_string) rx_predict(forest_model, data=LoS_Test, output_data=forest_prediction_sql, type="response", extra_vars_to_write=["lengthofstay", "eid"], overwrite=True) # Compute the performance metrics of the model. forest_prediction = rx_import(input_data=forest_prediction_sql) forest_metrics = evaluate_model( observed=forest_prediction['lengthofstay'], predicted=forest_prediction['lengthofstay_Pred'], model="RF") ########################################################################################################################################## ## Boosted Trees Scoring ########################################################################################################################################## # Make Predictions, then import them into Python. boosted_prediction_sql = RxSqlServerData(table="Boosted_Prediction", string_as_factors=True, connection_string=connection_string)
# 2.8 Append to an existing file import os import settings as st from revoscalepy import rx_import, rx_get_info # Input file firstFile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt') # Output file outfile = os.path.join(st.RESULTS_LOCATION, 'claimsclaims.xdf') # Creating 1st file rx_import(input_data=firstFile, output_file=outfile, strings_as_factors=True, overwrite=True) # Displaying info claims_data_frame = rx_import(outfile) print(rx_get_info(claims_data_frame, get_var_info=True)) # Appending the same file to the output rx_import(input_data=firstFile, output_file=outfile, strings_as_factors=True, append="rows") # Re-displaying info claims_data_frame = rx_import(outfile) print(rx_get_info(claims_data_frame, get_var_info=True))
import pandas as pd from revoscalepy import rx_import, rx_get_info, RxOptions, RxTextData, RxXdfData, rx_create_col_info, rx_data_step # For this example, we assume the airOT201201.csv file was downloaded into the current working directory dirpath = os.getcwd() # Other samples in RxOptions.get_option("sampleDataDir") # CSV File (Raw Data for one month) airlineDemoPathCSV = os.path.join(dirpath, "airOT201201.csv") # Create the CSV Data Source using RxTextData ds = RxTextData(airlineDemoPathCSV) # Import first 10 rows of the CSV file airlineDemo = rx_import(input_data=ds, number_rows=10) # Inspect data types using rx_get_info print(rx_get_info(airlineDemo, get_var_info=True)) # Inspect Unique Carrier (factor) print(rx_get_info(airlineDemo, vars_to_keep="UNIQUE_CARRIER", get_var_info=True)) # Import the first 10 rows again, and this time use strings_as_factors = True airlineDemo = rx_import(input_data=ds, strings_as_factors = True, number_rows=10) # Inspect data types print(rx_get_info(airlineDemo, get_var_info=True)) # Inspect Unique Carrier (factor) to see the difference using strings_as_factors = True print(rx_get_info(airlineDemo, vars_to_keep="UNIQUE_CARRIER", get_var_info=True))
def select_entry_where_column_equals_value(table_name, connection_string, column_name, value): query = "SELECT TOP (1) * FROM {} WHERE {} = '{}'".format(table_name, column_name, value) print(query) query_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) data = rx_import(query_sql) return data
# Defining transformation colInfoList = { 'age': { 'type': 'factor', 'levels': ['17-20', '21-24', '25-29', '30-34', '35-39', '40-49', '50-59', '60+'] }, 'car.age': { 'type': 'factor', 'levels': ['0-3', '4-7', '8_9', '10+'] }, 'type': { 'type': 'factor', 'levels': ['A', 'B', 'C', 'D'] }, } # Applying transformation rx_import(input_data=infile, output_file=outfile, column_info=colInfoList, overwrite=True) # Displaying info claims_data_frame = rx_import(outfile) print(rx_get_info(claims_data_frame, get_var_info=True)) print(claims_data_frame.head())
outfileTypeRelabeled = os.path.join(st.RESULTS_LOCATION, 'claimsTypeRelabeled.xdf') # Defining transformation colInfoList = { 'type': { 'type': 'factor', 'levels': ['A', 'B', 'C', 'D'], 'newLevels': ['Subcompact', 'Compact', 'Mid-size', 'Full-size'], 'description': 'Body Type' } } # Applying transformation rx_import(input_data=inFileAddVars, output_file=outfileTypeRelabeled, column_info=colInfoList, overwrite=True) ## REM: Contrary to what is mentioned in the documentation, rx_import ## does not return a RxXdfData object when an output file is specified ## but a <class 'bool'>. ## See: https://docs.microsoft.com/en-us/machine-learning-server/python-reference/revoscalepy/rx-import#returns # Displaying info claims_data_frame = rx_import(outfileTypeRelabeled) info = rx_get_info(claims_data_frame, get_var_info=True) print(info) print(claims_data_frame.head())
from revoscalepy import rx_import from revoscalepy import rx_dforest from revoscalepy import rx_dtree from revoscalepy import RxSqlServerData from revoscalepy import rx_serialize_model import pyodbc # Connection string to connect to SQL Server named instance conn_str = 'Driver=SQL Server;Server=.;Database=SQLML;Trusted_Connection=True;' sql_query ='SELECT "Sepal.Length","Sepal.Width","Petal.Length","Petal.Width",Species ' \ 'FROM dbo.vw_iris_training' data_source = RxSqlServerData(sql_query=sql_query, connection_string=conn_str) iris_train_data = rx_import(input_data=data_source) iris_train_data["Species"] = iris_train_data["Species"].astype("category") model = rx_dforest( "Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width", data=iris_train_data) trained_rx_dforest_model = rx_serialize_model(model, realtime_scoring_only=True) #Save model conn = pyodbc.connect(conn_str) cursor = conn.cursor() cursor.execute( "DELETE FROM dbo.iris_models WHERE model_name ='rxDForest' AND model_language ='Python'" ) cursor.execute( "INSERT INTO dbo.iris_models(model,model_name,model_language) values (?,?, ?)", trained_rx_dforest_model, 'rxDForest', 'Python') conn.commit()
def display_head(table_name, n_rows): table_sql = RxSqlServerData(sql_query="SELECT TOP({}}) * FROM {}}".format( n_rows, table_name), connection_string=connection_string) table = rx_import(table_sql) print(table)
def perform_clustering(): ################################################################################################ ## Connect to DB and select data ################################################################################################ # Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=DESKTOP-INU1SHI;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, -- return order ratio COUNT(distinct(ss_ticket_number)) AS orders_count, -- return ss_item_sk ratio COUNT(ss_item_sk) AS orders_items, -- return monetary amount ratio SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, -- return order ratio count(distinct(sr_ticket_number)) as returns_count, -- return ss_item_sk ratio COUNT(sr_item_sk) as returns_items, -- return monetary amount ratio SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' # Define the columns we wish to import. column_info = { "customer": {"type": "integer"}, "orderRatio": {"type": "integer"}, "itemsRatio": {"type": "integer"}, "frequency": {"type": "integer"} } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) ComputeContext = revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) ################################################################################################ ## Determine number of clusters using the Elbow method ################################################################################################ cdata = customer_data K = range(1,20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_K = (sci_distance.cdist(cdata,cent,'euclidean')for cent in centroids) dist = (np.min(D, axis=1)for D in D_K) avgWithinSS = [sum(d)/cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of Clusters') plt.ylabel('Average Within Clusters Sum of Squares') plt.title('Kmeans Clustering') plt.show()
import microsoftml as ml from microsoftml import FastForest, FastLinear, FastTrees, LogisticRegression, NeuralNetwork from microsoftml import rx_fast_forest, rx_fast_linear, rx_fast_trees, rx_logistic_regression, rx_neural_network from microsoftml import rx_ensemble, Ensemble, EnsembleControl import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split, RandomizedSearchCV from sklearn.metrics import confusion_matrix from sklearn.metrics import precision_recall_curve from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import roc_curve from sklearn.metrics import auc # Read Cleaned Ipo2609 XDF file IPO2609FE = rx_import("IPO2609FeatureEngineering.xdf") ipo_train, ipo_test = train_test_split(IPO2609FE, random_state=42) # Columns for training (X), remove label (y) features = IPO2609FE.columns.drop(["underpriced"]) # SciKit-Learn logistic regression model X = ipo_train[features.values.tolist()] y = np.ravel(ipo_train["underpriced"]) sk_log_reg = LogisticRegression() sk_log_model = sk_log_reg.fit(X, y) probList = [] probList = sk_log_model.predict_proba(ipo_test[features.values.tolist()])[:, 1] probArray = np.asarray(probList) fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray) aucResult = auc(fpr, tpr)
def perform_clustering(): ################################################################################################ ## Connect to DB and select data ################################################################################################ # Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=DESKTOP-CF2TNBV;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, -- return order ratio COUNT(distinct(ss_ticket_number)) AS orders_count, -- return ss_item_sk ratio COUNT(ss_item_sk) AS orders_items, -- return monetary amount ratio SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, -- return order ratio count(distinct(sr_ticket_number)) as returns_count, -- return ss_item_sk ratio COUNT(sr_item_sk) as returns_items, -- return monetary amount ratio SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' # Define the columns we wish to import. column_info = { "customer": { "type": "integer" }, "orderRatio": { "type": "integer" }, "itemsRatio": { "type": "integer" }, "frequency": { "type": "integer" } } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) ################################################################################################ ## Determine number of clusters using the Elbow method ################################################################################################ cdata = customer_data K = range(1, 20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) dist = (np.min(D, axis=1) for D in D_k) avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Average within-cluster sum of squares') plt.title('Elbow for KMeans clustering') plt.show() ################################################################################################ ## Perform clustering using Kmeans ################################################################################################ # It looks like k=4 is a good number to use based on the elbow graph. n_clusters = 4 means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111) columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] est = means_cluster.fit(customer_data[columns]) clusters = est.labels_ customer_data['cluster'] = clusters # Print some data about the clusters: # For each cluster, count the members. for c in range(n_clusters): cluster_members = customer_data[customer_data['cluster'] == c][:] print('Cluster{}(n={}):'.format(c, len(cluster_members))) print('-' * 17) # Print mean values per cluster. print(customer_data.groupby(['cluster']).mean())