def compute_features(data, output_table, connection_string): results_sql = revo.RxSqlServerData(table=output_table, connection_string=connection_string) revo.rx_data_step(input_data=data, output_file=results_sql, overwrite=True, transform_function=featurize_transform)
def train_classifier(conn_str, feature_table, training_table, testing_table, score_table, model_table, DNNDimension): levels = get_label_levels(feature_table, conn_str) # get levels of Label colInfo = {"Label": {"type": "factor", "levels": levels}} train_query = "SELECT * FROM {} WHERE image IN (SELECT image FROM {})".format( feature_table, training_table) train_data = RxSqlServerData(sql_query=train_query, connection_string=conn_str, column_info=colInfo) test_query = "SELECT * FROM {} WHERE image IN (SELECT image FROM {})".format( feature_table, testing_table) test_data = RxSqlServerData(sql_query=test_query, connection_string=conn_str, column_info=colInfo) featureSet = ["feature." + str(i) for i in range(DNNDimension)] label = "Label" cols = featureSet + [label] classifier = rx_neural_network( "Label ~ feature", data=train_data, method="multiClass", optimizer=sgd_optimizer(learning_rate=0.011, l_rate_red_ratio=0.9, l_rate_red_freq=3, momentum=0.2), num_iterations=300, ml_transforms=[concat(cols={"feature": featureSet})]) test_score = rx_predict(classifier, test_data, extra_vars_to_write=["image", "Label"]) train_score = rx_predict(classifier, train_data, extra_vars_to_write=["image", "Label"]) testACC = float( len(test_score[test_score["Label"] == test_score["PredictedLabel"]])) / len(test_score) trainACC = float( len(train_score[train_score["Label"] == train_score["PredictedLabel"]])) / len(train_score) print("The train accuracy of the neural network model is {}".format( trainACC)) print( "The test accuracy of the neural network model is {}".format(testACC)) print("Saving the predictive results of all the images into SQL table...") outputScore = RxSqlServerData(connection_string=conn_str, table=score_table) rx_data_step(pd.concat([test_score, train_score]), outputScore, overwrite=True) print("Saving model into SQL table...") save_model(model_table, conn_str, classifier, MODEL_NAME)
def rank_candidates(conn_str, query_table, model_table, results_table): print("Loading the classifier output for all the candidate images...") candidateImageVector = load_predicted_scores(conn_str, TABLE_SCORE) print("Loading query images...") queryImagesSql = "SELECT * FROM " + query_table queryImages = RxSqlServerData(connection_string=conn_str, sql_query=queryImagesSql, strings_as_factors=False) print("Embedding query images...") query_image_features = rx_data_step(input_data=queryImages, overwrite=True, transform_function=featurize_transform, report_progress=2) print("Loading classifier...") classifier = load_model(model_table, conn_str, MODEL_NAME) print("Getting classifier output for query images...") queryImageScores = rx_predict(classifier, query_image_features, extra_vars_to_write=["image", "Label"]) queryImageVector = map_images_to_predictedscores(queryImageScores) print("Calculating cosine similarity between each image pair...") rank_candiate_images(conn_str, queryImageVector, candidateImageVector, TOP_K_CANDIDATES, results_table)
def train_test_split(id, table, train_table, p, connection_string): from revoscalepy import rx_set_temp_compute_context, RxInSqlServer, rx_data_step, RxLocalSeq def transform(data, compute_context): return data data = RxSqlServerData( sql_query= "SELECT {} FROM {} WHERE ABS(CAST(CAST(HashBytes('MD5', CAST({} AS varchar(10))) AS VARBINARY(64)) AS BIGINT) % 100) < {};" .format(id, table, id, p), connection_string=connection_string) train_table_dest = RxSqlServerData(table='Train_Id', connection_string=connection_string) rx_data_step(input_data=data, output_file=train_table_dest, overwrite=True, transform_function=transform) print("Created {}".format(train_table))
# Import into dataframe with vars_to_keep airlineDemo = rx_import(input_data=ds, vars_to_keep=vars_to_keep) # Get Info on the latest import print(rx_get_info(airlineDemo, get_var_info=True)) # Check size in memory. Note, we have a significant reduction in memory by specifyin only the var we need sys.getsizeof(airlineDemo) # Create a transform function to add Gain in Minutes def transformFunc(data): ret = data ret["GainMinutes"] = data["DepDelayMinutes"] - data["ArrDelayMinutes"] return ret # Import the data again using the transformation function airlineDemo = rx_import(input_data=ds, vars_to_keep=vars_to_keep, transform_function=transformFunc) # Review the new calculated column, GainMinutes airlineDemo["GainMinutes"].describe() # Write data to Output File using rx_data_step outPath = os.path.join(dirpath, "AirOnTime2012_Processed.xdf") rx_data_step(input_data = airlineDemo, output_file = outPath, rows_per_read = 200000, overwrite = True, xdf_compression_level = 5) # 5 is faster and the file is not much bigger than 9
## Fast Trees Scoring ########################################################################################################################################## # Make Predictions, then write them to a table. LoS_Test_import = rx_import(input_data=LoS_Test) fast_prediction = ml_predict(fast_model, data=LoS_Test_import, extra_vars_to_write=["lengthofstay", "eid"], overwrite=True) fast_prediction_sql = RxSqlServerData(table="Fast_Prediction", strings_as_factors=True, connection_string=connection_string) rx_data_step(input_data=fast_prediction, output_file=fast_prediction_sql, overwrite=True) # Compute the performance metrics of the model. fast_metrics = evaluate_model(observed=fast_prediction['lengthofstay'], predicted=fast_prediction['Score'], model="FT") ########################################################################################################################################## ## Neural Networks Scoring ########################################################################################################################################## # Make Predictions, then write them to a table. NN_prediction = ml_predict(NN_model,
def main(tablename, inputdf, overwrite=False): """Imports a data source into SQL Server table and, operating in-database, uses logistic regression to create a predictive model. A comparison to an out-of-database, in memory method is performed (TODO) Parameters ---------- tablename : str The new or previosly create table name in database. inputdf : RxTextData object The data source overwrite : bool, optional (default=False) Whether or not to overwrite the table. Set to True if this is a new table, otherwise False. """ #################################################################### # Set the compute context to SQL SERVER #################################################################### # NB: don't need, but would be good to know what this actually does here # RxComputeContext(LOCAL, '9.1') compute_context = RxInSqlServer(connection_string=CONNECTION_STRING) # , # num_tasks = 1, # auto_cleanup = False, # console_output=True # ) rx_set_compute_context(compute_context) # if overwrite: #################################################################### # Create table in SQL server #################################################################### print("Creating tables...") data_source = RxSqlServerData(table=tablename, connection_string=CONNECTION_STRING) #################################################################### # Read data into the SQL server table that was just created #################################################################### print("Reading data into tables...") rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True) ##################################################################### # Set up a query on table for train and test data (and ensure factor levels) ##################################################################### print("Setting up query and datasources for train and test sets...") # Train data data_source_train = RxSqlServerData( sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id", connection_string=CONNECTION_STRING, verbose=True) # Import training data RxImport style from new query source X_y_train = rx_import(data_source_train) # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True) print("Test data...") # Test data (let's pick ~30% size of training dataset) data_source_test = RxSqlServerData( sql_query="SELECT * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id \ OFFSET 10000 ROWS \ FETCH FIRST 3000 ROW ONLY", connection_string=CONNECTION_STRING) # Import data RxImport style from new query source X_y_test = rx_import(data_source_test) # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True) ##################################################################### # Run revoscalepy logistic regression on training data (in-database) ##################################################################### print('Fitting a logistic regression model...') mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=X_y_train, compute_context=compute_context, verbose=2) assert mod is not None assert mod._results is not None pprint(mod._results) ##################################################################### # Summary on training data (in-database) ##################################################################### # Note: for "data" use data source and not the rximport'ed data print('\nSummary: \n') summary = rx_summary("Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=data_source_train, compute_context=compute_context, cube=True, verbose=2) ##################################################################### # Predict on test data (in-database) ##################################################################### print("\nPredict on test: \n") pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True) ##################################################################### # Metrics for predition based on groundtruth (with scikit-learn tools) ##################################################################### pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred'] # For some reason the prediction results are not in a binary [0,1] format y_pred = binarize(pred_results, threshold=(min(pred_results) + \ max(pred_results))/2).reshape(-1, 1) y_true = pred._data['Annual_Income_Bucket_gt120k'] print("Model prediction results:", y_pred) print("Actual values: ", y_true) print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))
alpha=opacity, color='g', label='NYSE') rects3 = plt.bar(index + bar_width * 2, df_under_nasdaq, bar_width, alpha=opacity, color='r', label='NASDAQ') plt.xlabel('Exchange') plt.ylabel('Return') plt.title('Underpriced Average Returns by Exchange') plt.xticks(index + bar_width, ('D1', 'R12', 'R24', 'R36')) plt.legend() plt.tight_layout() plt.show() # Write data to Output Files using rx_data_step # ipo2609 rx_data_step(input_data=df, output_file="IPO2609Cleaned.xdf", overwrite=True, xdf_compression_level=5) # Merged ipo1526 rx_data_step(input_data=df_merged, output_file="IPO1526Merged.xdf", overwrite=True, xdf_compression_level=5)
# 4.4 Converting .xdf Files to Text import os import settings as st from revoscalepy import rx_data_step, RxTextData, RxXdfData # Import / Export infile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt') outfile = os.path.join(st.RESULTS_LOCATION, 'claims.xdf') claimsCsv = RxTextData(infile) claimsXdf = RxXdfData(outfile) rx_data_step(input_data=claimsCsv, output_file=claimsXdf, overwrite=True) outfile2 = os.path.join(st.RESULTS_LOCATION, 'claimsMyDelimiter.txt') claimsTxt = RxTextData(outfile2, delimiter="\t") rx_data_step(input_data=claimsXdf, output_file=claimsTxt, overwrite=True) ## REM: It seems like the 'delimiter' parameter is not taken into account. ## A release 0.9.2 bug?
startTime = datetime.now() connection_string="Driver={ODBC Driver 17 for SQL Server};Server=,14330;Database=;Trusted_Connection=Yes" query="""select top 108000 a.transcriptComponentId, a.componenttext from dbo.FactEarningsText a join dbo.FactEarningsCall b on a.transcriptComponentId = b.transcriptComponentId where a.componenttext is not null and b.transcriptcomponentTypeId not in (3,1,7) order by rand(checksum(newid()))""" ds = RxSqlServerData(sql_query = query, connection_string = connection_string,rows_per_read=10000) df = rx_data_step(ds) dfToList = df['componenttext'].tolist() #conn = pyodbc.connect('driver={ODBC Driver 11 for SQL Server};' # 'server=;' # 'database=;' # 'Trusted_Connection=yes;') #sql = "select top 1000 * from dbo.FactEarningsText a" #data = pd.read_sql(sql,conn) #dfToList = data['componenttext'].tolist() class CustomVectorizer(CountVectorizer):
########################################################################################################################################## ## Upload the data set to to SQL ########################################################################################################################################## # Point to the input data set while specifying the classes. file_path = "..\\Data" LoS_text = RxTextData(file=os.path.join(file_path, "LengthOfStay.csv"), column_info=col_type_info) # Upload the table to SQL. LengthOfStay_sql = RxSqlServerData(table="LengthOfStay", connection_string=connection_string) rx_data_step(input_data=LoS_text, output_file=LengthOfStay_sql, overwrite=True) ########################################################################################################################################## ## Determine if LengthOfStay has missing values ########################################################################################################################################## # First, get the names and types of the variables to be treated. # For rxSummary to give correct info on characters, stringsAsFactors = True should be used. LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay", connection_string=connection_string, stringsAsFactors=True) #col = rxCreateColInfo(LengthOfStay_sql2) # Not yet available colnames = rx_get_var_names(LengthOfStay_sql2)
def submit_img(conn_str, table, img): inp = {'image': img, 'Label': 0} df = pd.DataFrame([inp], columns=inp.keys()) out = RxSqlServerData(table=table, connection_string=conn_str) rx_data_step(input_data=df, output_file=out, overwrite=True) print("{} uploaded to {}".format(img, table))
# 3.6 Working with an Xdf Data Source import os import settings as st from revoscalepy import rx_data_step ## REM: The rxOpen and rxReadNext functions are not implemented in RevoscalePy ## However, a workaround can be implemented: # Importing data by chunks infile = os.path.join(st.SAMPLE_DATA_DIR,'claims.xdf') readLines = 0 while True: myClaims = rx_data_step(input_data = infile, start_row = readLines, number_rows_read = 8, report_progress = 0) # More lines to process? if myClaims is None: # Nope break readLines = readLines + myClaims.shape[0] # Process read lines... print("Number of read lines: " + str(readLines))
data = data.head(1000) print("Number of images to featurize: {}".format(len(data))) # Featureize images print("Featurizing Images") rx_set_compute_context( local) # TODO: sql not working correctly. Change to sql later. featurized_data = compute_features(data) rx_set_compute_context(local) # Average Pooling print("Performing Average Pooling") pooled_data = average_pool(featurized_data) # Write features to table tempfeats_sql = RxSqlServerData(table=TABLE_FEATURES, connection_string=connection_string) rx_data_step(input_data=pooled_data, output_file=tempfeats_sql, overwrite=True) # Perform join with labels print("Joining features with labels") query = """SELECT {}.*, {}.label FROM {} INNER JOIN {} ON {}.patient_id = {}.patient_id; """.format(TABLE_FEATURES, TABLE_LABELS, TABLE_FEATURES, TABLE_LABELS, TABLE_FEATURES, TABLE_LABELS) tempfeats_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) features_sql = RxSqlServerData(table=TABLE_FEATURES, connection_string=connection_string) rx_data_step(tempfeats_sql, features_sql, overwrite=True) print("Routine finished")
cor_X = uni_X.copy() # Drop columns with insignificant correlations so plot is easier to read cor_X = cor_X.drop([ 'uw1', 'uw2', 'sectype', 'of', 'ex_code', 't_code', 'audit', 'risks', 'yr', 'sic', 'lockup', 'expenses' ], axis=1) corrmat = cor_X.corr() top_corr_features = corrmat.index plt.figure(figsize=(20, 20)) # Plot heat map g = sns.heatmap(cor_X[top_corr_features].corr(), annot=True, cmap="RdYlGn") plt.show() # Specify final vars to keep for training a model # Start with non-indicator values with most importance, avoiding highly correlated features # Note: model_vars is a set to avoid dups model_vars = { 'book', 'd', 'audit', 'in', 'lockup', 'op', 'max', 'sel', 'uses', 'of', 'sa', 'sp' } # Add top 50 RFE features model_vars.update(rfe_features) model_df = rfe_X[list(model_vars)] model_df = model_df.join(uni_y) rx_data_step(input_data=model_df, output_file="IPO2609FeatureEngineering.xdf", overwrite=True, xdf_compression_level=5)
# Make Predictions, then write them to a table. #NN_prediction = ml_predict(NN_model, data=LoS_Test_import, extra_vars_to_write=["lengthofstay", "eid"], overwrite=True) #NN_prediction_sql = RxSqlServerData(table="NN_Prediction", string_as_factors = True, connection_string=connection_string) #rx_data_step(input_data=NN_prediction, output_file=NN_prediction_sql, overwrite=True) # Compute the performance metrics of the model. #NN_metrics = evaluate_model(observed=NN_prediction['lengthofstay'], predicted=NN_prediction['Score'], model="NN") ########################################################################################################################################## ## Write to Master Predictions Table (LoS_Predictions) ########################################################################################################################################## print("Writing LoS_Predictions") query = """SELECT LengthOfStay.eid, CONVERT(DATE, LengthOfStay.vdate, 110) as vdate, LengthOfStay.rcount, LengthOfStay.gender, LengthOfStay.dialysisrenalendstage, LengthOfStay.asthma, LengthOfStay.irondef, LengthOfStay.pneum, LengthOfStay.substancedependence, LengthOfStay.psychologicaldisordermajor, LengthOfStay.depress, LengthOfStay.psychother, LengthOfStay.fibrosisandother, LengthOfStay.malnutrition, LengthOfStay.hemo, LengthOfStay.hematocrit, LengthOfStay.neutrophils, LengthOfStay.sodium, LengthOfStay.glucose, LengthOfStay.bloodureanitro, LengthOfStay.creatinine, LengthOfStay.bmi, LengthOfStay.pulse, LengthOfStay.respiration, number_of_issues, LengthOfStay.secondarydiagnosisnonicd9, CONVERT(DATE, LengthOfStay.discharged, 110) as discharged, LengthOfStay.facid, LoS.lengthofstay, CONVERT(DATE, CONVERT(DATETIME, LengthOfStay.vdate, 110) + CAST(ROUND(Score, 0) as int), 110) as discharged_Pred, CAST(ROUND(Score, 0) as int) as lengthofstay_Pred FROM LoS JOIN Forest_Prediction ON LoS.eid = Forest_Prediction.eid JOIN LengthOfStay ON LoS.eid = LengthOfStay.eid;""" results_sql = RxSqlServerData(sql_query=query, connection_string=connection_string) los_pred_sql = RxSqlServerData(table="LoS_Predictions", connection_string=connection_string) rx_data_step(results_sql, los_pred_sql, overwrite=True)
data[[row["Name"]]] = (data[[row["Name"]]] - row["Mean"])/row["StdDev"] return data # number_of_issues transform function def calculate_number_of_issues(data, context): data["number_of_issues"] = to_numeric(data["hemo"]) + to_numeric(data["dialysisrenalendstage"]) + to_numeric(data["asthma"])\ + to_numeric(data["irondef"]) + to_numeric(data["pneum"]) + to_numeric(data["substancedependence"])\ + to_numeric(data["psychologicaldisordermajor"]) + to_numeric(data["depress"])\ + to_numeric(data["psychother"]) + to_numeric(data["fibrosisandother"]) + to_numeric(data["malnutrition"]) return data data = DataFrame(dataset) data = standardize(data, context) data = calculate_number_of_issues(data, context) return data # We drop the LoS view in case the SQL Stored Procedure was executed in the same database before. #drop_view("LoS", connection_string) # Standardize the cleaned table by wrapping it up in rx_data_step. Output is written to LoS. LengthOfStay_cleaned_sql = RxSqlServerData(sql_query="SELECT * FROM [{}]".format(table_name), connection_string=connection_string) LoS_sql = RxSqlServerData(table="LoS", connection_string=connection_string) rx_data_step(input_data=LengthOfStay_cleaned_sql, output_file=LoS_sql, overwrite=True, transform_function=transform) ## is it needed if we force col_info? # alter_column("LoS", "number_of_issues", "varchar(2)", connection_string) # alter_column("LoS", "lengthofstay", "float", connection_string) # int -> float for regression
# 4.3 Using the Data Step to Create an .xdf File from a Data Frame import os import settings as st import pandas as pd import numpy as np from revoscalepy import rx_data_step, rx_get_info np.random.seed(39) x1 = np.random.normal(size = 10000) x2 = np.random.uniform(size = 10000) x3 = x1 + x2 s = np.stack((x1,x2,x3)) s = np.transpose(s) myData = pd.DataFrame(s, columns = ['x1','x2','x3']).query("x2 > .1") # Export files directory outFile = os.path.join(st.RESULTS_LOCATION, 'testFile.xdf') rx_data_step(input_data = myData, output_file = outFile, rows_per_read = 5000, overwrite = True) print(rx_get_info(outFile))
News_Test_text = RxTextData(file=os.path.join(file_path, "News_Test"), delimiter="\t") Label_Names_text = RxTextData(file=os.path.join(file_path, "Label_Names"), delimiter="\t") # Point to the SQL tables where they will be written. News_Train_sql = RxSqlServerData(table="News_Train", connection_string=connection_string) News_Test_sql = RxSqlServerData(table="News_Test", connection_string=connection_string) Label_Names_sql = RxSqlServerData(table="Label_Names", connection_string=connection_string) # Export them to SQL Server. rx_data_step(input_data=News_Train_text, output_file=News_Train_sql, overwrite=True) rx_data_step(input_data=News_Test_text, output_file=News_Test_sql, overwrite=True) rx_data_step(input_data=Label_Names_text, output_file=Label_Names_sql, overwrite=True) ########################################################################################################################################## ## Get the factor levels of the label ########################################################################################################################################## # Get the factor levels of the label. Factors_sql = RxSqlServerData( sql_query="SELECT DISTINCT Label FROM News_Train",
data=train_sql, num_trees=1000, method="binary", random_seed=5) rx_set_compute_context(local) # Serialize model and insert into table insert_model(TABLE_CLASSIFIERS, connection_string, classifier, FASTTREE_MODEL_NAME) # Point to the SQL table with the testing data query = "SELECT * FROM {} WHERE patient_id NOT IN (SELECT patient_id FROM {})".format( TABLE_FEATURES, TABLE_TRAIN_ID) test_sql = RxSqlServerData(sql_query=query, connection_string=connection_string, column_info=column_info) # Make predictions on the test data predictions = ml_predict(classifier, data=test_sql, extra_vars_to_write=["label", "patient_id"]) predictions_sql = RxSqlServerData(table=TABLE_PREDICTIONS, connection_string=connection_string) rx_data_step(predictions, predictions_sql, overwrite=True) print(predictions.head()) # Evaluate model using ROC roc(predictions["label"], predictions["Probability"]) print("Routine finished")
# 2.12 Reading Data from an .xdf File into a Data Frame (row selection) import os import settings as st from revoscalepy import rx_data_step, rx_get_info ## Unfortunately, the 'row_selection' parameter of rx_data_step is not supported. ## See https://docs.microsoft.com/en-us/machine-learning-server/python-reference/revoscalepy/rx-data-step#row_selection ## However, it is possible filter rows after data is loaded # Import file infile = os.path.join(st.SAMPLE_DATA_DIR, 'CensusWorkers.xdf') # Loading data myCensusDF = rx_data_step(input_data=infile, vars_to_keep=['age', 'perwt', 'sex', 'state']) print(myCensusDF.head()) # state == "Washington" & age > 40 mySelection1 = myCensusDF.query("state=='Washington' and age>40") print(mySelection1.head()) # Every 10th row mySelection2 = myCensusDF.iloc[::10, :] print(mySelection2.head())
def compute_features(data): featurized_data = rx_data_step(input_data=data, overwrite=True, transform_function=featurize_transform) featurized_data.columns = ["image", "patient_id"] + ["f" + str(i) for i in range(len(featurized_data.columns)-2)] return featurized_data
# Calculating the entropy # Function that calculates the entropy def f_entropy(indata): indataprob = indata.value_counts() / len(indata) entropy = sc.stats.entropy(indataprob, base=2) return entropy # Use the function on variables f_entropy( TM.NumberCarsOwned), np.log2(5), f_entropy(TM.NumberCarsOwned) / np.log2(5) f_entropy(TM.BikeBuyer), np.log2(2), f_entropy(TM.BikeBuyer) / np.log2(2) # rx_data_step filtering variables from revoscalepy import rx_data_step TM4 = rx_data_step(input_data=TM.iloc[0:3, ], vars_to_keep={'CustomerKey', 'Age', 'AgeCUB'}) TM4 ########################### # More code as a bonus:-) # ########################### # Projections TM1 = TM[["CustomerKey", "MaritalStatus"]] TM2 = TM[["CustomerKey", "Gender"]] # Positional access TM1.iloc[0:3, 0:2] TM2.iloc[0:3, 0:2] # Filter and projection
DECLARE @PythonScript NVARCHAR(MAX) =CONCAT(' import pandas as pd import os import glob from revoscalepy import RxSqlServerData, rx_data_step sqlConnString = "Driver=SQL Server;Server=Serv; ',@DBConnectionString,'" Filefolderepath = ImportFilePath+"*.xlsx" if ImportAll ==1 and CombineTarget==0: for FullFilePath in glob.glob(Filefolderepath): Filename = os.path.basename(FullFilePath).replace(".xlsx","") xl = pd.ExcelFile(FullFilePath) for sheetname in xl.sheet_names: Output = pd.read_excel(FullFilePath, sheetname=sheetname, na_filter=False).astype(str) if not Output.empty: sqlDS = RxSqlServerData(connection_string = sqlConnString,table = "".join(fl for fl in Filename if fl.isalnum())+"_"+"".join(sh for sh in sheetname if sh.isalnum())) rx_data_step(input_data = Output, output_file = sqlDS,overwrite = True) if ImportAll ==1 and CombineTarget==1: df2=pd.DataFrame() for FullFilePath in glob.glob(Filefolderepath): Filename = os.path.basename(FullFilePath).replace(".xlsx","") xl = pd.ExcelFile(FullFilePath) for sheetname in xl.sheet_names: Output = pd.read_excel(FullFilePath, sheetname=sheetname).columns.astype(str) Output = ",".join(list(Output)) df1 = pd.DataFrame([[Filename,sheetname,FullFilePath,Output]],columns=["Filename","sheetname","FullFilePath","Headers"]) df2=df2.append(df1,ignore_index=True) sqlDS = RxSqlServerData(connection_string = sqlConnString,table = "Tbl_PyImpExp1") rx_data_step(input_data = df2, output_file = sqlDS,overwrite = True) if ImportAll ==0: Filename =ImportFilePath+ExcelFileName+".xlsx" exists = os.path.isfile(Filename)
def dummy_vars(df, context): # Dummy vars df_dummy = pd.get_dummies( df, columns=['Annual_Income_Bucket', 'Highest_Education']) # Rename columns (remove spaces etc.) new_cols = [ x.replace(' ', '_').replace('>', 'gt').replace('<', 'lt') for x in df_dummy.columns ] df_dummy.columns = new_cols return df_dummy output_file = os.path.join(file_path, "Lead_Demography.xdf") # Set a csv as a data source data_source = RxTextData(file=inputfile, delimiter=',') # Dummy variables and output to named xdf rx_data_step(input_data=data_source, output_file=output_file, overwrite=True, transform_function=dummy_vars) rx_get_info(data=output_file, verbose=2) #################################################################### # Call main function to work in SQL compute context #################################################################### main(tablename="Lead_Demography_Tbl", inputdf=output_file, overwrite=True)