コード例 #1
0
def compute_features(data, output_table, connection_string):
    results_sql = revo.RxSqlServerData(table=output_table,
                                       connection_string=connection_string)
    revo.rx_data_step(input_data=data,
                      output_file=results_sql,
                      overwrite=True,
                      transform_function=featurize_transform)
コード例 #2
0
def train_classifier(conn_str, feature_table, training_table, testing_table,
                     score_table, model_table, DNNDimension):
    levels = get_label_levels(feature_table, conn_str)  # get levels of Label
    colInfo = {"Label": {"type": "factor", "levels": levels}}
    train_query = "SELECT * FROM {} WHERE image IN (SELECT image FROM {})".format(
        feature_table, training_table)
    train_data = RxSqlServerData(sql_query=train_query,
                                 connection_string=conn_str,
                                 column_info=colInfo)

    test_query = "SELECT * FROM {} WHERE image IN (SELECT image FROM {})".format(
        feature_table, testing_table)
    test_data = RxSqlServerData(sql_query=test_query,
                                connection_string=conn_str,
                                column_info=colInfo)

    featureSet = ["feature." + str(i) for i in range(DNNDimension)]
    label = "Label"
    cols = featureSet + [label]
    classifier = rx_neural_network(
        "Label ~ feature",
        data=train_data,
        method="multiClass",
        optimizer=sgd_optimizer(learning_rate=0.011,
                                l_rate_red_ratio=0.9,
                                l_rate_red_freq=3,
                                momentum=0.2),
        num_iterations=300,
        ml_transforms=[concat(cols={"feature": featureSet})])

    test_score = rx_predict(classifier,
                            test_data,
                            extra_vars_to_write=["image", "Label"])
    train_score = rx_predict(classifier,
                             train_data,
                             extra_vars_to_write=["image", "Label"])
    testACC = float(
        len(test_score[test_score["Label"] ==
                       test_score["PredictedLabel"]])) / len(test_score)
    trainACC = float(
        len(train_score[train_score["Label"] ==
                        train_score["PredictedLabel"]])) / len(train_score)
    print("The train accuracy of the neural network model is {}".format(
        trainACC))
    print(
        "The test accuracy of the neural network model is {}".format(testACC))

    print("Saving the predictive results of all the images into SQL table...")
    outputScore = RxSqlServerData(connection_string=conn_str,
                                  table=score_table)
    rx_data_step(pd.concat([test_score, train_score]),
                 outputScore,
                 overwrite=True)
    print("Saving model into SQL table...")
    save_model(model_table, conn_str, classifier, MODEL_NAME)
コード例 #3
0
def rank_candidates(conn_str, query_table, model_table, results_table):
    print("Loading the classifier output for all the candidate images...")
    candidateImageVector = load_predicted_scores(conn_str, TABLE_SCORE)

    print("Loading query images...")
    queryImagesSql = "SELECT * FROM " + query_table
    queryImages = RxSqlServerData(connection_string=conn_str,
                                  sql_query=queryImagesSql,
                                  strings_as_factors=False)

    print("Embedding query images...")
    query_image_features = rx_data_step(input_data=queryImages,
                                        overwrite=True,
                                        transform_function=featurize_transform,
                                        report_progress=2)

    print("Loading classifier...")
    classifier = load_model(model_table, conn_str, MODEL_NAME)

    print("Getting classifier output for query images...")
    queryImageScores = rx_predict(classifier,
                                  query_image_features,
                                  extra_vars_to_write=["image", "Label"])
    queryImageVector = map_images_to_predictedscores(queryImageScores)

    print("Calculating cosine similarity between each image pair...")
    rank_candiate_images(conn_str, queryImageVector, candidateImageVector,
                         TOP_K_CANDIDATES, results_table)
コード例 #4
0
def train_test_split(id, table, train_table, p, connection_string):
    from revoscalepy import rx_set_temp_compute_context, RxInSqlServer, rx_data_step, RxLocalSeq

    def transform(data, compute_context):
        return data

    data = RxSqlServerData(
        sql_query=
        "SELECT {} FROM {} WHERE ABS(CAST(CAST(HashBytes('MD5', CAST({} AS varchar(10))) AS VARBINARY(64)) AS BIGINT) % 100) < {};"
        .format(id, table, id, p),
        connection_string=connection_string)
    train_table_dest = RxSqlServerData(table='Train_Id',
                                       connection_string=connection_string)
    rx_data_step(input_data=data,
                 output_file=train_table_dest,
                 overwrite=True,
                 transform_function=transform)
    print("Created {}".format(train_table))
コード例 #5
0
# Import into dataframe with vars_to_keep
airlineDemo = rx_import(input_data=ds, vars_to_keep=vars_to_keep)

# Get Info on the latest import
print(rx_get_info(airlineDemo, get_var_info=True))

# Check size in memory.  Note, we have a significant reduction in memory by specifyin only the var we need
sys.getsizeof(airlineDemo)

# Create a transform function to add Gain in Minutes
def transformFunc(data):
    ret = data
    ret["GainMinutes"] = data["DepDelayMinutes"] - data["ArrDelayMinutes"]
    return ret


# Import the data again using the transformation function
airlineDemo = rx_import(input_data=ds, vars_to_keep=vars_to_keep,
                         transform_function=transformFunc)
# Review the new calculated column, GainMinutes
airlineDemo["GainMinutes"].describe()

# Write data to Output File using rx_data_step
outPath = os.path.join(dirpath, "AirOnTime2012_Processed.xdf")
rx_data_step(input_data = airlineDemo,
    output_file = outPath,
    rows_per_read = 200000, 
    overwrite = True,
    xdf_compression_level = 5) # 5 is faster and the file is not much bigger than 9
コード例 #6
0
##	Fast Trees Scoring

##########################################################################################################################################

# Make Predictions, then write them to a table.
LoS_Test_import = rx_import(input_data=LoS_Test)
fast_prediction = ml_predict(fast_model,
                             data=LoS_Test_import,
                             extra_vars_to_write=["lengthofstay", "eid"],
                             overwrite=True)
fast_prediction_sql = RxSqlServerData(table="Fast_Prediction",
                                      strings_as_factors=True,
                                      connection_string=connection_string)
rx_data_step(input_data=fast_prediction,
             output_file=fast_prediction_sql,
             overwrite=True)

# Compute the performance metrics of the model.
fast_metrics = evaluate_model(observed=fast_prediction['lengthofstay'],
                              predicted=fast_prediction['Score'],
                              model="FT")

##########################################################################################################################################

##	Neural Networks Scoring

##########################################################################################################################################

# Make Predictions, then write them to a table.
NN_prediction = ml_predict(NN_model,
def main(tablename, inputdf, overwrite=False):
    """Imports a data source into SQL Server table and, operating
       in-database, uses logistic regression to create a
       predictive model.

       A comparison to an out-of-database, in memory method
       is performed (TODO)
       
       Parameters
       ----------

       tablename : str
           The new or previosly create table name in database.

       inputdf : RxTextData object
           The data source

       overwrite : bool, optional (default=False)
           Whether or not to overwrite the table.  
           Set to True if this is a new table, otherwise False.
       """

    ####################################################################
    # Set the compute context to SQL SERVER
    ####################################################################

    # NB: don't need, but would be good to know what this actually does here
    # RxComputeContext(LOCAL, '9.1')

    compute_context = RxInSqlServer(connection_string=CONNECTION_STRING)
    # ,
    # num_tasks = 1,
    # auto_cleanup = False,
    # console_output=True
    # )

    rx_set_compute_context(compute_context)

    # if overwrite:

    ####################################################################
    # Create table in SQL server
    ####################################################################

    print("Creating tables...")
    data_source = RxSqlServerData(table=tablename,
                                  connection_string=CONNECTION_STRING)

    ####################################################################
    # Read data into the SQL server table that was just created
    ####################################################################

    print("Reading data into tables...")

    rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True)

    #####################################################################
    # Set up a query on table for train and test data (and ensure factor levels)
    #####################################################################
    print("Setting up query and datasources for train and test sets...")

    # Train data
    data_source_train = RxSqlServerData(
        sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id",
        connection_string=CONNECTION_STRING,
        verbose=True)

    # Import training data RxImport style from new query source
    X_y_train = rx_import(data_source_train)
    # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True)

    print("Test data...")
    # Test data (let's pick ~30% size of training dataset)
    data_source_test = RxSqlServerData(
        sql_query="SELECT * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id \
                    OFFSET 10000 ROWS \
                    FETCH FIRST 3000 ROW ONLY",
        connection_string=CONNECTION_STRING)

    # Import data RxImport style from new query source
    X_y_test = rx_import(data_source_test)
    # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True)

    #####################################################################
    # Run revoscalepy logistic regression on training data (in-database)
    #####################################################################

    print('Fitting a logistic regression model...')

    mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                   data=X_y_train,
                   compute_context=compute_context,
                   verbose=2)
    assert mod is not None
    assert mod._results is not None
    pprint(mod._results)

    #####################################################################
    # Summary on training data (in-database)
    #####################################################################

    # Note:  for "data" use data source and not the rximport'ed data

    print('\nSummary: \n')

    summary = rx_summary("Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                         data=data_source_train,
                         compute_context=compute_context,
                         cube=True,
                         verbose=2)

    #####################################################################
    # Predict on test data (in-database)
    #####################################################################

    print("\nPredict on test: \n")

    pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True)

    #####################################################################
    # Metrics for predition based on groundtruth (with scikit-learn tools)
    #####################################################################

    pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred']
    # For some reason the prediction results are not in a binary [0,1] format
    y_pred = binarize(pred_results, threshold=(min(pred_results) + \
                        max(pred_results))/2).reshape(-1, 1)
    y_true = pred._data['Annual_Income_Bucket_gt120k']

    print("Model prediction results:", y_pred)
    print("Actual values: ", y_true)

    print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))
コード例 #8
0
                 alpha=opacity,
                 color='g',
                 label='NYSE')
rects3 = plt.bar(index + bar_width * 2,
                 df_under_nasdaq,
                 bar_width,
                 alpha=opacity,
                 color='r',
                 label='NASDAQ')
plt.xlabel('Exchange')
plt.ylabel('Return')
plt.title('Underpriced Average Returns by Exchange')
plt.xticks(index + bar_width, ('D1', 'R12', 'R24', 'R36'))
plt.legend()
plt.tight_layout()
plt.show()

# Write data to Output Files using rx_data_step

# ipo2609
rx_data_step(input_data=df,
             output_file="IPO2609Cleaned.xdf",
             overwrite=True,
             xdf_compression_level=5)

# Merged ipo1526
rx_data_step(input_data=df_merged,
             output_file="IPO1526Merged.xdf",
             overwrite=True,
             xdf_compression_level=5)
コード例 #9
0
# 4.4 Converting .xdf Files to Text
import os
import settings as st
from revoscalepy import rx_data_step, RxTextData, RxXdfData

# Import / Export
infile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt')
outfile = os.path.join(st.RESULTS_LOCATION, 'claims.xdf')

claimsCsv = RxTextData(infile)
claimsXdf = RxXdfData(outfile)

rx_data_step(input_data=claimsCsv, output_file=claimsXdf, overwrite=True)

outfile2 = os.path.join(st.RESULTS_LOCATION, 'claimsMyDelimiter.txt')
claimsTxt = RxTextData(outfile2, delimiter="\t")

rx_data_step(input_data=claimsXdf, output_file=claimsTxt, overwrite=True)

## REM: It seems like the 'delimiter' parameter is not taken into account.
## A release 0.9.2 bug?
コード例 #10
0
startTime = datetime.now()
connection_string="Driver={ODBC Driver 17 for SQL Server};Server=,14330;Database=;Trusted_Connection=Yes"
query="""select top 108000 a.transcriptComponentId,   a.componenttext
from dbo.FactEarningsText a

join dbo.FactEarningsCall b
on a.transcriptComponentId = b.transcriptComponentId

where a.componenttext is not null
and b.transcriptcomponentTypeId not in  (3,1,7)

order by rand(checksum(newid()))"""

ds = RxSqlServerData(sql_query = query, connection_string = connection_string,rows_per_read=10000)
df = rx_data_step(ds)

dfToList = df['componenttext'].tolist()
    
#conn = pyodbc.connect('driver={ODBC Driver 11 for SQL Server};'
#                      'server=;'
#                      'database=;'
#                      'Trusted_Connection=yes;')

#sql = "select top 1000 * from dbo.FactEarningsText a"
    
#data = pd.read_sql(sql,conn)
#dfToList = data['componenttext'].tolist()

class CustomVectorizer(CountVectorizer):
                    
##########################################################################################################################################

## Upload the data set to to SQL

##########################################################################################################################################

# Point to the input data set while specifying the classes.
file_path = "..\\Data"
LoS_text = RxTextData(file=os.path.join(file_path, "LengthOfStay.csv"),
                      column_info=col_type_info)

# Upload the table to SQL.
LengthOfStay_sql = RxSqlServerData(table="LengthOfStay",
                                   connection_string=connection_string)
rx_data_step(input_data=LoS_text, output_file=LengthOfStay_sql, overwrite=True)

##########################################################################################################################################

## Determine if LengthOfStay has missing values

##########################################################################################################################################

# First, get the names and types of the variables to be treated.
# For rxSummary to give correct info on characters, stringsAsFactors = True should be used.
LengthOfStay_sql2 = RxSqlServerData(table="LengthOfStay",
                                    connection_string=connection_string,
                                    stringsAsFactors=True)

#col = rxCreateColInfo(LengthOfStay_sql2)    # Not yet available
colnames = rx_get_var_names(LengthOfStay_sql2)
コード例 #12
0
def submit_img(conn_str, table, img):
    inp = {'image': img, 'Label': 0}
    df = pd.DataFrame([inp], columns=inp.keys())
    out = RxSqlServerData(table=table, connection_string=conn_str)
    rx_data_step(input_data=df, output_file=out, overwrite=True)
    print("{} uploaded to {}".format(img, table))
コード例 #13
0
# 3.6 Working with an Xdf Data Source
import os
import settings as st
from revoscalepy import rx_data_step

## REM: The rxOpen and rxReadNext functions are not implemented in RevoscalePy
## However, a workaround can be implemented:

# Importing data by chunks
infile = os.path.join(st.SAMPLE_DATA_DIR,'claims.xdf')

readLines = 0

while True:

    myClaims = rx_data_step(input_data = infile,
        start_row = readLines,
        number_rows_read = 8,
        report_progress = 0)

    # More lines to process?  
    if myClaims is None:
        # Nope
        break

    readLines = readLines + myClaims.shape[0]

    # Process read lines...
    print("Number of read lines: " + str(readLines))
コード例 #14
0
data = data.head(1000)
print("Number of images to featurize: {}".format(len(data)))

# Featureize images
print("Featurizing Images")
rx_set_compute_context(
    local)  # TODO: sql not working correctly. Change to sql later.
featurized_data = compute_features(data)
rx_set_compute_context(local)

# Average Pooling
print("Performing Average Pooling")
pooled_data = average_pool(featurized_data)

# Write features to table
tempfeats_sql = RxSqlServerData(table=TABLE_FEATURES,
                                connection_string=connection_string)
rx_data_step(input_data=pooled_data, output_file=tempfeats_sql, overwrite=True)

# Perform join with labels
print("Joining features with labels")
query = """SELECT {}.*, {}.label FROM {} INNER JOIN {} ON {}.patient_id = {}.patient_id;
    """.format(TABLE_FEATURES, TABLE_LABELS, TABLE_FEATURES, TABLE_LABELS,
               TABLE_FEATURES, TABLE_LABELS)
tempfeats_sql = RxSqlServerData(sql_query=query,
                                connection_string=connection_string)
features_sql = RxSqlServerData(table=TABLE_FEATURES,
                               connection_string=connection_string)
rx_data_step(tempfeats_sql, features_sql, overwrite=True)

print("Routine finished")
コード例 #15
0
cor_X = uni_X.copy()
# Drop columns with insignificant correlations so plot is easier to read
cor_X = cor_X.drop([
    'uw1', 'uw2', 'sectype', 'of', 'ex_code', 't_code', 'audit', 'risks', 'yr',
    'sic', 'lockup', 'expenses'
],
                   axis=1)
corrmat = cor_X.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# Plot heat map
g = sns.heatmap(cor_X[top_corr_features].corr(), annot=True, cmap="RdYlGn")
plt.show()

# Specify final vars to keep for training a model
# Start with non-indicator values with most importance, avoiding highly correlated features
# Note: model_vars is a set to avoid dups
model_vars = {
    'book', 'd', 'audit', 'in', 'lockup', 'op', 'max', 'sel', 'uses', 'of',
    'sa', 'sp'
}
# Add top 50 RFE features
model_vars.update(rfe_features)
model_df = rfe_X[list(model_vars)]
model_df = model_df.join(uni_y)

rx_data_step(input_data=model_df,
             output_file="IPO2609FeatureEngineering.xdf",
             overwrite=True,
             xdf_compression_level=5)
コード例 #16
0
# Make Predictions, then write them to a table.
#NN_prediction = ml_predict(NN_model, data=LoS_Test_import, extra_vars_to_write=["lengthofstay", "eid"], overwrite=True)
#NN_prediction_sql = RxSqlServerData(table="NN_Prediction", string_as_factors = True, connection_string=connection_string)
#rx_data_step(input_data=NN_prediction, output_file=NN_prediction_sql, overwrite=True)

# Compute the performance metrics of the model.
#NN_metrics = evaluate_model(observed=NN_prediction['lengthofstay'], predicted=NN_prediction['Score'], model="NN")

##########################################################################################################################################

##	Write to Master Predictions Table (LoS_Predictions)

##########################################################################################################################################

print("Writing LoS_Predictions")
query = """SELECT LengthOfStay.eid, CONVERT(DATE, LengthOfStay.vdate, 110) as vdate, LengthOfStay.rcount, LengthOfStay.gender,
               LengthOfStay.dialysisrenalendstage, LengthOfStay.asthma, LengthOfStay.irondef, LengthOfStay.pneum, LengthOfStay.substancedependence,
               LengthOfStay.psychologicaldisordermajor, LengthOfStay.depress, LengthOfStay.psychother, LengthOfStay.fibrosisandother,
               LengthOfStay.malnutrition, LengthOfStay.hemo, LengthOfStay.hematocrit, LengthOfStay.neutrophils, LengthOfStay.sodium,
               LengthOfStay.glucose, LengthOfStay.bloodureanitro, LengthOfStay.creatinine, LengthOfStay.bmi, LengthOfStay.pulse,
               LengthOfStay.respiration, number_of_issues, LengthOfStay.secondarydiagnosisnonicd9,
               CONVERT(DATE, LengthOfStay.discharged, 110) as discharged, LengthOfStay.facid, LoS.lengthofstay,
               CONVERT(DATE, CONVERT(DATETIME, LengthOfStay.vdate, 110) + CAST(ROUND(Score, 0) as int), 110) as discharged_Pred,
               CAST(ROUND(Score, 0) as int) as lengthofstay_Pred
         FROM LoS JOIN Forest_Prediction ON LoS.eid = Forest_Prediction.eid JOIN LengthOfStay ON LoS.eid = LengthOfStay.eid;"""
results_sql = RxSqlServerData(sql_query=query,
                              connection_string=connection_string)
los_pred_sql = RxSqlServerData(table="LoS_Predictions",
                               connection_string=connection_string)
rx_data_step(results_sql, los_pred_sql, overwrite=True)
コード例 #17
0
            data[[row["Name"]]] = (data[[row["Name"]]] - row["Mean"])/row["StdDev"]
        return data

    # number_of_issues transform function
    def calculate_number_of_issues(data, context):
        data["number_of_issues"] = to_numeric(data["hemo"]) + to_numeric(data["dialysisrenalendstage"]) + to_numeric(data["asthma"])\
                                  + to_numeric(data["irondef"]) + to_numeric(data["pneum"]) + to_numeric(data["substancedependence"])\
                                  + to_numeric(data["psychologicaldisordermajor"]) + to_numeric(data["depress"])\
                                  + to_numeric(data["psychother"]) + to_numeric(data["fibrosisandother"]) + to_numeric(data["malnutrition"])
        return data

    data = DataFrame(dataset)
    data = standardize(data, context)
    data = calculate_number_of_issues(data, context)
    return data

# We drop the LoS view in case the SQL Stored Procedure was executed in the same database before.
#drop_view("LoS", connection_string)

# Standardize the cleaned table by wrapping it up in rx_data_step. Output is written to LoS.
LengthOfStay_cleaned_sql = RxSqlServerData(sql_query="SELECT * FROM [{}]".format(table_name),
                                           connection_string=connection_string)
LoS_sql = RxSqlServerData(table="LoS", connection_string=connection_string)

rx_data_step(input_data=LengthOfStay_cleaned_sql, output_file=LoS_sql, overwrite=True, transform_function=transform)


## is it needed if we force col_info?
# alter_column("LoS", "number_of_issues", "varchar(2)", connection_string)
# alter_column("LoS", "lengthofstay", "float", connection_string)    # int -> float for regression
# 4.3 Using the Data Step to Create an .xdf File from a Data Frame
import os
import settings as st
import pandas as pd
import numpy as np
from revoscalepy import rx_data_step, rx_get_info

np.random.seed(39)

x1 = np.random.normal(size = 10000)
x2 = np.random.uniform(size = 10000)
x3 = x1 + x2

s = np.stack((x1,x2,x3))
s = np.transpose(s)

myData = pd.DataFrame(s, columns = ['x1','x2','x3']).query("x2 > .1")

# Export files directory
outFile = os.path.join(st.RESULTS_LOCATION, 'testFile.xdf')

rx_data_step(input_data = myData,
    output_file = outFile,
    rows_per_read = 5000, 
    overwrite = True)

print(rx_get_info(outFile))
コード例 #19
0
News_Test_text = RxTextData(file=os.path.join(file_path, "News_Test"),
                            delimiter="\t")
Label_Names_text = RxTextData(file=os.path.join(file_path, "Label_Names"),
                              delimiter="\t")

# Point to the SQL tables where they will be written.
News_Train_sql = RxSqlServerData(table="News_Train",
                                 connection_string=connection_string)
News_Test_sql = RxSqlServerData(table="News_Test",
                                connection_string=connection_string)
Label_Names_sql = RxSqlServerData(table="Label_Names",
                                  connection_string=connection_string)

# Export them to SQL Server.
rx_data_step(input_data=News_Train_text,
             output_file=News_Train_sql,
             overwrite=True)
rx_data_step(input_data=News_Test_text,
             output_file=News_Test_sql,
             overwrite=True)
rx_data_step(input_data=Label_Names_text,
             output_file=Label_Names_sql,
             overwrite=True)

##########################################################################################################################################
##	Get the factor levels of the label
##########################################################################################################################################

# Get the factor levels of the label.
Factors_sql = RxSqlServerData(
    sql_query="SELECT DISTINCT Label FROM News_Train",
コード例 #20
0
                           data=train_sql,
                           num_trees=1000,
                           method="binary",
                           random_seed=5)
rx_set_compute_context(local)

# Serialize model and insert into table
insert_model(TABLE_CLASSIFIERS, connection_string, classifier,
             FASTTREE_MODEL_NAME)

# Point to the SQL table with the testing data
query = "SELECT * FROM {} WHERE patient_id NOT IN (SELECT patient_id FROM {})".format(
    TABLE_FEATURES, TABLE_TRAIN_ID)
test_sql = RxSqlServerData(sql_query=query,
                           connection_string=connection_string,
                           column_info=column_info)

# Make predictions on the test data
predictions = ml_predict(classifier,
                         data=test_sql,
                         extra_vars_to_write=["label", "patient_id"])
predictions_sql = RxSqlServerData(table=TABLE_PREDICTIONS,
                                  connection_string=connection_string)
rx_data_step(predictions, predictions_sql, overwrite=True)
print(predictions.head())

# Evaluate model using ROC
roc(predictions["label"], predictions["Probability"])

print("Routine finished")
コード例 #21
0
# 2.12 Reading Data from an .xdf File into a Data Frame (row selection)
import os
import settings as st
from revoscalepy import rx_data_step, rx_get_info

## Unfortunately, the 'row_selection' parameter of rx_data_step is not supported.
## See https://docs.microsoft.com/en-us/machine-learning-server/python-reference/revoscalepy/rx-data-step#row_selection

## However, it is possible filter rows after data is loaded

# Import file
infile = os.path.join(st.SAMPLE_DATA_DIR, 'CensusWorkers.xdf')

# Loading data
myCensusDF = rx_data_step(input_data=infile,
                          vars_to_keep=['age', 'perwt', 'sex', 'state'])

print(myCensusDF.head())

# state == "Washington" & age > 40
mySelection1 = myCensusDF.query("state=='Washington' and age>40")
print(mySelection1.head())

# Every 10th row
mySelection2 = myCensusDF.iloc[::10, :]
print(mySelection2.head())
コード例 #22
0
def compute_features(data):
    featurized_data = rx_data_step(input_data=data, overwrite=True, transform_function=featurize_transform)
    featurized_data.columns = ["image", "patient_id"] + ["f" + str(i) for i in range(len(featurized_data.columns)-2)]
    return featurized_data
コード例 #23
0
# Calculating the entropy
# Function that calculates the entropy
def f_entropy(indata):
    indataprob = indata.value_counts() / len(indata)
    entropy = sc.stats.entropy(indataprob, base=2)
    return entropy


# Use the function on variables
f_entropy(
    TM.NumberCarsOwned), np.log2(5), f_entropy(TM.NumberCarsOwned) / np.log2(5)
f_entropy(TM.BikeBuyer), np.log2(2), f_entropy(TM.BikeBuyer) / np.log2(2)

# rx_data_step filtering variables
from revoscalepy import rx_data_step
TM4 = rx_data_step(input_data=TM.iloc[0:3, ],
                   vars_to_keep={'CustomerKey', 'Age', 'AgeCUB'})
TM4

###########################
# More code as a bonus:-) #
###########################

# Projections
TM1 = TM[["CustomerKey", "MaritalStatus"]]
TM2 = TM[["CustomerKey", "Gender"]]

# Positional access
TM1.iloc[0:3, 0:2]
TM2.iloc[0:3, 0:2]

# Filter and projection
コード例 #24
0
DECLARE @PythonScript NVARCHAR(MAX) =CONCAT('
import pandas as pd
import os
import glob
from revoscalepy import RxSqlServerData, rx_data_step
sqlConnString = "Driver=SQL Server;Server=Serv; ',@DBConnectionString,'"
Filefolderepath = ImportFilePath+"*.xlsx"
if ImportAll ==1 and CombineTarget==0:
    for FullFilePath in glob.glob(Filefolderepath):
       Filename = os.path.basename(FullFilePath).replace(".xlsx","")
       xl = pd.ExcelFile(FullFilePath)
       for sheetname in xl.sheet_names:
          Output = pd.read_excel(FullFilePath, sheetname=sheetname, na_filter=False).astype(str)
       if not Output.empty:
          sqlDS = RxSqlServerData(connection_string = sqlConnString,table = "".join(fl for fl in Filename if fl.isalnum())+"_"+"".join(sh for sh in sheetname if sh.isalnum()))
          rx_data_step(input_data = Output, output_file = sqlDS,overwrite = True)
if ImportAll ==1 and CombineTarget==1:
   df2=pd.DataFrame()
   for FullFilePath in glob.glob(Filefolderepath):
       Filename = os.path.basename(FullFilePath).replace(".xlsx","")
       xl = pd.ExcelFile(FullFilePath)
       for sheetname in xl.sheet_names:
           Output = pd.read_excel(FullFilePath, sheetname=sheetname).columns.astype(str)
           Output = ",".join(list(Output))
           df1 = pd.DataFrame([[Filename,sheetname,FullFilePath,Output]],columns=["Filename","sheetname","FullFilePath","Headers"])
           df2=df2.append(df1,ignore_index=True)
   sqlDS = RxSqlServerData(connection_string = sqlConnString,table = "Tbl_PyImpExp1")
   rx_data_step(input_data = df2, output_file = sqlDS,overwrite = True)
if ImportAll ==0:
   Filename =ImportFilePath+ExcelFileName+".xlsx"
   exists = os.path.isfile(Filename)
    def dummy_vars(df, context):
        # Dummy vars
        df_dummy = pd.get_dummies(
            df, columns=['Annual_Income_Bucket', 'Highest_Education'])

        # Rename columns (remove spaces etc.)
        new_cols = [
            x.replace(' ', '_').replace('>', 'gt').replace('<', 'lt')
            for x in df_dummy.columns
        ]
        df_dummy.columns = new_cols
        return df_dummy

    output_file = os.path.join(file_path, "Lead_Demography.xdf")

    # Set a csv as a data source
    data_source = RxTextData(file=inputfile, delimiter=',')

    # Dummy variables and output to named xdf
    rx_data_step(input_data=data_source,
                 output_file=output_file,
                 overwrite=True,
                 transform_function=dummy_vars)

    rx_get_info(data=output_file, verbose=2)

    ####################################################################
    # Call main function to work in SQL compute context
    ####################################################################

    main(tablename="Lead_Demography_Tbl", inputdf=output_file, overwrite=True)