                            loss_function=distribution,
                            n_tree=n_trees,
                            learning_rate=shrinkage,
                            sample_rate=bag_fraction,
                            max_depth=interaction_depth,
                            min_bucket=n_min_obs_in_node,
                            seed=seed,
                            replace=False,
                            max_num_bins=200)
probArray = rv.rx_predict(rx_btrees_model, data=ipo_test)
fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray)
aucResult = auc(fpr, tpr)
print("rx-btrees AUC: " + str(aucResult))

# MicrosoftML Logistic Regression
ml_lreg_model = rx_logistic_regression(formula=formula, data=ipo_train)
ml_lreg_score = ml.rx_predict(ml_lreg_model,
                              data=ipo_test,
                              extra_vars_to_write=["underpriced"])
# Convert the reported probability (which refers to the predicted label)
# into the probability of the positive class, then compute the ROC curve
# against the true labels.
prob_pred = [row["Probability"] if row["PredictedLabel"]
             else 1 - row["Probability"]
             for _, row in ml_lreg_score.iterrows()]
fpr, tpr, th = roc_curve((ipo_test["underpriced"] == 1).to_numpy(), prob_pred)
aucResult = auc(fpr, tpr)
print("ml-logistic-reg AUC: " + str(aucResult))

# MicrosoftML Fast Forest
ml_ff_model = rx_fast_forest(formula=formula, data=ipo_train)
ml_ff_pred = ml.rx_predict(ml_ff_model,
                           data=ipo_test)
Example #2
################################
# We define this column as a category.
data["cat"] = data["cat"].astype("category")

print("problem dimension:", data.shape)
print(data.head())

###################################################
# Let's train a logistic regression.

formula = "Label ~ {0}".format(" + ".join(data.columns[1:]))
print(formula)

from microsoftml import rx_logistic_regression
logregml = rx_logistic_regression(formula, data=data)

#########################################
# Let's predict now.

from microsoftml import rx_predict
scores = rx_predict(logregml, data=data)
print(scores.head())

#########################################
# Let's change the type of the category column to numerical
# and predict again.

data["cat"] = data["cat"].astype(float)
try:
    scores = rx_predict(logregml, data=data)
except Exception as e:
    # The prediction fails: the model was trained with a categorical
    # column, but the data now carries a numerical one.
    print(e)
Example #3
import pandas

data = pandas.DataFrame(data=X, columns=["X1", "X2"])
data["Label"] = Y.astype(float)

##########################
#
# From a geometrical point of view, a binary classification
# problem consists of finding the best boundary between
# two clouds of points. The simplest assumption is that the
# boundary is a straight line. In this case, a logistic
# regression model will help us.

from microsoftml import rx_logistic_regression, rx_predict

logreg = rx_logistic_regression("Label ~ X1 + X2", data=data)

##############################
# The model produces a linear boundary
# whose coefficients are:
print(logreg.coef_)
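
###############################
# A minimal sketch of how that boundary could be traced, assuming
# ``coef_`` maps feature names, plus a ``(Bias)`` intercept entry,
# to their weights:

import numpy

coefs = dict(logreg.coef_)
b = coefs.get("(Bias)", 0.0)         # intercept (assumed key name)
w1, w2 = coefs["X1"], coefs["X2"]    # feature weights

# Points on the boundary satisfy w1 * X1 + w2 * X2 + b = 0.
xs = numpy.linspace(data["X1"].min(), data["X1"].max(), 50)
ys = -(w1 * xs + b) / w2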

###############################
# We could draw that line (as sketched above), but the
# graph would only be valid for a linear model.
# Instead, we color the background of the graph with
# the color of the class predicted by the model.

import numpy

import pandas
df = pandas.DataFrame(data=X, columns=["X1", "X2"])
df["Label"] = Y.astype(float)

###########################################################################
# :epkg:`microsoftml` must be told it is a multi-class classification problem.
# This may seem a regression compared to :epkg:`scikit-learn`.
# However, because :epkg:`microsoftml` can deal with out-of-memory datasets,
# the third class could appear at the end of the training dataset.
# The parameter *verbose* can take the values 0, 1, or 2.
# If it is greater than 0, :epkg:`microsoftml` displays information about
# the training on the standard output.

from microsoftml import rx_logistic_regression, rx_predict
logregml = rx_logistic_regression("Label ~ X1 + X2", data=df, method="multiClass", verbose=1)
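
###################################
# ``gridX`` below is assumed to be a 2D mesh of points covering the
# data, built for instance as:

import numpy
xx, yy = numpy.meshgrid(numpy.linspace(df["X1"].min(), df["X1"].max(), 100),
                        numpy.linspace(df["X2"].min(), df["X2"].max(), 100))
gridX = numpy.c_[xx.ravel(), yy.ravel()]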

###################################
# We convert the grid (numpy array) into a dataframe.
dfgrid = pandas.DataFrame(data=gridX, columns=["X1", "X2"])
gridml = rx_predict(logregml, dfgrid)

##################################
# :epkg:`microsoftml` returns three scores, one per class.
print(gridml.head(n=3))

##################################
# We pick the class with the highest score.
predicted_classes = numpy.argmax(gridml.to_numpy(), axis=1)

#####################
# Display the test image.
import matplotlib.pyplot as plt
from PIL import Image

fig, ax = plt.subplots(1, 1)
ax.imshow(Image.open(test_df.loc[0, "image"]))


########################################################
# We train a multiclass classifier with the :epkg:`microsoftml:rx_logistic_regression`
# algorithm. Just for kicks, and to compare with the previous sample,
# we'll use the Alexnet model to featurize the images.

from microsoftml import rx_featurize, load_image, resize_image, extract_pixels, featurize_image
from microsoftml import rx_logistic_regression

image_model = rx_logistic_regression(formula="Label~Features", data=train_df, 
                                     method="multiClass", ml_transforms=[
                        load_image(cols=dict(Features="image")),
                        resize_image(cols="Features", width=227, height=227),
                        extract_pixels(cols="Features"),
                        featurize_image(cols="Features", dnn_model="Alexnet")])
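
########################################################
# A minimal sketch, assuming the same transform chain can also be run
# on its own with ``rx_featurize`` to inspect the DNN features before
# any training takes place:

features = rx_featurize(data=train_df, ml_transforms=[
    load_image(cols=dict(Features="image")),
    resize_image(cols="Features", width=227, height=227),
    extract_pixels(cols="Features"),
    featurize_image(cols="Features", dnn_model="Alexnet")])
print(features.head())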

############################
# Note that ``method="multiClass"`` indicates that this is a multiclass
# training task. Finally, let's give the model an image that was not part
# of the original training set and predict its type.

from microsoftml import rx_predict
prediction = rx_predict(image_model, data=test_df)
print(prediction)

Example #6
# Define the transforms that featurize the text column before training.
# The "Text" column mapping below is an assumed placeholder.
from microsoftml import featurize_text, n_gram_hash, rx_logistic_regression
from revoscalepy import (RxOdbcData, RxSqlServerData,
                         rx_set_compute_context, rx_write_object)

text_transform_list = [
    featurize_text(cols=dict(Features="Text"),
                   char_feature_extractor=n_gram_hash(hash_bits=17,
                                                      ngram_length=3,
                                                      seed=4),
                   vector_normalizer="L2")
]

# Point to the training set.
News_Train_sql = RxSqlServerData(table="News_Train",
                                 connection_string=connection_string,
                                 column_info=factor_info)

# Train the model.
logistic_model = rx_logistic_regression(formula=training_formula,
                                        data=News_Train_sql,
                                        method="multiClass",
                                        l2_weight=1,
                                        l1_weight=1,
                                        ml_transforms=text_transform_list,
                                        train_threads=4)
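
# A minimal sketch of scoring the model, assuming a matching "News_Test"
# table exists in the same database (the table name is an assumption).
from microsoftml import rx_predict
News_Test_sql = RxSqlServerData(table="News_Test",
                                connection_string=connection_string,
                                column_info=factor_info)
predictions = rx_predict(logistic_model, data=News_Test_sql,
                         extra_vars_to_write=["Label"])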

# Serialize and save the model to SQL Server.
rx_set_compute_context(local)
models_odbc = RxOdbcData(connection_string, table="Model")
rx_write_object(models_odbc,
                key="LR",
                value=logistic_model,
                serialize=True,
                overwrite=True)
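
# A minimal sketch of the reverse step: the model could later be read back
# from the same table with revoscalepy's rx_read_object.
from revoscalepy import rx_read_object
restored_model = rx_read_object(models_odbc, key="LR", deserialize=True)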

# Set the Compute Context back to SQL.
rx_set_compute_context(sql)
Example #7
# Plot the two classes; ``data`` and ``labels`` come from earlier in the sample.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)
ax.scatter(data[labels == 0, 0], data[labels == 0, 1], label="class 0")
ax.scatter(data[labels == 1, 0], data[labels == 1, 1], label="class 1")

#############################
# We put the data into a dataframe.

import pandas
df = pandas.DataFrame(data=data, columns=["X1", "X2"])
df["Label"] = labels

#################################
# We train a logistic regression.

from microsoftml import rx_logistic_regression, rx_predict
logreg = rx_logistic_regression("Label ~ X1 + X2", data=df)

#################################
# And we display the results.

import numpy


def colorie(X, model, ax, fig, additional_columns=None, additional_names=None):
    # Color the background of the plot with the class predicted on a grid
    # covering the data.
    if isinstance(X, pandas.DataFrame):
        X = X.to_numpy()
    xmin, xmax = numpy.min(X[:, 0]), numpy.max(X[:, 0])
    ymin, ymax = numpy.min(X[:, 1]), numpy.max(X[:, 1])
    hx = (xmax - xmin) / 100
    hy = (ymax - ymin) / 100
    xx, yy = numpy.mgrid[xmin:xmax:hx, ymin:ymax:hy]