Example #1
import os

from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from azureml.logging import get_azureml_logger
from azureml.dataprep.package import run

run_logger = get_azureml_logger()

os.makedirs('./outputs', exist_ok=True)

# read dataset as dataframe
print('-------Training model-------')
data = run('Weekly.dprep', dataflow_idx=0, spark=False)
print('Dataset shape: {}'.format(data.shape))

# read features and labels
X = data.iloc[:, 2:3574]
Y = data['Goal']

# normalize data
X = preprocessing.normalize(X, norm='l2')

train_X, test_X, train_Y, test_Y = train_test_split(X,
                                                    Y,
                                                    train_size=0.7,
                                                    random_state=1,
                                                    stratify=Y)
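
The snippet stops right after the split; a minimal sketch of how training might continue, using only the modules it already imports (svm, roc_curve, auc) and the run_logger it initializes. The kernel and C value are illustrative assumptions, not values from the original.

# assumed continuation: train an SVM and log ROC AUC through run_logger
# (assumes 'Goal' is a binary label)
clf = svm.SVC(kernel='linear', C=1.0, probability=True)
clf.fit(train_X, train_Y)

probs = clf.predict_proba(test_X)[:, 1]
fpr, tpr, _ = roc_curve(test_Y, probs)
run_logger.log('AUC', auc(fpr, tpr))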
Example #2
import os
import sys

import numpy as np
from sklearn.model_selection import train_test_split

from azureml.logging import get_azureml_logger
from azureml.dataprep.package import run

from plot_graphs import plot_iris

# initialize the logger
run_logger = get_azureml_logger()

# create the outputs folder
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}'.format(sys.version))
print()

# load Iris dataset from a DataPrep package as a pandas DataFrame
iris = run('iris.dprep', dataflow_idx=0, spark=False)
print('Iris dataset shape: {}'.format(iris.shape))

# load features and labels
X, Y = iris[['Sepal Length', 'Sepal Width', 'Petal Length',
             'Petal Width']].values, iris['Species'].values

# add n more random features to make the problem harder to solve
# number of new random features to add
n = 40
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, n)]

# split data 65%-35% into training set and test set
# (completion assumed from the comment above; the call was cut off in the source)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.35, random_state=0)
Example #3
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('data_preprocessing.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)
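
The template's last line asks to be replaced with code that uses the DataFrame; one hedged possibility, using the logger already initialized above:

# illustrative use of the DataFrame: log its dimensions
rows, columns = df.shape
logger.log('Rows', rows)
logger.log('Columns', columns)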
Example #4
    regexList += [r'^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$']
    regexList += [r'[\w\d\-\_\.]+ @ [\w\d\-\_\.]+']
    regexList += ['Subject:']
    regexList += ['[^a-zA-Z]']

    return regexList


if __name__ == '__main__':
    ####################
    # Use this with AML Workbench to load data from data prep file
    # dfIncidents = package.run('Incidents.dprep', dataflow_idx=0)
    # dfIncidents = pd.read_csv('allIncidents.csv', encoding="ISO-8859-1")
    # dfRequests = package.run('Requests.dprep', dataflow_idx=0)
    dfIncidents = package.run('IncidentsCleaned.dprep', dataflow_idx=0)
    dfRequests = package.run('RequestsCleaned.dprep', dataflow_idx=0)

    # Load dataset from file
    # dfIncidents = pd.read_csv('./data/endava_tickets/all_incidents.csv')
    # dfRequests = pd.read_csv('./data/endava_tickets/all_requests.csv')
    #####################

    # Reorder columns
    columnsOrder = [
        'title', 'body', 'ticket_type', 'category',
        'sub_category1', 'sub_category2', 'business_service',
        'urgency', 'impact'
    ]
    dfIncidents = dfIncidents[columnsOrder]
    dfRequests = dfRequests[columnsOrder]
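
The regexes above presumably strip e-mail addresses, subject markers, and non-letters from the ticket text before modeling; a sketch of how they might be applied. The enclosing function's name was cut off in the source, so getRegexList is hypothetical; 'body' is taken from columnsOrder.

    import re

    def cleanText(text, regexList):
        # apply each pattern in turn, replacing matches with a space
        for expr in regexList:
            text = re.sub(expr, ' ', text)
        return ' '.join(text.split())

    regexList = getRegexList()  # hypothetical name for the truncated function above
    dfIncidents['body'] = dfIncidents['body'].apply(lambda t: cleanText(str(t), regexList))
    dfRequests['body'] = dfRequests['body'].apply(lambda t: cleanText(str(t), regexList))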
Example #5
import os
import sys

import numpy as np
from sklearn.model_selection import train_test_split

from azureml.sdk import data_collector
from azureml.dataprep.package import run

from iris_plot_lib import plot_iris

# initialize the logger
run_logger = data_collector.current_run()

# create the outputs folder
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}'.format(sys.version))
print()

# load Iris dataset from a DataPrep package
iris = run('iris.dprep', dataflow_idx=0)
print('Iris dataset shape: {}'.format(iris.shape))

# load features and labels
X, Y = iris[['Sepal Length', 'Sepal Width', 'Petal Length',
             'Petal Width']].values, iris['Species'].values

# add n more random features to make the problem harder to solve
# number of new random features to add
n = 40
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, n)]

# split data 65%-35% into training set and test set
# (completion assumed from the comment above; the call was cut off in the source)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.35, random_state=0)
Example #6
#========================= LOAD DATASET USING DATAPREP PACKAGE =========================
from azureml.dataprep.package import run

dataset = run('social-ads.dprep', dataflow_idx=0, spark=False)
#print(dataset)

X = dataset.iloc[:, [2,3]].values  # Just use salary column [1] first to show accuracy improvement
y = dataset.iloc[:, 4].values

print(X)
print(y)

'''
#========================= LOAD DATASET USING BLOB STORAGE =========================
from azure.storage.blob import BlockBlobService
import pandas as pd

ACCOUNT_NAME = "mlgputraining"  # "<account name>"
ACCOUNT_KEY = "<account key>"  # never hard-code a real key in source
CONTAINER_NAME = "datasets"  # "<container name>"

blobService = BlockBlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)
blobService.get_blob_to_path(CONTAINER_NAME, 'social_network_ads.csv', 'social_network_ads.csv')

dataset = pd.read_csv('social_network_ads.csv')
print(dataset)

X = dataset.iloc[:, [2, 3]].values  # Just use salary column [1] first to show accuracy improvement
y = dataset.iloc[:, 4].values

print(X)
'''
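
The commented-out blob variant originally embedded a real account key in source (redacted above); a safer sketch reads it from the environment instead. The variable name is an assumption.

import os

ACCOUNT_KEY = os.environ.get('STORAGE_ACCOUNT_KEY')  # hypothetical environment variable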
Example #7
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('CATelcoCustomerChurnTrainingBlobSample.dprep',
                 dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)
Example #8
import argparse
import os

import pandas as pd
from azureml.dataprep.package import run

# add experiment arguments
parser = argparse.ArgumentParser()
# parser.add_argument('--arg', action='store_true', help='My Arg')
args = parser.parse_args()

# create the outputs folder
os.makedirs('./outputs', exist_ok=True)

print("Loading dataset...")
print()

################################################################
##### Load dataset from a DataPrep package as a pandas DataFrame
################################################################

df = run('train.dprep', dataflow_idx=0, spark=False)
df = df.dropna(how='any', axis=0)

# One hot encoding
df.loc[:, 'one_way'] = df.one_way.astype('uint8')
ohe_fields = ['one_way', 'surface_type', 'street_type', 'hour', 'weekday', 'month']
df_ohe = pd.get_dummies(df, columns=ohe_fields)

# Get the one-hot variable names
ohe_feature_names = pd.get_dummies(df[ohe_fields], columns=ohe_fields).columns.tolist()

# Names of the continuous features
float_feature_names = [
    'accident_counts',
    'speed_limit',
    'aadt',
Example #9
# This code snippet will load the referenced package and return a DataFrame.
# If the code is run in a PySpark environment, then the code will return a
# Spark DataFrame. If not, the code will return a Pandas DataFrame. You can
# copy this code snippet to another code file as needed.
from azureml.dataprep.package import run


# Use this DataFrame for further processing
df = run('sampleReviews.dprep', dataflow_idx=0)

rows, columns = df.shape

for i in range(0, rows):
    try:
        print(df.iloc[i,0] + ' ' + df.iloc[i,1])
    except UnicodeEncodeError:
        pass
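
Skipping rows on UnicodeEncodeError silently drops data; an alternative sketch prints every row by replacing unencodable characters instead:

for i in range(rows):
    line = '{} {}'.format(df.iloc[i, 0], df.iloc[i, 1])
    print(line.encode('ascii', errors='replace').decode('ascii'))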
Example #10
# This code snippet will load the referenced package and return a DataFrame.
# If the code is run in a PySpark environment, then the code will return a
# Spark DataFrame. If not, the code will return a Pandas DataFrame. You can
# copy this code snippet to another code file as needed.
from azureml.dataprep.package import run

# Use this DataFrame for further processing
df = run('iris-1.dprep', dataflow_idx=0)
Example #12
# for row in zip(height, weight):
#     print(row[0][0],"->",row[1])

# plt.scatter(height,weight,color='black')
# plt.xlabel("height")
# plt.ylabel("weight")
# plt.show()

# run the prep package and get the data frame
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('Prep4.dprep', dataflow_idx=0, spark=False)

# Preview the first few rows
print(df.head(10))
height = df[['Heightft']].values
weight = df[['Weightkg']].values

plt.scatter(height, weight, color='black')
plt.xlabel("height")
plt.ylabel("weight")
plt.show()

reg = linear_model.LinearRegression()
reg.fit(height, weight)

# emit slope and intercept
print('slope: {}'.format(reg.coef_[0][0]))
print('intercept: {}'.format(reg.intercept_[0]))
Example #13
import os
import sys

import numpy as np
from azureml.dataprep.package import run
from azureml.logging import get_azureml_logger
from sklearn.model_selection import train_test_split

run_logger = get_azureml_logger()

# Get count from ArcGIS (search_fire is assumed to be defined elsewhere in the project)
# Only run the search when a term is passed on the command line,
# otherwise search_term would be undefined
if len(sys.argv) > 2:
    search_term = float(sys.argv[2])
    webmap_count = search_fire(search_term)
    run_logger.log('Webmaps', webmap_count)

# create the outputs folder
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}'.format(sys.version))
print()

# load Iris dataset from a DataPrep package as a pandas DataFrame
iris = run('column-prep.dprep', dataflow_idx=0, spark=False)
print('Iris dataset shape: {}'.format(iris.shape))

# load features and labels
X, Y = iris[['Sepal Length', 'Sepal Width', 'Petal Length',
             'Petal Width']].values, iris['Species'].values

# add n more random features to make the problem harder to solve
# number of new random features to add
# n = 40
n = 10
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, n)]

# split data 65%-35% into training set and test set
# (completion assumed; the call itself was cut off in the source)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.35, random_state=0)
Example #14
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('column-prep.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
print(df)
Example #15
import os
import sys

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve

from azureml.logging import get_azureml_logger
from azureml.dataprep.package import run

# Initialize the logger
run_logger = get_azureml_logger()

# Create the outputs folder where the training results will be stored
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}\n'.format(sys.version))

# Load dataset from a DataPrep package as a pandas DataFrame
df = run('df.dprep', dataflow_idx=0, spark=False)
print('Dataset shape: {}'.format(df.shape))

# Define the features columns and the label column
X, Y = df[[
    'batteryVoltage', 'responseTime', 'ambientHumidity', 'ambientTemperature'
]].values, df['severity_status'].values

# Add n more random features to make the problem harder to solve
# Randomness is required because the df.csv dataset is an example dataset
# and it is easily ranked with almost 100% accuracy
# n - number of new random features to add
n = 17
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, n)]
Example #16
import numpy as np

from azureml.dataprep.package import run
from keras.models import Sequential
from keras.layers import (Activation, Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D)


def train_model(ratio=0.5):
    # set parameters:
    max_features = 5000
    maxlen = 400
    batch_size = 32
    embedding_dims = 50
    filters = 250
    kernel_size = 3
    hidden_dims = 250
    epochs = 2
    seed = 113
    # get the reviews_list and labels_list from the csv file

    df = run('sampleReviews.dprep', dataflow_idx=0)

    rows, columns = df.shape
    reviews_list = []
    labels_list = []

    for i in range(0, rows):
        try:
            labels_list.append(int(float(df.iloc[i, 1])))
            reviews_list.append(df.iloc[i, 0])
            # print(df.iloc[i,0] + ' ' + df.iloc[i,1])
        except UnicodeEncodeError:
            pass

    # get the corresponding vectors from the data set
    reviews_list_vec = get_vectors_from_text(reviews_list)
    # shuffle the data set
    np.random.seed(seed)
    np.random.shuffle(reviews_list_vec)
    np.random.seed(seed)
    np.random.shuffle(labels_list)
    # split the data set into train and test data
    x_train = reviews_list_vec[:int(len(reviews_list) * ratio)]
    y_train = labels_list[:int(len(labels_list) * ratio)]
    x_test = reviews_list_vec[int(len(reviews_list) * ratio):]
    y_test = labels_list[int(len(labels_list) * ratio):]
    print('Building model...')
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.2))
    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(
        Conv1D(filters,
               kernel_size,
               padding='valid',
               activation='relu',
               strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))
    return model
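
A sketch of how the function might be called and its result persisted to the ./outputs folder the other examples create; the ratio and file name are assumptions.

model = train_model(ratio=0.5)
model.save('./outputs/reviews_cnn.h5')  # hypothetical file name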
Example #17
import argparse
import os
import sys

from azureml.dataprep.package import run
from sklearn.model_selection import train_test_split

parser = argparse.ArgumentParser()
# parser.add_argument('--arg', action='store_true', help='My Arg')
args = parser.parse_args()
print(args)

# This is how you log scalar metrics
# logger.log("MyMetric", value)

# Create the outputs folder - save any outputs you want managed by AzureML here
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}'.format(sys.version))
print()

# load the dataset from a DataPrep package as a pandas DataFrame
titanic_dataset = run('dataset-clear.dprep', dataflow_idx=0, spark=False)
print('Titanic dataset shape: {}'.format(titanic_dataset.shape))
print(titanic_dataset.dtypes)

# load features and labels
X = titanic_dataset[[
    'MAX_ACCX', 'MAX_ACCY', 'MAX_ACCZ', 'MAX_GYROX', 'MAX_GYROY', 'MAX_GYROZ',
    'ACC_AVG', 'ACC_VAR', 'GYRO_AVG', 'GYRO_VAR'
]].values
Y = titanic_dataset['WILDNESS'].values

# split data 65%-35% into training set and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.35,
                                                    random_state=0)
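
The snippet ends at the split; nothing in the original names a model, so this continuation with a decision tree is purely an assumption:

from sklearn.tree import DecisionTreeClassifier  # assumed model choice

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, Y_train)
print('Test accuracy: {:.3f}'.format(clf.score(X_test, Y_test)))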
Example #18
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('iris.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)
Example #19
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('BankMarketCampaignTrainingSample.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)
Example #20
from azureml.dataprep import package
from azureml.logging import get_azureml_logger
from keras.callbacks import Callback
from keras.preprocessing.text import Tokenizer

logger = get_azureml_logger()


class AzureMlDataRecording(Callback):

    def on_epoch_end(self, epoch, logs=None):
        global logger

        logger.log('Training loss', float(logs.get('loss')))
        logger.log('Training accuracy', float(logs.get('acc')))


##
## STEP 1: Perform final prep work for training
##

# We did some prep work earlier that we use here.
# By running the data preparation packages we clean up the training and validation data.
train_data = package.run('training_prepared.dprep', dataflow_idx=0)
validation_data = package.run('validation_prepared.dprep', dataflow_idx=0)

# Fix up the intent column so it is categorical
train_data['intent'] = train_data['intent'].astype('category')
validation_data['intent'] = validation_data['intent'].astype('category')
validation_data['intent'].cat.categories = train_data['intent'].cat.categories.tolist()

# The neural network doesn't understand text at all.
# Therefore we convert the input text to vector representations using a tokenizer.
tokenizer = Tokenizer(num_words=2500)
tokenizer.fit_on_texts(train_data['text'].values)

# The training features and labels are made compatible with the neural network here.
# Notice the use of the tokenizer and a smart trick from pandas to convert the data.
features = tokenizer.texts_to_sequences(train_data['text'].values)
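
Before they can feed the network, the tokenized sequences still need fixed-length padding and the categorical intents a numeric encoding; a hedged sketch of those next steps (maxlen is an assumption):

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

features = pad_sequences(features, maxlen=50)  # maxlen is an assumption
labels = to_categorical(train_data['intent'].cat.codes)  # pandas category codes -> one-hot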
Example #21
import os

# Create the outputs folder - save any outputs you want managed by AzureML here
os.makedirs('./outputs', exist_ok=True)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import utils

# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('prep_airline_data.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)
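
The imports above (LabelEncoder, confusion_matrix, Sequential, utils) suggest the intended next step is encoding a label column for Keras; a hedged sketch, with the column name purely hypothetical:

encoder = LabelEncoder()
y = encoder.fit_transform(df['label_column'].values)  # 'label_column' is hypothetical
y_onehot = utils.to_categorical(y)  # integer codes -> one-hot for the network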
Example #22
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('Wildfire_News.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)