from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from azureml.logging import get_azureml_logger
from azureml.dataprep.package import run

# Logger used to record metrics for this run.
run_logger = get_azureml_logger()

# Make sure the output directory exists before anything is written to it.
os.makedirs('./outputs', exist_ok=True)

# Load the prepared dataset as a DataFrame.
print('-------Training model-------')
data = run('Weekly.dprep', dataflow_idx=0, spark=False)
print('Dataset shape: {}'.format(data.shape))

# Labels come from the 'Goal' column; features are the positional
# column slice 2:3574.
Y = data['Goal']
X = data.iloc[:, 2:3574]

# L2-normalize the feature matrix.
X = preprocessing.normalize(X, norm='l2')

# 70% of the samples go to training; the split is stratified on the labels.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=1,
    stratify=Y,
)
from azureml.logging import get_azureml_logger from azureml.dataprep.package import run from plot_graphs import plot_iris # initialize the logger run_logger = get_azureml_logger() # create the outputs folder os.makedirs('./outputs', exist_ok=True) print('Python version: {}'.format(sys.version)) print() # load Iris dataset from a DataPrep package as a pandas DataFrame iris = run('iris.dprep', dataflow_idx=0, spark=False) print('Iris dataset shape: {}'.format(iris.shape)) # load features and labels X, Y = iris[['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']].values, iris['Species'].values # add n more random features to make the problem harder to solve # number of new random features to add n = 40 random_state = np.random.RandomState(0) n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, n)] # split data 65%-35% into training set and test set X_train, X_test, Y_train, Y_test = train_test_split(X,
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('data_preprocessing.dprep', dataflow_idx=0)

# Preview the first ten rows. A bare `df.head(10)` expression is discarded
# when this file runs as a script, so the preview is printed explicitly.
# Replace this with code that uses the DataFrame.
print(df.head(10))
regexList += ['^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$'] regexList += ['[\w\d\-\_\.]+ @ [\w\d\-\_\.]+'] regexList += ['Subject:'] regexList += ['[^a-zA-Z]'] return regexList if __name__ == '__main__': #################### # Use this with AML Workbench to load data from data prep file # dfIncidents = package.run('Incidents.dprep', dataflow_idx=0) # dfIncidents = pd.read_csv('allIncidents.csv', encoding="ISO-8859-1") # dfRequests = package.run('Requests.dprep', dataflow_idx=0) dfIncidents = package.run('IncidentsCleaned.dprep', dataflow_idx=0) dfRequests = package.run('RequestsCleaned.dprep', dataflow_idx=0) # Load dataset from file # dfIncidents = pd.read_csv('./data/endava_tickets/all_incidents.csv') # dfRequests = pd.read_csv('./data/endava_tickets/all_requests.csv') ##################### # Reorder columns columnsOrder = [ 'title', 'body', 'ticket_type', 'category', 'sub_category1', 'sub_category2', 'business_service', 'urgency', 'impact' ] dfIncidents = dfIncidents[columnsOrder] dfRequests = dfRequests[columnsOrder]
from azureml.sdk import data_collector from azureml.dataprep.package import run from iris_plot_lib import plot_iris # initialize the logger run_logger = data_collector.current_run() # create the outputs folder os.makedirs('./outputs', exist_ok=True) print('Python version: {}'.format(sys.version)) print() # load Iris dataset from a DataPrep package iris = run('iris.dprep', dataflow_idx=0) print('Iris dataset shape: {}'.format(iris.shape)) # load features and labels X, Y = iris[['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']].values, iris['Species'].values # add n more random features to make the problem harder to solve # number of new random features to add n = 40 random_state = np.random.RandomState(0) n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, n)] # split data 65%-35% into training set and test set X_train, X_test, Y_train, Y_test = train_test_split(X,
#========================= LOAD DATASET USING DATAPREP PACKAGE =========================
from azureml.dataprep.package import run

dataset = run('social-ads.dprep', dataflow_idx=0, spark=False)
#print(dataset)
# Just use salary column [1] first to show accuracy improvement
X = dataset.iloc[:, [2,3]].values
y = dataset.iloc[:, 4].values
print(X)
print(y)

#========================= LOAD DATASET USING BLOB STORAGE (disabled) =========================
# NOTE(security): this disabled alternative used to embed a storage account
# key in source. The key is now read from the environment; the previously
# committed key should be treated as compromised and rotated.
#
# import os
# from azure.storage.blob import BlockBlobService
# import pandas as pd
#
# ACCOUNT_NAME = "mlgputraining"  # "<account name>"
# ACCOUNT_KEY = os.environ["AZURE_STORAGE_ACCOUNT_KEY"]  # "<account key>"
# CONTAINER_NAME = "datasets"  # "<container name>"
#
# blobService = BlockBlobService(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)
# blobService.get_blob_to_path(CONTAINER_NAME, 'social_network_ads.csv', 'social_network_ads.csv')
# dataset = pd.read_csv('social_network_ads.csv')
# print(dataset)
# # Just use salary column [1] first to show accuracy improvement
# X = dataset.iloc[:, [2, 3]].values
# y = dataset.iloc[:, 4].values
# print(X)
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('CATelcoCustomerChurnTrainingBlobSample.dprep', dataflow_idx=0)

# Preview the first ten rows. A bare `df.head(10)` expression is discarded
# when this file runs as a script, so the preview is printed explicitly.
# Replace this with code that uses the DataFrame.
print(df.head(10))
# add experiment arguments parser = argparse.ArgumentParser() # parser.add_argument('--arg', action='store_true', help='My Arg') args = parser.parse_args() # create the outputs folder os.makedirs('./outputs', exist_ok=True) print("Loading dataset...") print() ################################################################ ##### Load dataset from a DataPrep package as a pandas DataFrame ################################################################ df = run('train.dprep', dataflow_idx=0, spark=False) df = df.dropna(how='any',axis=0) # One hot encoding df.loc[:,'one_way'] = df.one_way.astype('uint8') ohe_fields=['one_way','surface_type','street_type','hour','weekday','month'] df_ohe = pd.get_dummies(df,columns=ohe_fields) # Get the one-hot variable names ohe_feature_names = pd.get_dummies(df[ohe_fields],columns=ohe_fields).columns.tolist() # Names of the continuous features float_feature_names = [ 'accident_counts', 'speed_limit', 'aadt',
# Load the referenced DataPrep package and get a DataFrame back.
# In a PySpark environment the result is a Spark DataFrame; otherwise it is
# a Pandas DataFrame. This snippet can be copied into other code files.
from azureml.dataprep.package import run

# DataFrame used for all further processing
df = run('sampleReviews.dprep', dataflow_idx=0)
rows, columns = df.shape

# Print the first two columns of every row, separated by a space.
for first_value, second_value in zip(df.iloc[:, 0], df.iloc[:, 1]):
    try:
        print(first_value + ' ' + second_value)
    except UnicodeEncodeError:
        # Output stream cannot encode this row; skip it.
        continue
# Load the referenced DataPrep package and get a DataFrame back.
# In a PySpark environment the result is a Spark DataFrame; otherwise it is
# a Pandas DataFrame. This snippet can be copied into other code files.
from azureml.dataprep.package import run

# DataFrame used for all further processing
df = run(
    'iris-1.dprep',
    dataflow_idx=0,
)
# Load the referenced DataPrep package and get a DataFrame back.
# In a PySpark environment the result is a Spark DataFrame; otherwise it is
# a Pandas DataFrame. This snippet can be copied into other code files.
from azureml.dataprep.package import run

# DataFrame used for all further processing
df = run('sampleReviews.dprep', dataflow_idx=0)
rows, columns = df.shape

# Print the first two columns of every row, separated by a space.
for first_value, second_value in zip(df.iloc[:, 0], df.iloc[:, 1]):
    try:
        print(first_value + ' ' + second_value)
    except UnicodeEncodeError:
        # Output stream cannot encode this row; skip it.
        continue
# for row in zip(height, weight):
#     print(row[0][0],"->",row[1])
# plt.scatter(height,weight,color='black')
# plt.xlabel("height")
# plt.ylabel("weight")
# plt.show()

# Run the prep package and get the data frame.
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Load the referenced package; spark=False is passed so that a Pandas
# DataFrame (not a Spark DataFrame) is requested.
df = package.run(
    'Prep4.dprep',
    dataflow_idx=0,
    spark=False,
)

# Show the first ten rows.
print(df.head(10))

# Single-column frames for the predictor and the target.
height = pd.DataFrame(df, columns=["Heightft"]).values
weight = pd.DataFrame(df, columns=["Weightkg"]).values

# Scatter plot of the raw data.
plt.scatter(height, weight, color='black')
plt.xlabel("height")
plt.ylabel("weight")
plt.show()

# Fit a linear regression of weight on height.
reg = linear_model.LinearRegression()
reg.fit(height, weight)

# emit slope and intercept
run_logger = get_azureml_logger()

# Get count from ArcGIS when a second command-line argument is supplied.
if len(sys.argv) > 2:
    search_term = float(sys.argv[2])
    webmap_count = search_fire(search_term)
    run_logger.log('Webmaps', webmap_count)

# Create the outputs folder.
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}'.format(sys.version))
print()

# Load the dataset from a DataPrep package as a pandas DataFrame.
iris = run('column-prep.dprep', dataflow_idx=0, spark=False)
print('Iris dataset shape: {}'.format(iris.shape))

# Feature matrix and label vector.
X = iris[['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']].values
Y = iris['Species'].values

# Append n random features to make the problem harder to solve.
# n = 40
n = 10
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, n)]

# split data 65%-35% into training set and test set
# Azure Machine Learning data preparation package
from azureml.dataprep import package

# Azure Machine Learning data collector, used to log metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# Load the referenced package and get a DataFrame back: a Spark DataFrame
# in a PySpark environment, a Pandas DataFrame otherwise.
df = package.run(
    'column-prep.dprep',
    dataflow_idx=0,
)

# Placeholder: replace with code that uses the DataFrame.
print(df)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve

from azureml.logging import get_azureml_logger
from azureml.dataprep.package import run

# Logger for run metrics
run_logger = get_azureml_logger()

# Folder where the training results will be stored
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}\n'.format(sys.version))

# Load the dataset from a DataPrep package as a pandas DataFrame.
df = run('df.dprep', dataflow_idx=0, spark=False)
print('Dataset shape: {}'.format(df.shape))

# Feature columns and label column
X = df[[
    'batteryVoltage',
    'responseTime',
    'ambientHumidity',
    'ambientTemperature',
]].values
Y = df['severity_status'].values

# Append n random features to make the problem harder to solve.
# Randomness is required because the df.csv dataset is an example dataset
# and it is easily ranked with almost 100% accuracy.
n = 17
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, n)]
def train_model(ratio=.5):
    """Train a 1-D convolutional binary classifier on the sample reviews.

    Loads the first two columns of ``sampleReviews.dprep`` (review text and
    label), converts the text with ``get_vectors_from_text``, shuffles both
    with a fixed seed, splits them into a training part and a validation
    part, then builds and fits the network.

    Args:
        ratio: Fraction of the shuffled samples used for training; the
            remainder is passed to ``fit`` as validation data.

    Returns:
        The fitted ``Sequential`` model.

    Raises:
        ValueError: If the number of vectorised reviews differs from the
            number of labels, since the paired shuffle below would then
            silently misalign them.
    """
    # set parameters:
    max_features = 5000
    maxlen = 400
    batch_size = 32
    embedding_dims = 50
    filters = 250
    kernel_size = 3
    hidden_dims = 250
    epochs = 2
    seed = 113

    # get the reviews_list and labels_list from the prepared data
    df = run('sampleReviews.dprep', dataflow_idx=0)
    reviews_list = []
    labels_list = []
    for review, raw_label in zip(df.iloc[:, 0], df.iloc[:, 1]):
        try:
            label = int(float(raw_label))
        except UnicodeEncodeError:
            # Same best-effort skip as before, but the label is parsed
            # before either list is touched so the two stay in step.
            continue
        labels_list.append(label)
        reviews_list.append(review)

    # get the corresponding vectors from the data set
    reviews_list_vec = get_vectors_from_text(reviews_list)
    # Labels go to Keras as an array rather than a list of Python ints.
    labels = np.asarray(labels_list)

    if len(reviews_list_vec) != len(labels):
        raise ValueError(
            'Got {} review vectors for {} labels'.format(
                len(reviews_list_vec), len(labels)))

    # Shuffle the data set. Re-seeding before each in-place shuffle applies
    # the same permutation to both sequences because they have equal length.
    np.random.seed(seed)
    np.random.shuffle(reviews_list_vec)
    np.random.seed(seed)
    np.random.shuffle(labels)

    # split the data set into train and test data
    split_at = int(len(labels) * ratio)
    x_train, x_test = reviews_list_vec[:split_at], reviews_list_vec[split_at:]
    y_train, y_test = labels[:split_at], labels[split_at:]

    print('Building model...')
    model = Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.2))

    # we add a Convolution1D, which will learn filters
    # word group filters of size kernel_size:
    model.add(
        Conv1D(filters,
               kernel_size,
               padding='valid',
               activation='relu',
               strides=1))
    model.add(GlobalMaxPooling1D())

    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))
    return model
parser = argparse.ArgumentParser()
# parser.add_argument('--arg', action='store_true', help='My Arg')
args = parser.parse_args()
print(args)

# This is how you log scalar metrics
# logger.log("MyMetric", value)

# Create the outputs folder - save any outputs you want managed by AzureML here
os.makedirs('./outputs', exist_ok=True)

print('Python version: {}'.format(sys.version))
print()

# Load the dataset from a DataPrep package as a pandas DataFrame.
titanic_dataset = run('dataset-clear.dprep', dataflow_idx=0, spark=False)
print('Titanic dataset shape: {}'.format(titanic_dataset.shape))
print(titanic_dataset.dtypes)

# Feature matrix: the ten MAX_* / *_AVG / *_VAR columns listed below.
X = titanic_dataset[[
    'MAX_ACCX',
    'MAX_ACCY',
    'MAX_ACCZ',
    'MAX_GYROX',
    'MAX_GYROY',
    'MAX_GYROZ',
    'ACC_AVG',
    'ACC_VAR',
    'GYRO_AVG',
    'GYRO_VAR',
]].values
# Label vector: the 'WILDNESS' column.
Y = titanic_dataset['WILDNESS'].values

# split data 65%-35% into training set and test set
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.35,
    random_state=0,
)
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('iris.dprep', dataflow_idx=0)

# Preview the first ten rows. A bare `df.head(10)` expression is discarded
# when this file runs as a script, so the preview is printed explicitly.
# Replace this with code that uses the DataFrame.
print(df.head(10))
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('BankMarketCampaignTrainingSample.dprep', dataflow_idx=0)

# Preview the first ten rows. A bare `df.head(10)` expression is discarded
# when this file runs as a script, so the preview is printed explicitly.
# Replace this with code that uses the DataFrame.
print(df.head(10))
class AzureMlDataRecording(Callback):
    """Callback that forwards per-epoch training metrics to the module-level ``logger``."""

    def on_epoch_end(self, epoch, logs=None):
        """Log the training loss and accuracy reported for the finished epoch.

        Args:
            epoch: Index of the epoch that just ended (unused).
            logs: Mapping of metric names to values; may be ``None``.

        A metric that is absent from ``logs`` is skipped rather than passed
        to ``float()``, which would raise ``TypeError`` on ``None``.
        """
        logs = logs or {}
        loss = logs.get('loss')
        # NOTE(review): accuracy appears to be reported as 'acc' by older
        # Keras releases and 'accuracy' by newer ones; accept either.
        accuracy = logs.get('acc', logs.get('accuracy'))

        if loss is not None:
            logger.log('Training loss', float(loss))
        if accuracy is not None:
            logger.log('Training accuracy', float(accuracy))


##
## STEP 1: Perform final prep work for training
##

# We did some prep work earlier that we use here.
# By running the data preparation packages we clean up the training and validation data.
train_data = package.run('training_prepared.dprep', dataflow_idx=0)
validation_data = package.run('validation_prepared.dprep', dataflow_idx=0)

# Fix up the intent column so it is categorical
train_data['intent'] = train_data['intent'].astype('category')
validation_data['intent'] = validation_data['intent'].astype('category')

# Give the validation categories the same names as the training categories.
# rename_categories with a list renames positionally, exactly like the direct
# `.cat.categories = [...]` assignment it replaces, which recent pandas
# releases no longer accept.
validation_data['intent'] = validation_data['intent'].cat.rename_categories(
    train_data['intent'].cat.categories.tolist())

# The neural network doesn't understand text at all.
# Therefore we convert the input text to vector representations using a tokenizer.
tokenizer = Tokenizer(num_words=2500)
tokenizer.fit_on_texts(train_data['text'].values)

# The training features and labels are made compatible with the neural network here.
# Notice the use of the tokenizer and a smart trick from pandas to convert the data.
features = tokenizer.texts_to_sequences(train_data['text'].values)
# Create the outputs folder - save any outputs you want managed by AzureML here
os.makedirs('./outputs', exist_ok=True)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import utils

# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('prep_airline_data.dprep', dataflow_idx=0)

# Preview the first ten rows. A bare `df.head(10)` expression is discarded
# when this file runs as a script, so the preview is printed explicitly.
# Replace this with code that uses the DataFrame.
print(df.head(10))
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger

logger = get_azureml_logger()

# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it will return a Pandas DataFrame.
df = package.run('Wildfire_News.dprep', dataflow_idx=0)

# Preview the first ten rows. A bare `df.head(10)` expression is discarded
# when this file runs as a script, so the preview is printed explicitly.
# Replace this with code that uses the DataFrame.
print(df.head(10))