def get_clean_data():
    # _DATA_URL and clean_data are defined elsewhere in the original module
    ds = TabularDatasetFactory.from_delimited_files(path=_DATA_URL)
    return clean_data(ds)
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.run import Run
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import argparse
import os
import numpy as np
import joblib
import pandas as pd

# Import the dataset
mbdataset = TabularDatasetFactory.from_delimited_files("https://raw.githubusercontent.com/czofficial/nd00333-capstone/4a6a4924bdd4c6a188aeb24e0c282bae11c8933b/mercedes.csv")
df = mbdataset.to_pandas_dataframe()

# Clean the dataset: one-hot encode the categorical columns and split off the target
def clean_data(df):
    x_df = pd.get_dummies(df, columns=['model', 'transmission', 'fuelType'])
    y_df = x_df.pop("price")
    return x_df, y_df

x, y = clean_data(df)

# Split the dataset into train and test sets
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(url)

# TODO: Split data into train and test sets.
### YOUR CODE HERE ###

run = Run.get_context()

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
def retrieve_cleaned_data():
    ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")
    x, y = clean_data(ds)
    return train_test_split(x, y, test_size=0.3, random_state=0)
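# Typical call site for the helper above (a sketch; the variable names are
# illustrative, not taken from the original script):
# x_train, x_test, y_train, y_test = retrieve_cleaned_data()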
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.03, help="Learning rate param for lgbm")
    parser.add_argument('--max_depth', type=int, default=10, help="Limit the tree depth explicitly")
    parser.add_argument('--num_leaves', type=int, default=255, help="Control the complexity of the tree model")
    parser.add_argument('--min_data_in_leaf', type=int, default=3, help="Large value can avoid growing too deep a tree")
    parser.add_argument('--num_iterations', type=int, default=500, help="Number of boosting iterations")
    args = parser.parse_args()

    # np.float and np.int are deprecated aliases; the built-in types work here
    run.log("learning-rate:", float(args.learning_rate))
    run.log("max_depth:", int(args.max_depth))
    run.log("num_leaves", int(args.num_leaves))
    run.log("min_data_in_leaf", int(args.min_data_in_leaf))
    run.log("num_iterations", int(args.num_iterations))

    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    valid_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
    # from_delimited_files is a static method, so call it on the class directly
    train_ds = TabularDatasetFactory.from_delimited_files(train_data_path)
    valid_ds = TabularDatasetFactory.from_delimited_files(valid_data_path)

    X_train, y_train = clean_data(train_ds)
    X_valid, y_valid = clean_data(valid_ds)

    d_train = lgbm.Dataset(X_train, label=y_train)

    lgbm_params = {}
    lgbm_params['learning_rate'] = args.learning_rate
    lgbm_params['boosting_type'] = 'gbdt'
    lgbm_params['objective'] = 'binary'
    lgbm_params['metric'] = 'binary_logloss'
    lgbm_params['max_depth'] = args.max_depth
    lgbm_params['num_leaves'] = args.num_leaves
    lgbm_params['min_data_in_leaf'] = args.min_data_in_leaf
    lgbm_params['colsample_bytree'] = 1.0  # trailing comma removed: it silently made this a tuple

    model = lgbm.train(lgbm_params, d_train, args.num_iterations)

    accuracy = accuracy_score(model.predict(X_valid).round(0).astype(int), y_valid)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/bankmarketing-lgbm-model.joblib')
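# Entry point assumed by the excerpt above (the other scripts in this
# collection end the same way); the commented imports sketch the module-level
# names it relies on, which are defined elsewhere in the original script:
# import argparse
# import os
# import joblib
# import numpy as np
# import lightgbm as lgbm
# from sklearn.metrics import accuracy_score
# from azureml.core.run import Run
# run = Run.get_context()
if __name__ == '__main__':
    main()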
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path)

run = Run.get_context()

# NOTE: in the original, clean_data was called at module level *before* this
# definition, which raises NameError; the definition has to come first.
def clean_data(data):
    # Dict for cleaning data
    ...  # (body truncated in this excerpt)

### YOUR CODE HERE ###
x, y = clean_data(ds)

# TODO: Split data into train and test sets.
### YOUR CODE HERE ###
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
help="Maximum number of iterations to converge") args = parser.parse_args() run.log("Regularization Strength:", np.float(args.C)) run.log("Max iterations:", np.int(args.max_iter)) model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train) AUC_weighted = model.score(x_test, y_test) run.log("AUC_weighted", np.float(AUC_weighted)) if __name__ == '__main__': # importing the dataset for use dataset = TabularDatasetFactory.from_delimited_files( "https://raw.githubusercontent.com/ujjwalbb30/nd00333-capstone/ujjwalbb30-patch-1/heart_failure_clinical_records_dataset.csv" ) # converting the dataset imported to pandas dataframe df = dataset.to_pandas_dataframe() x, y = df.drop(columns=['DEATH_EVENT']), df['DEATH_EVENT'] # TODO: Split data into train and test sets. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42) run = Run.get_context() main()
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
location = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=location)

run = Run.get_context()

def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
import argparse
import os
import joblib
import numpy as np
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.run import Run
from sklearn.model_selection import train_test_split

# Select the data URL
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data'

# Create an output folder for the model file
if "outputs" not in os.listdir():
    os.mkdir("./outputs")

ds = TabularDatasetFactory.from_delimited_files(path=data_url, header=False)

def clean_data(data):
    # Transform the TabularDataset into a pandas dataframe, dropping NA's
    x_df = data.to_pandas_dataframe().dropna()

    # Rename the columns to their correct labels
    x_df.columns = [
        'id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'
    ]

    # Give the target variable the right type
    x_df.Type = x_df.Type.astype('category')

    # Remove the id column
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
### YOUR CODE HERE ###
web_path = [
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
]
ds = TabularDatasetFactory.from_delimited_files(path=web_path, separator=',')

run = Run.get_context()

def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# Load data
ds = TabularDatasetFactory.from_delimited_files(
    "https://raw.githubusercontent.com/GowthamiWudaru/heart-Disease-Prediction-With-Azure/main/heartDisease.csv"
)
df = ds.to_pandas_dataframe()
y = df['num']
x = df.drop(['num'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

run = Run.get_context()

def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory
from interpret.ext.blackbox import TabularExplainer
from azureml.interpret import ExplanationClient

OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

run = Run.get_context()
client = ExplanationClient.from_run(run)

ds = TabularDatasetFactory.from_delimited_files(
    "https://raw.githubusercontent.com/ketcx/ml-ops-exercise/master/data/hotel_bookings_clean.csv",
    validate=True, include_path=False, infer_column_types=True,
    set_column_types=None, separator=',', header=True,
    partition_format=None, support_multi_line=False, empty_as_string=False)

def transform(dataframe):
    le = LabelEncoder()  # Label encoder from sklearn
    # Select all categorical features
    categorical_features = list(dataframe.columns[dataframe.dtypes == object])
    # Apply label encoding to all categorical features
    # (note: this returns only the encoded categorical columns, so a caller
    # has to recombine them with the numeric columns)
    return dataframe[categorical_features].apply(lambda x: le.fit_transform(x))

def clean_data(data):
    """ Work with clean dataset. """
    # Drop unnecessary columns
    unwantedcolumnlist = ["gender", "MultipleLines", "PaymentMethod", "tenure"]
    df = df2.copy()
    df = df.drop(unwantedcolumnlist, axis=1)

    # Convert all the categorical variables into dummy variables
    df_dummies = pd.get_dummies(df)
    y = df_dummies[['Churn']]
    x = df_dummies.drop(columns=['Churn'])
    return x, y

ds = TabularDatasetFactory.from_delimited_files(
    path="https://raw.githubusercontent.com/arfa-t/ML-Nanodegree/main/Capstone%20Project/churn.csv"
)
x, y = clean_data(ds)

# Train/test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, shuffle=True)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
import argparse
import joblib
import argparse
import os
import numpy as np
from sklearn.metrics import auc, roc_curve
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# NOTE: this Kaggle link points at a download page that requires sign-in;
# from_delimited_files expects a directly reachable delimited file.
path = "https://www.kaggle.com/andrewmvd/heart-failure-clinical-data/download"
ds = TabularDatasetFactory.from_delimited_files(path, validate=True, include_path=False,
                                                infer_column_types=True, set_column_types=None,
                                                separator=',', header=True, partition_format=None)

df = ds.to_pandas_dataframe()  # TabularDataset exposes to_pandas_dataframe(), not to_pandas()

X, y = df.loc[:, ~df.columns.isin(["DEATH_EVENT"])], df.loc[:, "DEATH_EVENT"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

run = Run.get_context()

def main():
    parser = argparse.ArgumentParser()
    x_df = x_df.join(education)

    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df, y_df  # return was missing; clean_data is unpacked into x, y below

path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path, validate=True, include_path=False,
                                                infer_column_types=True, set_column_types=None,
                                                separator=',', header=True, partition_format=None,
                                                support_multi_line=False, empty_as_string=False,
                                                encoding='utf8')

x, y = clean_data(ds)

# TODO: Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

run = Run.get_context()
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv",
    validate=True, include_path=False, infer_column_types=True, set_column_types=None,
    separator=',', header=True, partition_format=None, support_multi_line=False,
    empty_as_string=False)

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
    x_df["housing"] = x_df.housing.apply(lambda s: 1 if s == "yes" else 0)
    x_df["loan"] = x_df.loan.apply(lambda s: 1 if s == "yes" else 0)
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
path_to_data = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=path_to_data)

run = Run.get_context()

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Azure imports e.g., Workspace,...
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# VRK: Create TabularDataset using TabularDatasetFactory
# Create the NSL-KDD network train data
nsl_kdd_webpath = [
    'https://raw.githubusercontent.com/venkataravikumaralladi/AzureMLCapstoneProject/main/input/KDDTrain.csv'
]

# Create the network analysis dataset in tabular format using TabularDatasetFactory
nsl_kdd_dataset = TabularDatasetFactory.from_delimited_files(path=nsl_kdd_webpath)

class NSLKDDFeatureAnalysis:
    # class variables
    network_data_column_names = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_hot_login', 'is_guest_login',
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

data_loc = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv'
ds = TabularDatasetFactory.from_delimited_files(data_loc)

# Get the context of the current run
run = Run.get_context()

x_df = ds.to_pandas_dataframe().dropna()
y_df = x_df.pop('ca_cervix')

# TODO: Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

def main():
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

web_path = "https://raw.githubusercontent.com/ciccioska/Udacity-AzureML-Capstone-Project/master/healthcare-dataset-stroke-data.csv"
ds = TabularDatasetFactory.from_delimited_files(web_path)

x_df = ds.to_pandas_dataframe().dropna()
x_df["work_type"] = x_df.work_type.apply(lambda s: 1 if s == "Private" else 0 if s == "Self-employed" else -1)
x_df["bmi"] = x_df.bmi.apply(lambda s: 0 if s == "N/A" else s)
x_df["gender"] = x_df.gender.apply(lambda s: 1 if s == "Male" else 0)
x_df["Residence_type"] = x_df.Residence_type.apply(lambda s: 1 if s == "Urban" else 0)
x_df["smoking_status"] = x_df.smoking_status.apply(
    lambda s: 1 if s == "smokes" else -1 if s == "never smoked" else 2 if s == "formerly smoked" else 0)
print(x_df)
y_df = x_df.pop("stroke")
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

run = Run.get_context()

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv"
ds = TabularDatasetFactory.from_delimited_files("https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv")

def clean_data(data):
    x_df = data.to_pandas_dataframe()
    y_df = x_df.pop("DEATH_EVENT")
    return (x_df, y_df)

x, y = clean_data(ds)

# TODO: Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
def _create_tabular(self, parameters, validate):
    source_type = self._json_utility.try_get_value(
        parameters, self._prop_source_type, None,
        lambda v: v in self._valid_source_types,
        'Property "{}" must be one of {}.'.format(
            self._prop_source_type, self._valid_source_types))

    if source_type == 'sql_query':
        query = self._get_query(parameters)
        if is_dataprep_installed():
            return TabularDatasetFactory.from_sql_query(query, validate)
        return self._create_dataset_from_blocks(
            [_Block.craft_read_sql_block(query)], TabularDataset)

    path = self._get_path(parameters)
    include_path = self._json_utility.try_get_value(
        parameters, self._prop_include_path, self._default_include_path)
    partition_format = self._json_utility.try_get_value(
        parameters, self._prop_partition_format, None)

    if source_type == 'parquet_files':
        if is_dataprep_installed():
            return TabularDatasetFactory.from_parquet_files(
                path, validate, include_path, partition_format=partition_format)
        return self._create_dataset_from_blocks([
            _Block.craft_get_file_block(path),
            _Block.craft_read_parquet_block(),
            _Block.craft_partition_format_block(partition_format) if partition_format else None,
            _Block.craft_drop_path_column_block() if not include_path else None
        ], TabularDataset)

    if source_type == 'json_lines_files':
        if is_dataprep_installed():
            return TabularDatasetFactory.from_json_lines_files(
                path, validate, include_path, partition_format=partition_format)
        return self._create_dataset_from_blocks([
            _Block.craft_get_file_block(path),
            _Block.craft_read_json_lines_block(),
            _Block.craft_partition_format_block(partition_format) if partition_format else None,
            _Block.craft_drop_path_column_block() if not include_path else None
        ], TabularDataset)

    if source_type == 'delimited_files':
        infer_column_types = self._json_utility.try_get_value(
            parameters, self._prop_infer_column_types, self._default_infer_column_types)
        separator = self._json_utility.try_get_value(
            parameters, self._prop_separator, self._default_separator)
        header = self._json_utility.try_get_value(
            parameters, self._prop_header, self._default_header)
        if is_dataprep_installed():
            return TabularDatasetFactory.from_delimited_files(
                path, validate, include_path=include_path,
                partition_format=partition_format,
                infer_column_types=infer_column_types,
                separator=separator, header=header)
        if infer_column_types:
            _raise_dataprep_missing_error(
                'Cannot infer column types',
                self._error_utility.get_error_message(
                    'setting {} to false'.format(self._prop_infer_column_types)))
        return self._create_dataset_from_blocks([
            _Block.craft_get_file_block(path),
            _Block.craft_read_delimited_block(separator, header),
            _Block.craft_partition_format_block(partition_format) if partition_format else None,
            _Block.craft_drop_path_column_block() if not include_path else None
        ], TabularDataset)

    raise RuntimeError('Unexpected code path for source_type: ' + source_type)
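# For reference, a sketch of the public factory calls this dispatcher wraps
# when azureml-dataprep is installed (the paths below are illustrative, not
# taken from the code above):
# tab_csv = TabularDatasetFactory.from_delimited_files(path='https://example.com/data.csv', separator=',', header=True)
# tab_parquet = TabularDatasetFactory.from_parquet_files(path='https://example.com/data.parquet')
# tab_jsonl = TabularDatasetFactory.from_json_lines_files(path='https://example.com/data.jsonl')
# tab_sql = TabularDatasetFactory.from_sql_query(query, validate=True)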
    # Remove any incomplete entries from the dataset.
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    x_df = df[indices_to_keep]
    y_df = x_df.pop("result")
    x_df = x_df.reset_index(drop=True)  # reset_index returns a new frame; the result must be kept
    return x_df, y_df

# Import player data CSV
ds = TabularDatasetFactory.from_delimited_files(
    path="https://oracleselixir-downloadable-match-data.s3-us-west-2.amazonaws.com/2020_LoL_esports_match_data_from_OraclesElixir_20210126.csv"
)
player_data = ds.to_pandas_dataframe()

# Process dataframe
x, y = clean_data(player_data)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

run = Run.get_context()

def main():
    # Add arguments
    parser = argparse.ArgumentParser()
    education = pd.get_dummies(x_df.education, prefix="education")
    x_df.drop("education", inplace=True, axis=1)
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df, y_df

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=data_path, separator=",")

X, y = clean_data(ds)

# TODO: Split data into train and test sets.
### YOUR CODE HERE ###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

run = Run.get_context()

def main():
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

dataset_path = "https://raw.githubusercontent.com/fati-ma/nd00333-capstone/master/heart_failure_clinical_records_dataset%5B1%5D.csv"
ds = TabularDatasetFactory.from_delimited_files(path=dataset_path)
x_df = ds.to_pandas_dataframe().dropna()

run = Run.get_context()

y_df = x_df.pop("DEATH_EVENT")
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.3, random_state=123)

def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
DATA_URL = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(DATA_URL)

def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "oct": 10,
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

url = "https://raw.githubusercontent.com/benbot8/azure_capstone/f54d4da8ea6c2cd5c9082b6703271c25d5f16b40/starter_file/train.csv"
df = TabularDatasetFactory.from_delimited_files(path=url)  # note: this is a TabularDataset, not a dataframe
#df = pd.read_csv("train.csv")

def clean_data(data):
    # Clean data and convert categorical columns to indicator variables
    x_df = data.to_pandas_dataframe().dropna()
    x_df.reset_index(drop=True, inplace=True)
    x_df.drop(['state', 'account_length', 'area_code'], axis=1, inplace=True)
    x_df['international_plan'] = x_df.international_plan.apply(lambda s: 1 if s == "yes" else 0)
    x_df['voice_mail_plan'] = x_df.voice_mail_plan.apply(lambda s: 1 if s == "yes" else 0)
    x_df['churn'] = x_df.churn.apply(lambda s: 1 if s == "yes" else 0)
    x_df.rename(columns={"churn": "y"}, inplace=True)
    y_df = x_df.pop("y")
    return x_df, y_df
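# Typical call site for clean_data above (a sketch; the split parameters are
# illustrative, not taken from the original script):
# x, y = clean_data(df)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)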
# Azure imports e.g., Workspace,...
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# VRK: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
# Create the bank marketing tabular dataset from the public Azure ML sample data.
bankmarketing_webpath = [
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
]

# Create the bankmarketing dataset in tabular format using TabularDatasetFactory
bankmarketing_dataset = TabularDatasetFactory.from_delimited_files(
    path=bankmarketing_webpath)

def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "oct": 10,
x_df["day_of_week"] = x_df.day_of_week.map(weekdays) x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0) y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0) return (x_df, y_df ) #adding missed return (df with both features and targets) # TODO: Create TabularDataset using TabularDatasetFactory # Data is located at: # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv" #creating a tabularDataset ds = TabularDatasetFactory.from_delimited_files( 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv' ) ### YOUR CODE HERE ### #cleaning data x, y = clean_data(ds) # TODO: Split data into train and test sets. #spliting data --> 80-20 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) run = Run.get_context() def main():
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
import argparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from azureml.data.dataset_factory import TabularDatasetFactory

#-----------------------------------------------------------------------------
# Import data (outside of Azure):
#data = pp.load_data('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')

# Import data (within Azure):
data = TabularDatasetFactory.from_delimited_files(
    'https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
dataset = data.to_pandas_dataframe()

#-----------------------------------------------------------------------------
# Clean data
# `pp` is a project-local preprocessing helper module, imported elsewhere in
# the original script.

# Define clean data function
def clean_data(df):
    df = pp.replace_question_marks(df)
    df['cabin'] = df['cabin'].apply(pp.get_first_cabin)
    df['title'] = df['name'].apply(pp.get_title)
    # cast numerical variables as floats